예제 #1
0
    def _extract_ordering(self,
                          window: org.apache.calcite.rel.core.Window.Group,
                          cc: ColumnContainer) -> Tuple[list, list, list]:
        """
        Prepare sorting information we can later use while applying the main function.

        Args:
            window: Window group whose ``orderKeys`` field collations are read.
            cc: Column container used to map each order key's frontend field
                index to a backend column via ``get_backend_by_frontend_index``.

        Returns:
            Three lists with one entry per order key, all in order-key order:
            the backend sort columns, a flag telling whether the key's
            direction is ``ASCENDING``, and a flag telling whether the key's
            null direction is ``FIRST``.
        """
        ASCENDING = org.apache.calcite.rel.RelFieldCollation.Direction.ASCENDING
        FIRST = org.apache.calcite.rel.RelFieldCollation.NullDirection.FIRST

        # Materialize the collations once so that all three result lists are
        # built from the same sequence and therefore stay aligned.
        order_keys = list(window.orderKeys.getFieldCollations())

        sort_columns = [
            cc.get_backend_by_frontend_index(int(key.getFieldIndex()))
            for key in order_keys
        ]
        sort_ascending = [key.getDirection() == ASCENDING for key in order_keys]
        sort_null_first = [key.nullDirection == FIRST for key in order_keys]

        return sort_columns, sort_ascending, sort_null_first
예제 #2
0
    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        df: dd.DataFrame,
        cc: ColumnContainer,
        context: "dask_sql.Context",
        additional_column_name: str,
        output_column_order: List[str],
    ) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str]]:
        """
        Collect all aggregations together, which have the same filter column
        so that the aggregations only need to be done once.

        Args:
            rel: Relational node whose ``getNamedAggCalls()`` are processed.
            df: Dataframe holding the input columns; only used to look up the
                dtype of the input column for ``AggregationSpecification``s.
            cc: Column container used to map frontend indices to backend columns.
            context: Context whose ``functions`` mapping is consulted for
                aggregations not found in ``self.AGGREGATION_MAPPING``.
            additional_column_name: Column used as input for aggregation calls
                that have no argument at all.
            output_column_order: List that is extended IN PLACE with the output
                column name of every aggregation call, in call order.

        Returns:
            A pair ``(aggregations, output_column_order)``. ``aggregations``
            maps the filter column (``None`` for calls without a filter) to a
            list of ``(input_col, output_col, aggregation function (or string))``
            tuples. ``output_column_order`` is the very same list object that
            was passed in.

            NOTE(review): the annotated key type ``Tuple[str, str]`` does not
            match the keys built here (a single filter column or ``None``).

        Raises:
            NotImplementedError: If a call has more than one input or names an
                aggregation that is neither built in nor registered.
            ValueError: If a call is flagged as DISTINCT.
        """
        collected_aggregations = defaultdict(list)

        for agg_call in rel.getNamedAggCalls():
            expr = agg_call.getKey()

            # Find out about the input column
            inputs = expr.getArgList()
            if len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                # Argument-less calls operate on the helper column instead
                input_col = additional_column_name
            else:
                raise NotImplementedError("Can not cope with more than one input")

            # Extract flags (filtering/distinct)
            if expr.isDistinct():  # pragma: no cover
                raise ValueError("Apache Calcite should optimize them away!")

            filter_column = None
            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(expr.filterArg)

            # Find out which aggregation function to use: built-in mapping
            # first, then the functions registered on the context.
            aggregation_name = str(expr.getAggregation().getName()).lower()
            try:
                aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.functions[aggregation_name]
                except KeyError:  # pragma: no cover
                    # The two failed dictionary lookups are an implementation
                    # detail; suppress them so only the real message is shown.
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    ) from None

            # A specification bundles two implementations; pick the one that
            # fits the dtype of the input column.
            if isinstance(aggregation_function, AggregationSpecification):
                dtype = df[input_col].dtype
                if pd.api.types.is_numeric_dtype(dtype):
                    aggregation_function = aggregation_function.numerical_aggregation
                else:
                    aggregation_function = (
                        aggregation_function.non_numerical_aggregation
                    )

            # Finally, extract the output column name
            output_col = str(agg_call.getValue())

            # Store the aggregation, grouped by its filter column
            collected_aggregations[filter_column].append(
                (input_col, output_col, aggregation_function)
            )
            output_column_order.append(output_col)

        return collected_aggregations, output_column_order
예제 #3
0
    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        df: dd.DataFrame,
        cc: ColumnContainer,
        context: "dask_sql.Context",
        additional_column_name: str,
        output_column_order: List[str],
    ) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str],
               dd.DataFrame]:
        """
        Collect all aggregations together, which have the same filter column
        so that the aggregations only need to be done once.

        Args:
            rel: Relational node whose ``getNamedAggCalls()`` are processed.
            df: Dataframe holding the input columns. For ``regr_count`` calls
                a temporary column is assigned to it (see Returns).
            cc: Column container used to map frontend indices to backend columns.
            context: Context used to resolve the aggregation's fully qualified
                name (``fqn``) and to look up functions registered in a schema.
            additional_column_name: Column used as input for aggregation calls
                that have no argument at all.
            output_column_order: List that is extended IN PLACE with the output
                column name of every aggregation call, in call order.

        Returns:
            A triple ``(aggregations, output_column_order, df)``.
            ``aggregations`` maps the filter column (``None`` for calls without
            a filter) to a list of
            ``(input_col, output_col, aggregation function (or string))``
            tuples. ``output_column_order`` is the very same list object that
            was passed in. ``df`` is the input dataframe, extended by one
            temporary column per ``regr_count`` call.

            NOTE(review): the annotated key type ``Tuple[str, str]`` does not
            match the keys built here (a single filter column or ``None``).

        Raises:
            NotImplementedError: If a call other than ``regr_count`` has more
                than one input, or if the aggregation is neither built in nor
                registered in the resolved schema.
            ValueError: If a call is flagged as DISTINCT.
        """
        collected_aggregations = defaultdict(list)

        for agg_call in rel.getNamedAggCalls():
            expr = agg_call.getKey()
            # Find out which aggregation function to use
            schema_name, aggregation_name = context.fqn(
                expr.getAggregation().getNameAsId())
            aggregation_name = aggregation_name.lower()
            # Find out about the input column
            inputs = expr.getArgList()
            if aggregation_name == "regr_count":
                # regr_count is evaluated on a temporary proxy column that is
                # derived from the negated null-checks of its argument(s).
                is_null = IsNullOperation()
                two_columns_proxy = new_temporary_column(df)
                col1 = cc.get_backend_by_frontend_index(inputs[0])
                if len(inputs) == 1:
                    # Calcite sometimes passes only one column to regr_count
                    # and expresses the other one through the filter column
                    not_null = ~is_null(df[col1])
                else:
                    col2 = cc.get_backend_by_frontend_index(inputs[1])
                    # both cols should be not null
                    not_null = (~is_null(df[col1])) & (~is_null(df[col2]))
                df = df.assign(**{two_columns_proxy: not_null})
                input_col = two_columns_proxy
            elif len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                # Argument-less calls operate on the helper column instead
                input_col = additional_column_name
            else:
                raise NotImplementedError(
                    "Can not cope with more than one input")

            # Extract flags (filtering/distinct)
            if expr.isDistinct():  # pragma: no cover
                raise ValueError("Apache Calcite should optimize them away!")

            filter_column = None
            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(
                    expr.filterArg)

            # Built-in mapping first, then the functions registered in the
            # schema the aggregation name was resolved to.
            try:
                aggregation_function = self.AGGREGATION_MAPPING[
                    aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.schema[
                        schema_name].functions[aggregation_name]
                except KeyError:  # pragma: no cover
                    # The two failed dictionary lookups are an implementation
                    # detail; suppress them so only the real message is shown.
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    ) from None
            if isinstance(aggregation_function, AggregationSpecification):
                # Let the specification choose the implementation that fits
                # the input column.
                aggregation_function = aggregation_function.get_supported_aggregation(
                    df[input_col])

            # Finally, extract the output column name
            output_col = str(agg_call.getValue())

            # Store the aggregation, grouped by its filter column
            collected_aggregations[filter_column].append(
                (input_col, output_col, aggregation_function))
            output_column_order.append(output_col)

        return collected_aggregations, output_column_order, df