def _extract_ordering(
    self, window: org.apache.calcite.rel.core.Window.Group, cc: ColumnContainer
) -> Tuple[List[str], List[bool], List[bool]]:
    """
    Prepare sorting information we can later use while applying the main function.

    Derives, from the window group's field collations, a triple of
    - the backend column names to sort by,
    - a per-column "sort ascending" flag, and
    - a per-column "nulls first" flag.

    Note: the previous return annotation ``Tuple[str, str, str]`` was wrong —
    three parallel lists are returned, one entry per ORDER BY key.
    """
    order_keys = list(window.orderKeys.getFieldCollations())

    # Map the frontend (Calcite) field indices to backend column names
    sort_columns_indices = [int(i.getFieldIndex()) for i in order_keys]
    sort_columns = [
        cc.get_backend_by_frontend_index(i) for i in sort_columns_indices
    ]

    ASCENDING = org.apache.calcite.rel.RelFieldCollation.Direction.ASCENDING
    FIRST = org.apache.calcite.rel.RelFieldCollation.NullDirection.FIRST
    sort_ascending = [x.getDirection() == ASCENDING for x in order_keys]
    sort_null_first = [x.nullDirection == FIRST for x in order_keys]

    return sort_columns, sort_ascending, sort_null_first
def _collect_aggregations(
    self,
    rel: "org.apache.calcite.rel.RelNode",
    df: dd.DataFrame,
    cc: ColumnContainer,
    context: "dask_sql.Context",
    additional_column_name: str,
    output_column_order: List[str],
) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str]]:
    """
    Group all aggregation calls of *rel* by their filter column, so each
    filter only needs to be applied once.

    Returns a mapping filter_column -> list of aggregations, each given as
    a (input_col, output_col, aggregation function (or string)) triple,
    together with the extended output column order.
    """
    collected_aggregations = defaultdict(list)

    for named_call in rel.getNamedAggCalls():
        call = named_call.getKey()

        # Resolve the input column: one argument maps to a backend column,
        # zero arguments fall back to the caller-provided helper column.
        argument_list = call.getArgList()
        if len(argument_list) > 1:
            raise NotImplementedError("Can not cope with more than one input")
        if argument_list:
            input_col = cc.get_backend_by_frontend_index(argument_list[0])
        else:
            input_col = additional_column_name

        # DISTINCT should never reach us
        if call.isDistinct():  # pragma: no cover
            raise ValueError("Apache Calcite should optimize them away!")

        filter_column = (
            cc.get_backend_by_frontend_index(call.filterArg)
            if call.hasFilter()
            else None
        )

        # Look up the implementation: built-in mapping first, then
        # user-registered functions from the context.
        aggregation_name = str(call.getAggregation().getName()).lower()
        try:
            aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
        except KeyError:
            try:
                aggregation_function = context.functions[aggregation_name]
            except KeyError:  # pragma: no cover
                raise NotImplementedError(
                    f"Aggregation function {aggregation_name} not implemented (yet)."
                )

        # A specification carries two implementations; choose by the
        # dtype of the input column.
        if isinstance(aggregation_function, AggregationSpecification):
            if pd.api.types.is_numeric_dtype(df[input_col].dtype):
                aggregation_function = aggregation_function.numerical_aggregation
            else:
                aggregation_function = (
                    aggregation_function.non_numerical_aggregation
                )

        output_col = str(named_call.getValue())

        collected_aggregations[filter_column].append(
            (input_col, output_col, aggregation_function)
        )
        output_column_order.append(output_col)

    return collected_aggregations, output_column_order
def _collect_aggregations(
    self,
    rel: "org.apache.calcite.rel.RelNode",
    df: dd.DataFrame,
    cc: ColumnContainer,
    context: "dask_sql.Context",
    additional_column_name: str,
    output_column_order: List[str],
) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str],
           dd.DataFrame]:
    """
    Collect all aggregations together, which have the same filter column
    so that the aggregations only need to be done once.

    Returns the aggregations as mapping filter_column -> List of Aggregations
    where the aggregations are in the form (input_col, output_col, aggregation
    function (or string)), the extended output column order, and the (possibly
    augmented) dataframe: REGR_COUNT aggregations add a temporary boolean
    helper column to *df*, so the returned dataframe must be used downstream.
    """
    collected_aggregations = defaultdict(list)
    for agg_call in rel.getNamedAggCalls():
        expr = agg_call.getKey()
        # Find out which aggregation function to use; the fully-qualified
        # name is split into the schema and the plain function name.
        schema_name, aggregation_name = context.fqn(
            expr.getAggregation().getNameAsId())
        aggregation_name = aggregation_name.lower()
        # Find out about the input column
        inputs = expr.getArgList()
        if aggregation_name == "regr_count":
            # REGR_COUNT counts rows where the involved columns are all
            # non-null; we precompute that condition into a temporary
            # boolean column and aggregate over it instead.
            is_null = IsNullOperation()
            two_columns_proxy = new_temporary_column(df)
            if len(inputs) == 1:
                # calcite some times gives one input/col to regr_count and
                # another col has filter column
                col1 = cc.get_backend_by_frontend_index(inputs[0])
                df = df.assign(**{two_columns_proxy: (~is_null(df[col1]))})
            else:
                col1 = cc.get_backend_by_frontend_index(inputs[0])
                col2 = cc.get_backend_by_frontend_index(inputs[1])
                # both cols should be not null
                df = df.assign(
                    **{
                        two_columns_proxy: (~is_null(df[col1])
                                            & (~is_null(df[col2])))
                    })
            input_col = two_columns_proxy
        elif len(inputs) == 1:
            input_col = cc.get_backend_by_frontend_index(inputs[0])
        elif len(inputs) == 0:
            # Zero-argument aggregations (e.g. COUNT(*)) operate on the
            # caller-provided helper column.
            input_col = additional_column_name
        else:
            raise NotImplementedError(
                "Can not cope with more than one input")

        # Extract flags (filtering/distinct)
        if expr.isDistinct():  # pragma: no cover
            raise ValueError("Apache Calcite should optimize them away!")

        filter_column = None
        if expr.hasFilter():
            filter_column = cc.get_backend_by_frontend_index(
                expr.filterArg)

        # Resolve the implementation: built-in mapping first, then the
        # user-defined functions registered on the resolved schema.
        try:
            aggregation_function = self.AGGREGATION_MAPPING[
                aggregation_name]
        except KeyError:
            try:
                aggregation_function = context.schema[
                    schema_name].functions[aggregation_name]
            except KeyError:  # pragma: no cover
                raise NotImplementedError(
                    f"Aggregation function {aggregation_name} not implemented (yet)."
                )
        if isinstance(aggregation_function, AggregationSpecification):
            # Let the specification pick the variant matching the input
            # column's dtype.
            aggregation_function = aggregation_function.get_supported_aggregation(
                df[input_col])

        # Finally, extract the output column name
        output_col = str(agg_call.getValue())

        # Store the aggregation, grouped by its filter column
        key = filter_column
        value = (input_col, output_col, aggregation_function)
        collected_aggregations[key].append(value)
        output_column_order.append(output_col)

    return collected_aggregations, output_column_order, df