def _extract_groupby(
    self,
    df: dd.DataFrame,
    window: org.apache.calcite.rel.core.Window.Group,
    dc: DataContainer,
    context: "dask_sql.Context",
) -> Tuple[dd.DataFrame, List[str]]:
    """Prepare grouping columns we can later use while applying the main function"""
    partition_keys = list(window.keys)
    if partition_keys:
        group_columns = [
            df[dc.column_container.get_backend_by_frontend_index(o)]
            for o in partition_keys
        ]
        group_columns = get_groupby_with_nulls_cols(df, group_columns)
        group_columns = {
            new_temporary_column(df): group_col for group_col in group_columns
        }
    else:
        group_columns = {new_temporary_column(df): 1}

    df = df.assign(**group_columns)
    group_columns = list(group_columns.keys())

    return df, group_columns
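# `new_temporary_column` is used throughout these helpers to generate column names
# that cannot clash with user data. The following is only a minimal sketch of such a
# helper (the real dask_sql implementation may differ); it assumes a uuid-based name
# that is re-drawn until it does not collide with an existing column.

import uuid

import dask.dataframe as dd


def new_temporary_column_sketch(df: dd.DataFrame) -> str:
    """Return a random column name that does not exist in ``df``."""
    while True:
        col_name = str(uuid.uuid4())
        if col_name not in df.columns:
            return col_name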
def get_groupby_with_nulls_cols(
    df: dd.DataFrame,
    group_columns: List[dd.Series],
    additional_column_name: str = None,
):
    """
    SQL and dask treat null values in grouping columns differently:
    SQL puts them at the front, whereas dask simply ignores them.
    Therefore we use the same trick as fugue does:
    we group by both the null indicator and the (NaN-filled) real column value.
    """
    if additional_column_name is None:
        additional_column_name = new_temporary_column(df)

    group_columns_and_nulls = []
    for group_column in group_columns:
        # the ~ makes NaN come first
        is_null_column = ~(group_column.isnull())
        non_nan_group_column = group_column.fillna(0)

        group_columns_and_nulls += [is_null_column, non_nan_group_column]

    if not group_columns_and_nulls:
        # This can happen in statements like
        # SELECT SUM(x) FROM data
        # without any groupby statement
        group_columns_and_nulls = [additional_column_name]

    return group_columns_and_nulls
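# A minimal illustration of the null-safe grouping trick above, on plain pandas
# (dask follows the same semantics). The column names and values are made up for
# the example; grouping by the (not-null indicator, NaN-filled value) pair keeps
# the NaN rows as their own group instead of dropping them.

import pandas as pd

example_df = pd.DataFrame({"a": [1.0, None, 1.0, 2.0], "x": [10, 20, 30, 40]})

is_null_column = ~example_df["a"].isnull()       # False for NaN -> sorts first
non_nan_group_column = example_df["a"].fillna(0)

grouped = example_df.groupby([is_null_column, non_nan_group_column]).agg({"x": "sum"})
# The NaN row survives as its own group (keyed by (False, 0.0)) and comes first,
# matching SQL's NULLS FIRST behaviour; a plain groupby("a") would silently drop it.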
def _perform_aggregation(
    self,
    df: dd.DataFrame,
    filter_column: str,
    aggregations: List[Tuple[str, str, Any]],
    additional_column_name: str,
    group_columns: List[str],
    groupby_agg_options: Dict[str, Any] = {},
):
    tmp_df = df

    # format aggregations for Dask; also check if we can use the fast path for
    # groupby, which is only supported if we are not using any custom aggregations
    # and our pandas version supports dropna for groupbys
    aggregations_dict = defaultdict(dict)
    fast_groupby = True
    for aggregation in aggregations:
        input_col, output_col, aggregation_f = aggregation
        aggregations_dict[input_col][output_col] = aggregation_f
        if not isinstance(aggregation_f, str):
            fast_groupby = False

    # filter dataframe if specified
    if filter_column:
        filter_expression = tmp_df[filter_column]
        tmp_df = tmp_df[filter_expression]
        logger.debug(f"Filtered by {filter_column} before aggregation.")

    # we might need a temporary column name if no groupby columns are specified
    if additional_column_name is None:
        additional_column_name = new_temporary_column(df)

    # perform groupby operation; if we are using custom aggregations, we must handle
    # null values manually (this is slow)
    if fast_groupby:
        grouped_df = tmp_df.groupby(
            by=(group_columns or [additional_column_name]), dropna=False
        )
    else:
        group_columns = [tmp_df[group_column] for group_column in group_columns]
        group_columns_and_nulls = get_groupby_with_nulls_cols(
            tmp_df, group_columns, additional_column_name
        )
        grouped_df = tmp_df.groupby(by=group_columns_and_nulls)

    # apply the aggregation(s)
    logger.debug(f"Performing aggregation {dict(aggregations_dict)}")
    agg_result = grouped_df.agg(aggregations_dict, **groupby_agg_options)

    # fix the column names to a single level
    agg_result.columns = agg_result.columns.get_level_values(-1)

    return agg_result
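# For reference, the `aggregations` list handed to `_perform_aggregation` consists of
# (input_col, output_col, aggregation) triples, and the loop above reshapes it into the
# nested dict that is then passed to the grouped `.agg(...)` call. A small, made-up
# example of that reshaping (all column names are hypothetical):

from collections import defaultdict

aggregations = [
    ("a", "sum_of_a", "sum"),
    ("a", "max_of_a", "max"),
    ("b", "count_of_b", "count"),
]

aggregations_dict = defaultdict(dict)
for input_col, output_col, aggregation_f in aggregations:
    aggregations_dict[input_col][output_col] = aggregation_f

assert dict(aggregations_dict) == {
    "a": {"sum_of_a": "sum", "max_of_a": "max"},
    "b": {"count_of_b": "count"},
}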
def _extract_operations(
    self,
    window: org.apache.calcite.rel.core.Window.Group,
    df: dd.DataFrame,
    dc: DataContainer,
    context: "dask_sql.Context",
) -> Tuple[List[Tuple[Callable, str, List[str]]], dd.DataFrame]:
    # Collect the operations that will later be applied on each group separately
    operations = []

    for agg_call in window.aggCalls:
        operator = agg_call.getOperator()
        operator_name = str(operator.getName())
        operator_name = operator_name.lower()

        try:
            operation = self.OPERATION_MAPPING[operator_name]
        except KeyError:  # pragma: no cover
            try:
                operation = context.schema[context.schema_name].functions[
                    operator_name
                ]
            except KeyError:  # pragma: no cover
                raise NotImplementedError(f"{operator_name} not (yet) implemented")

        logger.debug(f"Executing {operator_name} on {str(LoggableDataFrame(df))}")

        # TODO: can be optimized by re-using already present columns
        temporary_operand_columns = {
            new_temporary_column(df): RexConverter.convert(o, dc, context=context)
            for o in agg_call.getOperands()
        }
        df = df.assign(**temporary_operand_columns)
        temporary_operand_columns = list(temporary_operand_columns.keys())

        operations.append(
            (operation, new_temporary_column(df), temporary_operand_columns)
        )

    return operations, df
def _preserve_index_and_sort(
    self, df: dd.DataFrame
) -> Tuple[dd.DataFrame, str, str, str]:
    """Store the partition number, index and sort order separately to make any shuffling reversible"""
    partition_col, index_col, sort_col = (
        new_temporary_column(df),
        new_temporary_column(df),
        new_temporary_column(df),
    )

    def store_index_columns(partition, partition_index):
        return partition.assign(
            **{
                partition_col: partition_index,
                index_col: partition.index,
                sort_col: range(len(partition)),
            }
        )

    df = map_on_partition_index(df, store_index_columns)

    return df, partition_col, index_col, sort_col
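# `map_on_partition_index` is a dask_sql helper; below is only a minimal sketch of what
# it is assumed to do, built on dask's public `map_partitions` API. Dask passes a
# `partition_info` dict (containing the partition "number") to functions that accept
# that keyword, which lets us forward the partition index to the user function.

import dask.dataframe as dd


def map_on_partition_index_sketch(df: dd.DataFrame, f):
    return df.map_partitions(
        lambda partition, partition_info=None: f(
            partition, partition_info["number"] if partition_info else 0
        )
    )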
def convert(
    self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
) -> DataContainer:
    # Get the input of the previous step
    (dc,) = self.assert_inputs(rel, 1, context)

    df = dc.df
    cc = dc.column_container

    # Collect all (new) columns
    named_projects = rel.getNamedProjects()

    column_names = []
    new_columns = {}
    new_mappings = {}
    for expr, key in named_projects:
        key = str(key)
        column_names.append(key)

        # shortcut: if we have a column already, there is no need to re-assign it again
        # this is only the case if the expr is a RexInputRef
        if isinstance(expr, org.apache.calcite.rex.RexInputRef):
            index = expr.getIndex()
            backend_column_name = cc.get_backend_by_frontend_index(index)
            logger.debug(
                f"Not re-adding the same column {key} (but just referencing it)"
            )
            new_mappings[key] = backend_column_name
        else:
            random_name = new_temporary_column(df)
            new_columns[random_name] = RexConverter.convert(
                expr, dc, context=context
            )
            logger.debug(f"Adding a new column {key} out of {expr}")
            new_mappings[key] = random_name

    # Actually add the new columns
    if new_columns:
        df = df.assign(**new_columns)

    # and the new mappings
    for key, backend_column_name in new_mappings.items():
        cc = cc.add(key, backend_column_name)

    # Make sure the order is correct
    cc = cc.limit_to(column_names)

    cc = self.fix_column_to_row_type(cc, rel.getRowType())
    dc = DataContainer(df, cc)
    dc = self.fix_dtype_to_row_type(dc, rel.getRowType())

    return dc
def _extract_groupby(
    self,
    df: dd.DataFrame,
    window: org.apache.calcite.rex.RexWindow,
    dc: DataContainer,
    context: "dask_sql.Context",
) -> Tuple[dd.DataFrame, List[str]]:
    """Prepare grouping columns we can later use while applying the main function"""
    partition_keys = list(window.partitionKeys)
    if partition_keys:
        group_columns = [
            RexConverter.convert(o, dc, context=context) for o in partition_keys
        ]
        group_columns = get_groupby_with_nulls_cols(df, group_columns)
        group_columns = {
            new_temporary_column(df): group_col for group_col in group_columns
        }
    else:
        group_columns = {new_temporary_column(df): 1}

    df = df.assign(**group_columns)
    group_columns = list(group_columns.keys())

    return df, group_columns
def _do_aggregations(
    self,
    rel: "org.apache.calcite.rel.RelNode",
    dc: DataContainer,
    group_columns: List[str],
    context: "dask_sql.Context",
) -> Tuple[dd.DataFrame, List[str]]:
    """
    Main functionality: return the result dataframe
    and the output column order
    """
    df = dc.df
    cc = dc.column_container

    # We might need it later.
    # If not, let's hope that adding a single column
    # is not a huge problem...
    additional_column_name = new_temporary_column(df)
    df = df.assign(**{additional_column_name: 1})

    # Add an entry for every grouped column, as SQL wants them first
    output_column_order = group_columns.copy()

    # Collect all aggregations we need to do
    collected_aggregations, output_column_order, df = self._collect_aggregations(
        rel, df, cc, context, additional_column_name, output_column_order
    )

    if not collected_aggregations:
        return df[group_columns].drop_duplicates(), output_column_order

    # SQL needs to have a column with the grouped values as the first
    # output column.
    # As the values of the group columns
    # are the same for a single group anyways, we just use the first row
    for col in group_columns:
        collected_aggregations[None].append((col, col, "first"))

    # Now we can go ahead and use these grouped aggregations
    # to perform the actual aggregation.
    # It is very important to start with the non-filtered entry.
    # Otherwise we might lose some entries in the grouped columns
    df_result = None
    key = None
    if key in collected_aggregations:
        aggregations = collected_aggregations.pop(key)
        df_result = self._perform_aggregation(
            df,
            None,
            aggregations,
            additional_column_name,
            group_columns,
        )

    # Now we can also do the rest
    for filter_column, aggregations in collected_aggregations.items():
        agg_result = self._perform_aggregation(
            df,
            filter_column,
            aggregations,
            additional_column_name,
            group_columns,
        )

        # ... and finally concat the new data with the already present columns
        if df_result is None:
            df_result = agg_result
        else:
            df_result = df_result.assign(
                **{col: agg_result[col] for col in agg_result.columns}
            )

    return df_result, output_column_order
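# The filtered results are attached above purely via `assign`, which aligns on the
# grouped index. A minimal pandas illustration (made-up data) of why the un-filtered
# result must come first: it defines the full set of groups, and groups without any
# rows passing a filter simply end up as NaN in the filtered columns.

import pandas as pd

unfiltered = pd.DataFrame({"sum_a": [3, 7]}, index=pd.Index(["x", "y"], name="c"))
filtered = pd.DataFrame({"sum_b": [1]}, index=pd.Index(["x"], name="c"))

merged = unfiltered.assign(**{col: filtered[col] for col in filtered.columns})
# -> group "y" has no rows passing the filter and gets NaN in "sum_b"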
def _collect_aggregations(
    self,
    rel: "org.apache.calcite.rel.RelNode",
    df: dd.DataFrame,
    cc: ColumnContainer,
    context: "dask_sql.Context",
    additional_column_name: str,
    output_column_order: List[str],
) -> Tuple[
    Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str], dd.DataFrame
]:
    """
    Collect all aggregations which share the same filter column together,
    so that the aggregations only need to be done once.

    Returns the aggregations as a mapping filter_column -> list of aggregations,
    where each aggregation is in the form (input_col, output_col, aggregation function (or string)).
    """
    collected_aggregations = defaultdict(list)

    for agg_call in rel.getNamedAggCalls():
        expr = agg_call.getKey()

        # Find out which aggregation function to use
        schema_name, aggregation_name = context.fqn(
            expr.getAggregation().getNameAsId()
        )
        aggregation_name = aggregation_name.lower()
        # Find out about the input column
        inputs = expr.getArgList()
        if aggregation_name == "regr_count":
            is_null = IsNullOperation()
            two_columns_proxy = new_temporary_column(df)
            if len(inputs) == 1:
                # Calcite sometimes gives only one input/col to regr_count,
                # with the other col given as the filter column
                col1 = cc.get_backend_by_frontend_index(inputs[0])
                df = df.assign(**{two_columns_proxy: (~is_null(df[col1]))})
            else:
                col1 = cc.get_backend_by_frontend_index(inputs[0])
                col2 = cc.get_backend_by_frontend_index(inputs[1])
                # both cols should be not null
                df = df.assign(
                    **{
                        two_columns_proxy: (
                            ~is_null(df[col1]) & (~is_null(df[col2]))
                        )
                    }
                )
            input_col = two_columns_proxy
        elif len(inputs) == 1:
            input_col = cc.get_backend_by_frontend_index(inputs[0])
        elif len(inputs) == 0:
            input_col = additional_column_name
        else:
            raise NotImplementedError("Can not cope with more than one input")

        # Extract flags (filtering/distinct)
        if expr.isDistinct():  # pragma: no cover
            raise ValueError("Apache Calcite should optimize them away!")

        filter_column = None
        if expr.hasFilter():
            filter_column = cc.get_backend_by_frontend_index(expr.filterArg)

        try:
            aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
        except KeyError:
            try:
                aggregation_function = context.schema[schema_name].functions[
                    aggregation_name
                ]
            except KeyError:  # pragma: no cover
                raise NotImplementedError(
                    f"Aggregation function {aggregation_name} not implemented (yet)."
                )
        if isinstance(aggregation_function, AggregationSpecification):
            aggregation_function = aggregation_function.get_supported_aggregation(
                df[input_col]
            )

        # Finally, extract the output column name
        output_col = str(agg_call.getValue())

        # Store the aggregation
        key = filter_column
        value = (input_col, output_col, aggregation_function)
        collected_aggregations[key].append(value)
        output_column_order.append(output_col)

    return collected_aggregations, output_column_order, df
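# To make the returned mapping concrete: for a hypothetical query such as
#   SELECT c, SUM(a), SUM(b) FILTER (WHERE f) FROM t GROUP BY c
# the collected aggregations end up grouped by their filter column, roughly as below
# (the actual backend and output column names are generated and will differ):

collected_aggregations_example = {
    None: [("a", "SUM(a)", "sum")],                       # un-filtered aggregations
    "f": [("b", "SUM(b) FILTER (WHERE f)", "sum")],       # aggregations filtered on column "f"
}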
def _apply_function_over(
    self,
    df: dd.DataFrame,
    f: Callable,
    operands: List[dd.Series],
    window: org.apache.calcite.rex.RexWindow,
    group_columns: List[str],
    sort_columns: List[str],
    sort_ascending: List[bool],
    sort_null_first: List[bool],
) -> Tuple[dd.DataFrame, str]:
    """Apply the given function over the dataframe, possibly grouped and sorted per group"""
    temporary_operand_columns = {
        new_temporary_column(df): operand for operand in operands
    }
    df = df.assign(**temporary_operand_columns)
    # Important: move as few bytes as possible to the pickled function,
    # which is evaluated on the workers
    temporary_operand_columns = temporary_operand_columns.keys()

    # Extract the window definition
    lower_bound = to_bound_description(window.getLowerBound())
    upper_bound = to_bound_description(window.getUpperBound())

    new_column_name = new_temporary_column(df)

    @make_pickable_without_dask_sql
    def map_on_each_group(partitioned_group):
        # Apply sorting
        if sort_columns:
            partitioned_group = sort_partition_func(
                partitioned_group, sort_columns, sort_ascending, sort_null_first
            )

        if f is None:
            # This is the row_number operator.
            # We do not need to do any windowing
            column_result = range(1, len(partitioned_group) + 1)
        else:
            # In all other cases, apply the windowing operation
            if lower_bound.is_unbounded and (
                upper_bound.is_current_row or upper_bound.offset == 0
            ):
                windowed_group = partitioned_group.expanding(min_periods=0)
            elif lower_bound.is_preceding and (
                upper_bound.is_current_row or upper_bound.offset == 0
            ):
                windowed_group = partitioned_group.rolling(
                    window=lower_bound.offset + 1,
                    min_periods=0,
                )
            else:
                lower_offset = (
                    lower_bound.offset if not lower_bound.is_current_row else 0
                )
                if lower_bound.is_preceding and lower_offset is not None:
                    lower_offset *= -1
                upper_offset = (
                    upper_bound.offset if not upper_bound.is_current_row else 0
                )
                if upper_bound.is_preceding and upper_offset is not None:
                    upper_offset *= -1

                indexer = Indexer(lower_offset, upper_offset)
                windowed_group = partitioned_group.rolling(
                    window=indexer, min_periods=0
                )

            column_result = f(windowed_group, *temporary_operand_columns)

        partitioned_group = partitioned_group.assign(
            **{new_column_name: column_result}
        )

        return partitioned_group

    # Currently, pandas will always return a float for windowing operations
    meta = df._meta_nonempty.assign(**{new_column_name: 0.0})

    df = df.groupby(group_columns).apply(map_on_each_group, meta=meta)

    return df, new_column_name
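# The `Indexer` used above comes from dask_sql's window utilities. Below is only a
# minimal sketch of a comparable bounded-window indexer built on pandas' public
# `BaseIndexer` API (the class name and the exact bounds handling are assumptions,
# not the dask_sql implementation).

import numpy as np
from pandas.api.indexers import BaseIndexer


class FixedOffsetIndexer(BaseIndexer):
    """Rolling window spanning rows [i + lower_offset, i + upper_offset] for each row i."""

    def __init__(self, lower_offset: int, upper_offset: int):
        super().__init__()
        self.lower_offset = lower_offset
        self.upper_offset = upper_offset

    def get_window_bounds(
        self, num_values=0, min_periods=None, center=None, closed=None, step=None
    ):
        positions = np.arange(num_values)
        start = np.clip(positions + self.lower_offset, 0, num_values)
        end = np.clip(positions + self.upper_offset + 1, 0, num_values)
        return start, end


# Usage sketch: a frame of "2 PRECEDING AND 1 FOLLOWING" corresponds to offsets (-2, 1):
#   pd.Series(range(5)).rolling(FixedOffsetIndexer(-2, 1), min_periods=0).sum()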