def build_df_from_projection(
    selection_exprs: List[ir.Expr],
    op: ops.Selection,
    data: dd.DataFrame,
    **kwargs,
) -> dd.DataFrame:
    """Assemble a dataframe by lowering each projection expression.

    Each expression is dispatched to `compute_projection`; the resulting
    pieces are stitched back into a single dask DataFrame.
    """
    # Fast path: the first expression is the table itself and every
    # remaining expression preserves row order, so we can assign the new
    # columns straight onto the existing frame.
    if (selection_exprs[0] is op.table) and all(
        is_row_order_preserving(selection_exprs[1:])
    ):
        for expr in selection_exprs[1:]:
            piece = compute_projection(expr, op, data, **kwargs)
            if isinstance(piece, dd.Series):
                data = data.assign(**{piece.name: piece})
            else:
                data = data.assign(
                    **{col: piece[col] for col in piece.columns}
                )
        return data

    # Slow path: direct assignment is not safe. Attach a unique,
    # partition-sorted index so dd.concat can align the independently
    # computed pieces row-for-row, then discard it again.
    data = add_partitioned_sorted_column(data)
    pieces = [
        compute_projection(expr, op, data, **kwargs)
        for expr in selection_exprs
    ]
    return dd.concat(pieces, axis=1).reset_index(drop=True)
def build_df_from_projection(
    selections: List[ir.Expr],
    op: ops.Selection,
    data: dd.DataFrame,
    **kwargs,
) -> dd.DataFrame:
    """Construct a dataframe from per-expression projection pieces.

    Every selection is lowered through `compute_projection` and the
    results are merged column-wise.
    """
    # A unique, partition-sorted index lets dd.concat line the pieces up
    # row-for-row; it is dropped once the pieces are merged.
    data = add_partitioned_sorted_column(data)
    joined = dd.concat(
        [compute_projection(sel, op, data, **kwargs) for sel in selections],
        axis=1,
    )
    return joined.reset_index(drop=True)
def execute_selection_dataframe(
    op, data, scope: Scope, timecontext: Optional[TimeContext], **kwargs
):
    """Execute a Selection node against a dask DataFrame.

    Applies, in order: column projections, filter predicates, and sorting.
    The dask backend only supports single-key, ascending sorts.

    Parameters
    ----------
    op : ops.Selection
        The selection operation being executed.
    data : dd.DataFrame
        The dask frame backing ``op.table``.
    scope : Scope
    timecontext : Optional[TimeContext]

    Returns
    -------
    dd.DataFrame

    Raises
    ------
    NotImplementedError
        For multi-key or descending sorts.
    """
    result = data

    # Projections: lower each selected expression and merge column-wise.
    if op.selections:
        result = dd.concat(
            [
                compute_projection(
                    selection,
                    op,
                    data,
                    scope=scope,
                    timecontext=timecontext,
                    **kwargs,
                )
                for selection in op.selections
            ],
            axis=1,
        )

    # Predicates: reduce all predicates to one boolean mask and filter.
    if op.predicates:
        predicates = _compute_predicates(
            op.table.op(), op.predicates, data, scope, timecontext, **kwargs
        )
        predicate = functools.reduce(operator.and_, predicates)
        assert len(predicate) == len(
            result
        ), 'Selection predicate length does not match underlying table'
        result = result.loc[predicate]

    # Sorting: only a single ascending key is supported on dask.
    if op.sort_keys:
        if len(op.sort_keys) > 1:
            raise NotImplementedError(
                """
                Multi-key sorting is not implemented for the Dask backend
                """
            )
        sort_key = op.sort_keys[0]
        # NOTE(review): assumes a missing `ascending` attribute on the sort
        # op means ascending — confirm against the expression types used.
        ascending = getattr(sort_key.op(), 'ascending', True)
        if not ascending:
            raise NotImplementedError(
                "Descending sort is not supported for the Dask backend"
            )
        result = compute_sorted_frame(
            result,
            order_by=sort_key,
            scope=scope,
            timecontext=timecontext,
            **kwargs,
        )

    # The original implementation carried pandas-backend cleanup code for
    # temporary grouping/ordering columns after this point, but both key
    # tuples were unconditionally bound to ``()`` first, making the
    # assert vacuous and the ``drop`` of temporary columns unreachable;
    # that dead tail has been removed.
    return result