def compute_projection_scalar_expr(
    expr,
    parent,
    data,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    name = expr.get_name()
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    if scope is None:
        scope = Scope()

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )
    scalar = execute(expr, scope=scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
def execute_and_reset(
    expr,
    params=None,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    **kwargs,
):
    """Execute an expression against data that are bound to it.

    If no data are bound, raise an Exception.

    Notes
    -----
    The difference between this function and
    :func:`~ibis.dask.core.execute` is that this function resets the index
    of the result, if the result has an index.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute
    params : Mapping[ibis.expr.types.Expr, object]
        The data that an unbound parameter in `expr` maps to
    scope : Mapping[ibis.expr.operations.Node, object]
        Additional scope, mapping ibis operations to data
    timecontext : Optional[TimeContext]
        timecontext needed for execution
    aggcontext : Optional[ibis.dask.aggcontext.AggregationContext]
        An object indicating how to compute aggregations. For example, a
        rolling mean needs to be computed differently than the mean of a
        column.
    kwargs : Dict[str, object]
        Additional arguments that can potentially be used by individual node
        execution

    Returns
    -------
    result : Union[
        dask.dataframe.Series,
        dask.dataframe.DataFrame,
        ibis.dask.core.simple_types,
    ]

    Raises
    ------
    ValueError
        * If no data are bound to the input expression
    """
    result = execute(
        expr,
        params=params,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    if isinstance(result, dd.DataFrame):
        schema = expr.schema()
        df = result.reset_index()
        return df[schema.names]
    elif isinstance(result, dd.Series):
        return result.reset_index(drop=True)
    return result
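# A minimal usage sketch (illustrative only, not part of the module) showing
# why the index reset matters. The ``client`` and the table/column names
# below are hypothetical.
def _example_execute_and_reset(client):
    """Run a grouped aggregation and get back a frame with a fresh index.

    Executed with plain ``execute``, the grouping key can end up in the
    result's index; ``execute_and_reset`` instead returns the columns in
    schema order with a zero-based index.
    """
    t = client.table('events')  # hypothetical table with 'user_id', 'value'
    expr = t.group_by('user_id').aggregate(n=t['value'].count())
    return execute_and_reset(expr)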
def dask_execute_node_expr_list(op, sequence, **kwargs):
    # If no element of the list is a dask Series, defer to the pandas
    # implementation; otherwise build a one-row dask DataFrame from the
    # executed elements.
    if all(type(s) != dd.Series for s in sequence):
        return execute_node_expr_list(op, sequence, **kwargs)
    columns = [e.get_name() for e in op.exprs]
    schema = ibis.schema(list(zip(columns, (e.type() for e in op.exprs))))
    data = {col: [execute(el, **kwargs)] for col, el in zip(columns, sequence)}
    return schema.apply_to(
        dd.from_pandas(pd.DataFrame(data, columns=columns), npartitions=1)
    )
def insert(
    self, path, key, expr, format='table', data_columns=True, **kwargs
):
    path = self.root / path
    data = execute(expr)
    data.to_hdf(
        str(path), key, format=format, data_columns=data_columns, **kwargs
    )
def _compute_predicates(
    table_op,
    predicates,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.Column]
    data : pd.DataFrame
    scope : Scope
    timecontext : Optional[TimeContext]
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we
        # compute predicates on the result instead of any left or right
        # tables if the Selection is on a Join. Project data to only
        # include columns from the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        data_columns = frozenset(data.columns)

        additional_scope = Scope()
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            new_data = map_new_column_names_to_data(mapping, data)
            additional_scope = additional_scope.merge_scope(
                Scope({root_table: new_data}, timecontext)
            )

        scope = scope.merge_scope(additional_scope)
        yield execute(predicate, scope=scope, **kwargs)
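# Illustrative sketch only: ``_compute_predicates`` is a generator, so a
# Selection-style caller typically AND-reduces its output into one boolean
# mask before filtering. This helper is hypothetical and mirrors that use.
def _example_filter_with_predicates(
    table_op, predicates, data, scope, timecontext, **kwargs
):
    """AND together the computed predicates and filter ``data`` with them."""
    predicate = functools.reduce(
        operator.and_,
        _compute_predicates(
            table_op, predicates, data, scope, timecontext, **kwargs
        ),
    )
    return data.loc[predicate]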
def compute_sort_key(key, data, timecontext, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        # The key is an unnamed, computed expression: execute it against
        # the data and sort by the resulting column under a generated name.
        if scope is None:
            scope = Scope()
        scope = scope.merge_scopes(
            Scope({t: data}, timecontext) for t in by.op().root_tables()
        )
        new_column = execute(by, scope=scope, **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = expr._safe_name
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name
        assert isinstance(name, str)

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        suffix = util.get_join_suffix_for_op(op, parent_table_op)
        return data.loc[:, name + suffix].rename(result_name or name)

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )

    result = coerce_to_output(
        execute(expr, scope=scope, timecontext=timecontext, **kwargs),
        expr,
        data.index,
    )
    return result
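# Illustrative sketch only: why the join-suffix lookup above exists. When
# both sides of a join share a column name, the materialized frame carries
# suffixed columns (pandas defaults to '_x'/'_y'), so a bare column name may
# be missing from ``data`` and must be resolved via its side's suffix.
def _example_join_suffixes():
    """Hypothetical demonstration with plain pandas frames."""
    left = pd.DataFrame({'key': [1, 2], 'value': [10, 20]})
    right = pd.DataFrame({'key': [1, 2], 'value': [30, 40]})
    joined = left.merge(right, on='key')  # columns: key, value_x, value_y
    return joined.loc[:, 'value_x']  # the suffixed lookup this path performs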
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
def insert(self, path, expr, index=False, **kwargs):
    path = self.root / path
    data = execute(expr)
    data.to_csv(str(path), index=index, **kwargs)
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    if window.how == "range" and any(
        not isinstance(ob.type(), (dt.Time, dt.Date, dt.Timestamp))
        for ob in window._order_by
    ):
        raise NotImplementedError(
            "The pandas backend only implements range windows with temporal "
            "ordering keys"
        )

    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients, scope=scope
        )
        # timecontext is the original time context required by the parent
        # node of this Window, while adjusted_timecontext is the adjusted
        # context of this Window; since we are doing a manual execution
        # here, use adjusted_timecontext in later execution phases
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    if scope is None:
        scope = pre_executed_scope
    else:
        scope = scope.merge_scope(pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )
    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = []

    post_process: Callable[
        [Any, pd.DataFrame, List[str], List[str], Optional[TimeContext]],
        pd.Series,
    ]
    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten with a groupby object,
    # so we force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    result = post_process(
        result,
        data,
        ordering_keys,
        grouping_keys,
        adjusted_timecontext,
    )
    assert len(data) == len(
        result
    ), 'input data source and computed column do not have the same length'

    # trim data to original time context
    result = trim_window_result(result, timecontext)
    return result
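# Illustrative sketch only: the shape of expression ``execute_window_op``
# handles. The ``client`` and the table/column names are hypothetical.
def _example_window_expr(client):
    """Build a grouped, ordered cumulative sum over a window."""
    t = client.table('events')  # hypothetical: 'user_id', 'ts', 'value'
    w = ibis.cumulative_window(group_by=t['user_id'], order_by=t['ts'])
    return t['value'].sum().over(w)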
def execute_node_value_list(op, _, **kwargs):
    return [execute(arg, **kwargs) for arg in op.values]
def execute_alias(op, _, **kwargs):
    # just compile the underlying argument because the naming is handled
    # by the translator for the top level expression
    return execute(op.arg, **kwargs)
def execute_aggregation_dataframe(
    op,
    data,
    metrics,
    by,
    having,
    predicates,
    sort_keys,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    assert metrics, 'no metrics found during aggregation execution'

    if sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (
                execute(p, scope=scope, timecontext=timecontext, **kwargs)
                for p in predicates
            ),
        )
        data = data.loc[predicate]

    columns: Dict[str, str] = {}

    if op.by:
        grouping_key_pairs = list(
            zip(by, map(operator.methodcaller('op'), by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(
                by, scope=scope, timecontext=timecontext, **kwargs
            ).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    scope = scope.merge_scope(Scope({op.table.op(): source}, timecontext))

    pieces = [
        coerce_to_output(
            execute(metric, scope=scope, timecontext=timecontext, **kwargs),
            metric,
        )
        for metric in metrics
    ]

    result = pd.concat(pieces, axis=1)

    # If grouping, need a reset to get the grouping key back as a column
    if by:
        result = result.reset_index()

    result.columns = [columns.get(c, c) for c in result.columns]

    if having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(h, scope=scope, timecontext=timecontext, **kwargs)
                for h in having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]

    return result
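# Illustrative sketch only: an aggregation with grouping keys and a
# ``having`` filter, the kind of node the function above executes. The
# ``client`` and the table/column names are hypothetical.
def _example_aggregation_with_having(client):
    """Group, filter on an aggregate predicate, then compute a metric."""
    t = client.table('events')  # hypothetical: 'user_id', 'value'
    return (
        t.group_by('user_id')
        .having(t['value'].count() > 10)
        .aggregate(total=t['value'].sum())
    )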
def execute_table_array_view(op, _, **kwargs):
    return execute(op.table).squeeze()