예제 #1
0
def get_aggcontext_window(
    window,
    *,
    scope,
    operand,
    operand_dtype,
    parent,
    group_by,
    order_by,
    **kwargs,
) -> AggregationContext:
    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming

    if not group_by and not order_by:
        aggcontext = agg_ctx.Summarize()
    elif (
        isinstance(
            operand.op(), (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All)
        )
        and order_by
    ):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=parent,
                group_by=group_by,
                order_by=order_by,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=parent,
                group_by=group_by,
                order_by=order_by,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=parent,
            group_by=group_by,
            order_by=order_by,
            dtype=operand_dtype,
        )

    return aggcontext
예제 #2
0
def execute_window_op(op,
                      data,
                      window,
                      scope=None,
                      aggcontext=None,
                      clients=None,
                      **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op,
                                     *clients,
                                     scope=scope,
                                     aggcontext=aggcontext,
                                     **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)

    root, = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, aggcontext=aggcontext, **kwargs) for key, key_op in zip(
                group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data,
                                          order_by,
                                          group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (isinstance(operand.op(),
                     (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All))
          and ordering_keys):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
예제 #3
0
파일: window.py 프로젝트: sestus/ibis
def execute_frame_window_op(op, data, scope=None, context=None, **kwargs):
    operand, window = op.args

    following = window.following
    order_by = window._order_by

    if order_by and following != 0:
        raise ValueError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ir.TableColumn) else execute(
            key, context=context, **kwargs) for key, key_op in zip(
                group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by

    if grouping_keys:
        source = data.groupby(grouping_keys, sort=False, as_index=not order_by)

        if order_by:
            sorted_df = source.apply(
                lambda df, order_by=order_by, kwargs=kwargs:
                (util.compute_sorted_frame(order_by, df, **kwargs)))
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    else:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not order_by:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        preceding = window.preceding
        if preceding is not None:
            context = agg_ctx.Trailing(preceding)
        else:
            context = agg_ctx.Cumulative()
    else:
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(result, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series