Example #1
def _post_process_empty(
    result: Any,
    parent: pd.DataFrame,
    order_by: List[str],
    group_by: List[str],
    timecontext: Optional[TimeContext],
) -> pd.Series:
    # This is the post process for a window with neither groupby nor
    # orderby. `result` could be a Series or a scalar generated by the
    # `agg` method of class `Window`. For a window without groupby or
    # orderby, `agg` calls the pandas method directly, so if a
    # timecontext is present we need to insert the 'time' column into
    # the index for trimming the result. For cases where groupby or
    # orderby is present, `agg` calls the Ibis functions
    # `window_agg_built_in` and `window_agg_udf`, and the time context
    # is already inserted there.
    assert not order_by and not group_by
    if isinstance(result, pd.Series):
        # `result` is a Series when an analytic operation is being
        # applied over the window, since analytic operations are N->N
        if timecontext:
            result = construct_time_context_aware_series(result, parent)
        return result
    else:
        # `result` is a scalar when a reduction operation is being
        # applied over the window, since reduction operations are N->1
        index = parent.index
        result = pd.Series([result]).repeat(len(index))
        result.index = index
        if timecontext:
            result = construct_time_context_aware_series(result, parent)
        return result
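
The scalar branch above is essentially a broadcast: a reduction produces one value per window (N->1), and that value is repeated over the parent index so the output aligns row-for-row with the input. A minimal standalone pandas sketch of that step, with made-up data and names that are not part of the Ibis source:

import pandas as pd

# stand-in for the parent DataFrame passed to _post_process_empty
parent = pd.DataFrame({'value': [1.0, 2.0, 3.0]}, index=[10, 20, 30])
scalar_result = parent['value'].sum()  # a reduction yields a single scalar

# repeat the scalar once per parent row, then re-attach the parent index
broadcast = pd.Series([scalar_result]).repeat(len(parent.index))
broadcast.index = parent.index
# broadcast now holds 6.0 at each of the index labels 10, 20, 30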
Example #2
def test_construct_time_context_aware_series(time_df3):
    """Unit test for `construct_time_context_aware_series`
    """
    # Series without 'time' index will result in a MultiIndex with 'time'
    df = time_df3
    expected = df['value']
    time_index = pd.Index(df['time'])
    expected.index = pd.MultiIndex.from_arrays(
        [expected.index, time_index],
        names=expected.index.names + ['time'],
    )
    result = construct_time_context_aware_series(df['value'], df)
    tm.assert_series_equal(result, expected)

    # Series with 'time' as index will not change
    time_indexed_df = time_df3.set_index('time')
    expected_time_aware = time_indexed_df['value']
    result_time_aware = construct_time_context_aware_series(
        time_indexed_df['value'], time_indexed_df)
    tm.assert_series_equal(result_time_aware, expected_time_aware)

    # Series with a MultiIndex, where 'time' is in the MultiIndex,
    # will not change
    multi_index_time_aware_series = result_time_aware
    expected_multi_index_time_aware = result_time_aware
    result_multi_index_time_aware = construct_time_context_aware_series(
        multi_index_time_aware_series, time_indexed_df)
    tm.assert_series_equal(result_multi_index_time_aware,
                           expected_multi_index_time_aware)

    # Series with a MultiIndex, where 'time' is NOT in the MultiIndex,
    # 'time' will be added into the MultiIndex
    multi_index_series = df['id']
    expected_multi_index = df['id'].copy()
    other_index = pd.Index(df['value'])
    expected_multi_index.index = pd.MultiIndex.from_arrays(
        [expected_multi_index.index, other_index, time_index],
        names=expected_multi_index.index.names + ['value', 'time'],
    )
    multi_index_series.index = pd.MultiIndex.from_arrays(
        [multi_index_series.index, other_index],
        names=multi_index_series.index.names + ['value'],
    )
    result_multi_index = construct_time_context_aware_series(
        multi_index_series, df)
    tm.assert_series_equal(result_multi_index, expected_multi_index)
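
The test above pins down the contract of `construct_time_context_aware_series`: the parent's 'time' column is appended as an extra index level when it is missing, and the series is returned untouched when 'time' is already an index level (plain index or MultiIndex). A simplified standalone sketch of that contract, assuming only pandas and not claiming to match the actual Ibis implementation:

import pandas as pd

def append_time_level(series: pd.Series, parent: pd.DataFrame) -> pd.Series:
    # hypothetical helper mirroring the behaviour checked by the test
    names = list(series.index.names)
    if 'time' in names:
        return series  # already time-aware, leave unchanged
    series = series.copy()
    levels = [
        series.index.get_level_values(i) for i in range(series.index.nlevels)
    ]
    series.index = pd.MultiIndex.from_arrays(
        levels + [pd.Index(parent['time'])],
        names=names + ['time'],
    )
    return series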
Example #3
    def agg(
        self,
        grouped_data: Union[pd.Series, SeriesGroupBy],
        function: Union[str, Callable],
        *args: Any,
        **kwargs: Any,
    ) -> pd.Series:
        # avoid a pandas warning about numpy arrays being passed through
        # directly
        group_by = self.group_by
        order_by = self.order_by

        assert group_by or order_by

        # Get the DataFrame from which the operand originated
        # (passed in when constructing this context object in
        # execute_node(ops.WindowOp))
        parent = self.parent
        frame = getattr(parent, 'obj', parent)
        obj = getattr(grouped_data, 'obj', grouped_data)
        name = obj.name
        if frame[name] is not obj or name in group_by or name in order_by:
            name = f"{name}_{ibis.util.guid()}"
            frame = frame.assign(**{name: obj})

        # set the index to our order_by keys and append it to the existing
        # index
        # TODO: see if we can do this in the caller, when the context
        # is constructed rather than pulling out the data
        columns = group_by + order_by + [name]
        # Create a new frame to avoid mutating the original one
        indexed_by_ordering = frame[columns].copy()
        # placeholder column to compute window_sizes below
        indexed_by_ordering['_placeholder'] = 0
        indexed_by_ordering = indexed_by_ordering.set_index(
            order_by
        ).sort_index(kind="stable")

        # regroup if needed
        if group_by:
            grouped_frame = indexed_by_ordering.groupby(group_by)
        else:
            grouped_frame = indexed_by_ordering
        grouped = grouped_frame[name]

        if callable(function):
            # To compute the window_size, we need to construct a
            # RollingGroupby and compute the count using construct_window.
            # However, if the RollingGroupby is not numeric, e.g. when
            # we are calling a window UDF on a timestamp column, we
            # cannot compute the rolling count directly because:
            # (1) windowed.count() will exclude NaN observations,
            #     which results in incorrect window sizes.
            # (2) windowed.apply(len, raw=True) will include NaN
            #     observations, but doesn't work on non-numeric types.
            #     https://github.com/pandas-dev/pandas/issues/23002
            # To deal with this, we create a _placeholder column.

            windowed_frame = self.construct_window(grouped_frame)
            window_sizes = (
                windowed_frame['_placeholder'].count().reset_index(drop=True)
            )
            mask = ~(window_sizes.isna())
            window_upper_indices = pd.Series(range(len(window_sizes))) + 1
            window_lower_indices = window_upper_indices - window_sizes
            # The result Series of the UDF may need to be trimmed by
            # timecontext. In order to do so, 'time' must be added as
            # an index level to the Series, if present. Here we extract
            # the time column from the parent DataFrame `frame`.
            if get_time_col() in frame:
                result_index = construct_time_context_aware_series(
                    obj, frame
                ).index
            else:
                result_index = obj.index
            result = window_agg_udf(
                grouped_data,
                function,
                window_lower_indices,
                window_upper_indices,
                mask,
                result_index,
                self.dtype,
                self.max_lookback,
                *args,
                **kwargs,
            )
        else:
            # perform the per-group rolling operation
            windowed = self.construct_window(grouped)
            result = window_agg_built_in(
                frame,
                windowed,
                function,
                self.max_lookback,
                *args,
                **kwargs,
            )
        try:
            return result.astype(self.dtype, copy=False)
        except (TypeError, ValueError):
            return result
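
The index arithmetic in the UDF branch is easier to see on a toy example. The sketch below uses plain pandas with illustrative data (not Ibis code) to show how rolling counts over a constant placeholder column become the half-open [lower, upper) row ranges consumed by window_agg_udf:

import pandas as pd

placeholder = pd.Series([0, 0, 0, 0, 0])  # stands in for '_placeholder'
window_sizes = (
    placeholder.rolling(3, min_periods=1).count().reset_index(drop=True)
)  # -> [1, 2, 3, 3, 3]

window_upper_indices = pd.Series(range(len(window_sizes))) + 1  # [1, 2, 3, 4, 5]
window_lower_indices = window_upper_indices - window_sizes      # [0, 0, 0, 1, 2]
# row i is aggregated over positions [lower_i, upper_i); e.g. row 4
# covers positions [2, 5), i.e. the last three rows of the frame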