Exemplo n.º 1
0
    def setup(self):
        """Create the in-memory table and store the expressions built on it.

        A pandas DataFrame with ``30 * int(2e5)`` rows is registered with the
        ibis pandas backend under the name ``df``.  Each expression derived
        from that table is saved as an attribute on ``self``.  This method
        only builds the expressions; it never calls ``execute`` on them.
        """
        n_days = 30
        n = n_days * int(2e5)

        # The random draws happen in the same order as the columns are laid
        # out (key, low_card_key, value) so a seeded RNG yields the same data.
        key = np.random.choice(16000, size=n)
        low_card_key = np.random.choice(30, size=n)
        value = np.random.rand(n)

        # Two independent 'now' ranges: one kept as datetimes, one as strings.
        timestamps = pd.date_range(start='now', periods=n, freq='s').values
        timestamp_strings = pd.date_range(
            start='now', periods=n, freq='s'
        ).values.astype(str)

        # 30 daily timestamps, each repeated so the column has n entries.
        repeated_timestamps = pd.date_range(
            start='2018-09-01', periods=n_days
        ).repeat(n // n_days)

        columns = {
            'key': key,
            'low_card_key': low_card_key,
            'value': value,
            'timestamps': timestamps,
            'timestamp_strings': timestamp_strings,
            'repeated_timestamps': repeated_timestamps,
        }
        data = pd.DataFrame(columns)

        backend = ibis.pandas.connect({'df': data})
        t = backend.table('df')

        # Group-by on the 16000-value key column.
        by_key = t.groupby(t.key)
        self.high_card_group_by = by_key.aggregate(avg_value=t.value.mean())

        # Casts to date from a timestamp column and from a string column.
        self.cast_to_dates = t.timestamps.cast(dt.date)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

        # Group-by on two keys, one of which is added by a mutate.
        with_dates = t.mutate(dates=t.timestamps.cast('date'))
        by_key_and_date = with_dates.groupby(['low_card_key', 'dates'])
        self.multikey_group_by_with_mutate = by_key_and_date.aggregate(
            avg_value=lambda tbl: tbl.value.mean()
        )

        # Sorts, with and without a preceding column projection.
        self.simple_sort = t.sort_by([t.key])

        key_value = t[['key', 'value']]
        self.simple_sort_projection = key_value.sort_by(['key'])

        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        three_columns = t[['low_card_key', 'key', 'value']]
        self.multikey_sort_projection = three_columns.sort_by(
            ['low_card_key', 'key']
        )

        # Grouped trailing range windows built from a two-day interval and
        # ordered by the repeated daily timestamps.
        low_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        low_card_mean = t.value.mean()
        self.low_card_grouped_rolling = low_card_mean.over(low_card_window)

        high_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        high_card_mean = t.value.mean()
        self.high_card_grouped_rolling = high_card_mean.over(high_card_window)
Exemplo n.º 2
0
    def setup(self):
        """Build the source DataFrame and attach derived expressions to self.

        The frame has ``30 * int(2e5)`` rows and six columns (two integer
        keys, a random float value, two per-second timestamp columns and a
        column of 30 daily timestamps repeated to full length).  It is
        exposed through ``ibis.pandas.connect`` as table ``df``.  Only the
        expressions are constructed here; ``execute`` is never called.
        """
        n = 30 * int(2e5)
        per_second = {'start': 'now', 'periods': n, 'freq': 's'}

        # A dict literal evaluates its values top to bottom, so the order of
        # the random draws and of the two 'now' ranges is fixed by this
        # layout.
        frame = pd.DataFrame(
            {
                'key': np.random.choice(16000, size=n),
                'low_card_key': np.random.choice(30, size=n),
                'value': np.random.rand(n),
                'timestamps': pd.date_range(**per_second).values,
                'timestamp_strings': (
                    pd.date_range(**per_second).values.astype(str)
                ),
                'repeated_timestamps': pd.date_range(
                    start='2018-09-01', periods=30
                ).repeat(n // 30),
            }
        )

        t = ibis.pandas.connect({'df': frame}).table('df')

        # Aggregation grouped by the 16000-value key.
        avg_value = t.value.mean()
        self.high_card_group_by = t.groupby(t.key).aggregate(
            avg_value=avg_value
        )

        # Date casts from a timestamp column and from its string form.
        date_type = dt.date
        self.cast_to_dates = t.timestamps.cast(date_type)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(date_type)

        # Two-key aggregation where one key comes from a mutate.
        dated = t.mutate(dates=t.timestamps.cast('date'))
        self.multikey_group_by_with_mutate = dated.groupby(
            ['low_card_key', 'dates']
        ).aggregate(avg_value=lambda table: table.value.mean())

        # Sorting the full table and column projections of it.
        self.simple_sort = t.sort_by([t.key])
        self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])
        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        projected = t[['low_card_key', 'key', 'value']]
        self.multikey_sort_projection = projected.sort_by(
            ['low_card_key', 'key']
        )

        # Rolling means over grouped trailing range windows (two-day
        # interval), one grouped by the 30-value key and one by the
        # 16000-value key.
        low_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        high_card_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        self.low_card_grouped_rolling = t.value.mean().over(low_card_window)
        self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
Exemplo n.º 3
0
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Compiling a windowed mean with this ``preceding`` raises ValueError.

    ``value`` is accepted but not used in the body; it is presumably
    supplied by a parametrization shared with a sibling test -- confirm
    against the decorator, which is outside this view.
    """
    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    rolling_avg = alltypes.float_col.mean().over(window)
    expr = alltypes.mutate(win_avg=rolling_avg)

    # Only the compile step is expected to fail, so it alone sits inside the
    # context manager.
    with pytest.raises(ValueError):
        expr.compile()
Exemplo n.º 4
0
def test_udaf_window_interval():
    """Check ``my_mean`` over a grouped two-day trailing range window.

    The ibis result is compared with a pandas computation that groups by
    ``key`` and takes a ``'2D'`` rolling mean with ``closed='both'`` on the
    ``time`` index.
    """
    # Five daily timestamps in descending order (2019-01-05 .. 2019-01-01).
    times = pd.date_range(start='20190105', end='20190101', freq='-1D')
    columns = collections.OrderedDict(
        [
            ("time", times),
            ("key", [1, 2, 1, 2, 1]),
            ("value", np.arange(5)),
        ]
    )
    df = pd.DataFrame(columns)

    t = connect({'df': df}).table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by='time',
        group_by='key',
    )
    expr = t.mutate(rolled=my_mean(t.value).over(window))

    sort_keys = ['time', 'key']
    result = expr.execute().sort_values(sort_keys).reset_index(drop=True)

    def rolled_reference(frame):
        # Per-key rolling mean; drop the group level so the result aligns
        # with the time index of the frame it is assigned to.
        per_key = frame.groupby('key').value.rolling('2D', closed='both')
        return per_key.mean().reset_index(level=0, drop=True)

    by_time = df.sort_values(sort_keys).set_index('time')
    expected = by_time.assign(rolled=rolled_reference).reset_index(drop=False)

    tm.assert_frame_equal(result, expected)
Exemplo n.º 5
0
def test_combine_window_with_interval_offset(alltypes):
    """Check ``combine`` on range windows that use interval offsets.

    In both cases asserted here, the combined window is expected to equal a
    window with the first operand's interval and an ``order_by`` listing
    both operands' columns.
    """
    col_e = alltypes.e
    col_f = alltypes.f

    # Trailing windows: 3 days combined with 4 days.
    trailing_three = ibis.trailing_range_window(
        preceding=ibis.interval(days=3), order_by=col_e
    )
    trailing_four = ibis.trailing_range_window(
        preceding=ibis.interval(days=4), order_by=col_f
    )
    combined_trailing = trailing_three.combine(trailing_four)
    expected_trailing = ibis.trailing_range_window(
        preceding=ibis.interval(days=3), order_by=[col_e, col_f]
    )
    assert_equal(combined_trailing, expected_trailing)

    # Following windows: 5 days combined with 7 days.
    following_five = ibis.range_window(
        following=ibis.interval(days=5), order_by=col_e
    )
    following_seven = ibis.range_window(
        following=ibis.interval(days=7), order_by=col_f
    )
    expected_following = ibis.range_window(
        following=ibis.interval(days=5), order_by=[col_e, col_f]
    )
    combined_following = following_five.combine(following_seven)
    assert_equal(combined_following, expected_following)
Exemplo n.º 6
0
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Expect ``ValueError`` when compiling a mean over this trailing window.

    ``value`` is part of the signature but is not read in the body.
    """
    ordering = alltypes.timestamp_col
    trailing = ibis.trailing_range_window(preceding=preceding, order_by=ordering)
    windowed_mean = alltypes.float_col.mean().over(trailing)
    expr = alltypes.mutate(win_avg=windowed_mean)

    # Building the expression must succeed; only compile() should raise.
    with pytest.raises(ValueError):
        expr.compile()
Exemplo n.º 7
0
def test_trailing_range_window_unsupported(alltypes, preceding, value):
    """Expect ``ValueError`` from compiling a mean over this trailing window.

    The test is skipped when ``IBIS_VERSION <= IBIS_1_VERSION``.  ``value``
    is part of the signature but is not read in the body.
    """
    needs_newer_ibis = IBIS_VERSION <= IBIS_1_VERSION
    if needs_newer_ibis:
        pytest.skip("requires ibis 2.x")

    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    windowed_mean = alltypes.float_col.mean().over(window)
    expr = alltypes.mutate(win_avg=windowed_mean)

    # Only the compile step is expected to raise.
    with pytest.raises(ValueError):
        expr.compile()
Exemplo n.º 8
0
def test_trailing_range_window(alltypes, preceding, value, project_id):
    """Compare the compiled SQL of a trailing-range-window mean to a template.

    ``value`` is interpolated as the ``RANGE BETWEEN ... PRECEDING`` bound
    and ``project_id`` as the prefix of the table name in the expected text.
    """
    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    windowed_mean = alltypes.float_col.mean().over(window)
    compiled = alltypes.mutate(win_avg=windowed_mean).compile()

    # The expected text is compared verbatim, so its lines start at column 0.
    expected = f"""\
SELECT *,
       avg(`float_col`) OVER (ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN {value} PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{project_id}.testing.functional_alltypes`"""  # noqa: E501
    assert compiled == expected
Exemplo n.º 9
0
def test_trailing_range_window(alltypes, preceding, value, project_id):
    """Compare the compiled SQL of a trailing-range-window mean to a template.

    The template has two positional slots, filled with ``value`` (the
    ``PRECEDING`` bound) and ``project_id`` (the table-name prefix), in that
    order.
    """
    window = ibis.trailing_range_window(
        preceding=preceding,
        order_by=alltypes.timestamp_col,
    )
    windowed_mean = alltypes.float_col.mean().over(window)
    compiled = alltypes.mutate(win_avg=windowed_mean).compile()

    # The expected text is compared verbatim, so its lines start at column 0.
    template = """\
SELECT *,
       avg(`float_col`) OVER (ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN {} PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{}.testing.functional_alltypes`"""  # noqa: E501
    expected = template.format(value, project_id)
    assert compiled == expected
Exemplo n.º 10
0
    def setup(self):
        """Build the source DataFrame and attach derived expressions to self.

        The frame has ``30 * int(2e5)`` rows and is kept on ``self.data``;
        it is also exposed through ``ibis.pandas.connect`` as table ``df``.
        Every expression derived from that table is stored as an attribute
        on ``self``.  Only the expressions are constructed here; ``execute``
        is never called in this method.

        Three UDFs are defined locally with the ``udf`` decorators: a
        reduction mean, an analytic z-score and a two-argument weighted
        mean.  They are applied over the low- and high-cardinality windows.
        """
        n = 30 * int(2e5)
        self.data = pd.DataFrame({
            'key':
            np.random.choice(16000, size=n),
            'low_card_key':
            np.random.choice(30, size=n),
            'value':
            np.random.rand(n),
            'timestamps':
            pd.date_range(start='now', periods=n, freq='s').values,
            'timestamp_strings':
            pd.date_range(start='now', periods=n, freq='s').values.astype(str),
            # 30 daily timestamps, each repeated so the column has n entries.
            'repeated_timestamps':
            pd.date_range(start='2018-09-01', periods=30).repeat(n // 30),
        })

        t = ibis.pandas.connect({'df': self.data}).table('df')

        # Aggregation grouped by the 16000-value key.
        self.high_card_group_by = t.groupby(
            t.key).aggregate(avg_value=t.value.mean())

        # Date casts from a timestamp column and from its string form.
        self.cast_to_dates = t.timestamps.cast(dt.date)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

        # Two-key aggregation where one key comes from a mutate.
        self.multikey_group_by_with_mutate = (t.mutate(
            dates=t.timestamps.cast('date')).groupby(
                ['low_card_key',
                 'dates']).aggregate(avg_value=lambda t: t.value.mean()))

        # Sorting the full table and column projections of it.
        self.simple_sort = t.sort_by([t.key])

        self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        self.multikey_sort_projection = t[['low_card_key', 'key', 'value'
                                           ]].sort_by(['low_card_key', 'key'])

        # Grouped trailing range windows built from a two-day interval and
        # ordered by the repeated daily timestamps.
        low_card_rolling_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        self.low_card_grouped_rolling = t.value.mean().over(
            low_card_rolling_window)

        high_card_rolling_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        self.high_card_grouped_rolling = t.value.mean().over(
            high_card_rolling_window)

        # Reduction UDF: mean of one double column.
        @udf.reduction(['double'], 'double')
        def my_mean(series):
            return series.mean()

        self.low_card_grouped_rolling_udf_mean = my_mean(
            t.value).over(low_card_rolling_window)
        self.high_card_grouped_rolling_udf_mean = my_mean(
            t.value).over(high_card_rolling_window)

        # Analytic UDF: z-score of one double column.
        @udf.analytic(['double'], 'double')
        def my_zscore(series):
            return (series - series.mean()) / series.std()

        # Windows that only specify a grouping key.
        low_card_window = ibis.window(group_by=t.low_card_key)

        high_card_window = ibis.window(group_by=t.key)

        self.low_card_window_analytics_udf = my_zscore(
            t.value).over(low_card_window)
        self.high_card_window_analytics_udf = my_zscore(
            t.value).over(high_card_window)

        # Reduction UDF with two inputs: mean of ``v`` weighted by ``w``.
        @udf.reduction(['double', 'double'], 'double')
        def my_wm(v, w):
            return np.average(v, weights=w)

        self.low_card_grouped_rolling_udf_wm = my_wm(
            t.value, t.value).over(low_card_rolling_window)

        # The high-cardinality variant must use the high-cardinality window;
        # it previously reused the low-cardinality one, which made this
        # expression a duplicate of the one above.
        self.high_card_grouped_rolling_udf_wm = my_wm(
            t.value, t.value).over(high_card_rolling_window)
Exemplo n.º 11
0
def high_card_rolling_window(t):
    """Return a trailing range window over ``t`` grouped by ``t.key``.

    The window is built from a two-day ``ibis.interval`` and is ordered by
    ``t.repeated_timestamps``.
    """
    two_days = ibis.interval(days=2)
    window = ibis.trailing_range_window(
        two_days,
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    return window