def test_window_with_mlb():
    """Check trailing windows built from ``rows_with_max_lookback``.

    A window created from ``rows_with_max_lookback(5, interval(days=10))``,
    ordered by ``time`` and grouped by ``b``, is compared against a pandas
    rolling computation that sums the last five values of each ``'10d'``
    window.  Passing the plain integer ``10`` instead of an interval is then
    expected to raise ``IbisInputError``.
    """
    dates = pd.date_range('20170501', '20170507')
    values = np.random.randn(len(dates), 3)
    df = pd.DataFrame(values, columns=list('abc'), index=dates)
    df = df.rename_axis('time').reset_index(drop=False)

    client = Backend().connect({'df': df})
    t = client.table('df')

    interval_rows = rows_with_max_lookback(5, ibis.interval(days=10))

    def grouped_trailing_sum(table):
        # The window is built lazily, when ``mutate`` invokes this callable.
        window = ibis.trailing_window(
            interval_rows, order_by='time', group_by='b'
        )
        return table.a.sum().over(window)

    result = t.mutate(sum=grouped_trailing_sum).execute()

    indexed = df.set_index('time')
    rolling = indexed.groupby(['b'])['a'].rolling('10d', closed='both')
    last_five_sums = (
        rolling.apply(lambda s: s.iloc[-5:].sum(), raw=False)
        .sort_index(level=['time'])
        .reset_index(drop=True)
    )
    expected = indexed.reset_index(drop=False).assign(sum=last_five_sums)
    tm.assert_frame_equal(result, expected)

    integer_rows = rows_with_max_lookback(5, 10)

    def ungrouped_trailing_sum(table):
        window = ibis.trailing_window(integer_rows, order_by='time')
        return table.a.sum().over(window)

    with pytest.raises(com.IbisInputError):
        t.mutate(sum=ungrouped_trailing_sum)
def test_udaf_groupby():
    """Aggregate the two-argument ``my_corr`` UDAF per ``key`` group.

    The result is compared with the per-group ``Series.corr`` computed
    directly in pandas.
    """

    def mixed_floats():
        # Four fixed values followed by three random ones.
        return np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()

    df = pd.DataFrame(
        {
            'a': mixed_floats(),
            'b': mixed_floats(),
            'key': list('ddeefff'),
        }
    )
    con = Backend().connect({'df': df})
    t = con.table('df')

    expr = t.groupby(t.key).aggregate(my_corr=my_corr(t.a, t.b))
    assert isinstance(expr, ir.TableExpr)

    result = expr.execute().sort_values('key')

    by_key = df.set_index('key')
    keys = list('def')
    correlations = [
        by_key.loc[k, 'a'].corr(by_key.loc[k, 'b']) for k in keys
    ]
    expected = pd.DataFrame({'key': keys, 'my_corr': correlations})

    columns = ['key', 'my_corr']
    tm.assert_frame_equal(result[columns], expected[columns])
def test_udaf_window_interval():
    """Apply the ``my_mean`` UDAF over a 2-day trailing range window.

    The window is ordered by ``time`` and grouped by ``key``; the expected
    values come from a pandas ``'2D'`` rolling mean with ``closed='both'``.
    """
    columns = collections.OrderedDict()
    # Timestamps are generated in descending order (negative frequency).
    columns["time"] = pd.date_range(
        start='20190105', end='20190101', freq='-1D'
    )
    columns["key"] = [1, 2, 1, 2, 1]
    columns["value"] = np.arange(5)
    df = pd.DataFrame(columns)

    con = Backend().connect({'df': df})
    t = con.table('df')

    window = ibis.trailing_range_window(
        ibis.interval(days=2), order_by='time', group_by='key'
    )
    expr = t.mutate(rolled=my_mean(t.value).over(window))
    result = (
        expr.execute().sort_values(['time', 'key']).reset_index(drop=True)
    )

    def rolling_mean(frame):
        per_key = frame.groupby('key').value.rolling('2D', closed='both')
        return per_key.mean().reset_index(level=0, drop=True)

    expected = (
        df.sort_values(['time', 'key'])
        .set_index('time')
        .assign(rolled=rolling_mean)
        .reset_index(drop=False)
    )
    tm.assert_frame_equal(result, expected)
def test_nullif_inf():
    """Chained ``nullif`` calls should turn both ``inf`` and ``-inf`` into NaN."""
    df = pd.DataFrame({'a': [np.inf, 3.14, -np.inf, 42.0]})
    t = Backend().connect({'t': df}).table('t')

    without_infinities = t.a.nullif(np.inf).nullif(-np.inf)
    result = without_infinities.execute()

    expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name='a')
    tm.assert_series_equal(result, expected)
def test_project_list_scalar():
    """A list-valued quantile projected via ``mutate`` is repeated on every row."""
    df = pd.DataFrame({'ints': range(3)})
    table = Backend().connect({'df': df}).table('df')

    quantiles = table.ints.quantile([0.5, 0.95])
    result = table.mutate(res=quantiles).execute()

    # One fresh [0.5, 0.95]-quantile list per input row.
    expected = pd.Series([[1.0, 1.9] for _ in range(3)], name='res')
    tm.assert_series_equal(result.res, expected)
def test_interval_arithmetic(op, expected):
    """Apply a binary ``op`` to a timedelta column and compare with pandas.

    ``op`` is called with two column expressions; ``expected`` is called with
    the raw timedelta data to build the reference series.  Both are
    presumably supplied by a parametrization that is not visible here.
    """
    data = pd.timedelta_range('0 days', '10 days', freq='D')
    tables = {name: pd.DataFrame({'td': data}) for name in ('df1', 'df2')}
    con = Backend().connect(tables)
    t1 = con.table('df1')

    result = op(t1.td, t1.td).execute()

    expected_series = pd.Series(expected(data, data), name='td')
    tm.assert_series_equal(result, expected_series)
def test_multiple_argument_udaf_window():
    """Run a two-argument reduction UDF over several trailing windows.

    The weight column ``d`` is all ones, so the weighted average is compared
    against plain pandas rolling means.
    """
    # PR 2035

    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    def ramp_then_random():
        return np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist()

    df = pd.DataFrame(
        {
            'a': np.arange(4, 0, dtype=float, step=-1).tolist()
            + np.random.rand(3).tolist(),
            'b': ramp_then_random(),
            'c': ramp_then_random(),
            'd': np.repeat(1, 7),
            'key': list('deefefd'),
        }
    )
    con = Backend().connect({'df': df})
    t = con.table('df')

    window_by_a = ibis.trailing_window(2, order_by='a', group_by='key')
    window_by_b = ibis.trailing_window(1, order_by='b', group_by='key')
    expr = t.mutate(
        wm_b=my_wm(t.b, t.d).over(window_by_a),
        wm_c=my_wm(t.c, t.d).over(window_by_a),
        wm_c2=my_wm(t.c, t.d).over(window_by_b),
    )
    result = expr.execute().sort_values(['key', 'a'])

    def rolling_mean(column, size):
        """Build an ``assign`` callable: per-key rolling mean of ``column``."""

        def compute(frame):
            return (
                frame.groupby('key')[column]
                .rolling(size, min_periods=1)
                .mean()
                .reset_index(level=0, drop=True)
            )

        return compute

    # trailing_window(2) is matched by a 3-row rolling window, and
    # trailing_window(1) by a 2-row one.
    expected = df.sort_values(['key', 'a']).assign(
        wm_b=rolling_mean('b', 3),
        wm_c=rolling_mean('c', 3),
    )
    # wm_c2 is ordered by 'b', so compute it under that ordering and then
    # restore the ('key', 'a') order used for the comparison.
    expected = expected.sort_values(['key', 'b']).assign(
        wm_c2=rolling_mean('c', 2)
    )
    expected = expected.sort_values(['key', 'a'])

    tm.assert_frame_equal(result, expected)
def test_select_on_unambiguous_join(how, func):
    """Select columns from a ``{how}_join`` whose column names do not clash.

    ``func`` turns the join expression into the expression under test; the
    result must match the ``a0``/``a1`` columns of the equivalent
    ``pd.merge``.
    """
    df_t = pd.DataFrame({'a0': [1, 2, 3], 'b1': list("aab")})
    df_s = pd.DataFrame({'a1': [2, 3, 4], 'b2': list("abc")})
    con = Backend().connect({"t": df_t, "s": df_s})
    t, s = con.table("t"), con.table("s")

    join_method = getattr(t, f"{how}_join")
    join = join_method(s, t.b1 == s.b2)

    merged = pd.merge(
        df_t, df_s, left_on=["b1"], right_on=["b2"], how=how
    )
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
def test_udaf_window_nan():
    """Apply the ``my_mean`` UDAF over a trailing window containing NaNs.

    Column ``b`` alternates between ``3.0`` and NaN.  The expected values are
    computed with a 3-row pandas rolling window (``min_periods=1``) whose
    ``apply`` receives raw ndarrays, so ``x.mean()`` is ``ndarray.mean``.
    """
    df = pd.DataFrame(
        {
            'a': np.arange(10, dtype=float),
            # ``np.nan`` rather than ``np.NaN``: the capitalised alias was
            # removed in NumPy 2.0.
            'b': [3.0, np.nan] * 5,
            'key': list('ddeefffggh'),
        }
    )
    con = Backend().connect({'df': df})
    t = con.table('df')

    window = ibis.trailing_window(2, order_by='a', group_by='key')
    expr = t.mutate(rolled=my_mean(t.b).over(window))
    result = expr.execute().sort_values(['key', 'a'])

    expected = df.sort_values(['key', 'a']).assign(
        rolled=lambda d: d.groupby('key')
        .b.rolling(3, min_periods=1)
        .apply(lambda x: x.mean(), raw=True)
        .reset_index(level=0, drop=True)
    )
    tm.assert_frame_equal(result, expected)
def test_select_on_unambiguous_asof_join(func):
    """Select columns from an as-of join whose column names do not clash.

    ``func`` turns the join expression into the expression under test; the
    result must match the ``a0``/``a1`` columns of ``pd.merge_asof``.
    """
    df_t = pd.DataFrame(
        {'a0': [1, 2, 3], 'b1': pd.date_range("20180101", periods=3)}
    )
    df_s = pd.DataFrame(
        {'a1': [2, 3, 4], 'b2': pd.date_range("20171230", periods=3)}
    )
    con = Backend().connect({"t": df_t, "s": df_s})
    t, s = con.table("t"), con.table("s")

    join = t.asof_join(s, t.b1 == s.b2)

    merged = pd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not expected.empty

    result = func(join).execute()
    tm.assert_frame_equal(result, expected)
def test_window_with_preceding_expr(index):
    """Trailing window whose size is an interval expression (``3 * day``).

    ``index`` is called with the timestamps to build the frame's index.  The
    windowed mean must match a pandas ``'3d'`` rolling mean with
    ``closed='both'``.
    """
    timestamps = pd.date_range('20180101', '20180110')
    first_value = 2
    values = np.arange(first_value, first_value + len(timestamps))
    df = pd.DataFrame(
        {'value': values, 'time': timestamps}, index=index(timestamps)
    )
    client = Backend().connect({'df': df})
    t = client.table('df')

    rolling_mean = (
        df.set_index('time').value.rolling('3d', closed='both').mean()
    )
    expected = rolling_mean.reset_index(drop=True)
    expected.index.name = None

    one_day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * one_day, order_by=t.time)
    result = t.value.mean().over(window).execute()

    tm.assert_series_equal(result, expected)
def test_window_has_pre_execute_scope():
    """Count how often a ``pre_execute`` rule for ``ops.Lag`` is invoked.

    A counting rule is registered for ``(ops.Lag, Backend)`` and a windowed
    ``lag`` expression is executed; the rule is expected to run three times.
    """
    signature = (ops.Lag, Backend)
    call_count = 0

    @pre_execute.register(*signature)
    def counting_pre_execute(op, client, **kwargs):
        nonlocal call_count
        call_count += 1
        return Scope()

    df = pd.DataFrame(
        {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')},
        columns=['key', 'value', 'dup'],
    )
    t = Backend().connect({'df': df}).table('df')

    lagged = t.key.lag(1).over(ibis.window(order_by='value')).name('foo')
    result = lagged.execute()
    assert result is not None

    # once in window op at the top to pickup any scope changes before computing
    # twice in window op when calling execute on the ops.Lag node at the
    # beginning of execute and once before the actual computation
    assert call_count == 3