def test_combine_window_with_max_lookback():
    w1 = ibis.trailing_window(rows_with_max_lookback(3, ibis.interval(days=5)))
    w2 = ibis.trailing_window(rows_with_max_lookback(5, ibis.interval(days=7)))
    w3 = w1.combine(w2)
    expected = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5))
    )
    assert_equal(w3, expected)
def test_scope_look_up():
    # Test that the scope looks up items properly.
    scope = Scope()
    one_day = ibis.interval(days=1).op()
    one_hour = ibis.interval(hours=1).op()
    scope = scope.merge_scope(Scope({one_day: 1}, None))
    assert scope.get_value(one_hour) is None
    assert scope.get_value(one_day) is not None
def test_timestamp_scalar_in_filter(alltypes, translate):
    table = alltypes

    expr = table.filter([
        table.timestamp_col < (ibis.timestamp('2010-01-01') +
                               ibis.interval(weeks=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()
    expr.execute()
def setup(self):
    n = 30 * int(2e5)
    data = pd.DataFrame(
        {
            'key': np.random.choice(16000, size=n),
            'low_card_key': np.random.choice(30, size=n),
            'value': np.random.rand(n),
            'timestamps': pd.date_range(
                start='now', periods=n, freq='s'
            ).values,
            'timestamp_strings': pd.date_range(
                start='now', periods=n, freq='s'
            ).values.astype(str),
            'repeated_timestamps': pd.date_range(
                start='2018-09-01', periods=30
            ).repeat(int(n / 30)),
        }
    )

    t = ibis.pandas.connect({'df': data}).table('df')

    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean()
    )

    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    self.multikey_group_by_with_mutate = (
        t.mutate(dates=t.timestamps.cast('date'))
        .groupby(['low_card_key', 'dates'])
        .aggregate(avg_value=lambda t: t.value.mean())
    )

    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    self.multikey_sort_projection = t[
        ['low_card_key', 'key', 'value']
    ].sort_by(['low_card_key', 'key'])

    low_card_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.low_card_key,
    )
    self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

    high_card_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
def test_timestamp_scalar_in_filter(alltypes):
    # #310
    table = alltypes

    expr = table.filter([
        table.timestamp_col < (ibis.timestamp('2010-01-01') +
                               ibis.interval(months=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()
    expr.execute()
def test_timestamp_scalar_in_filter(alltypes, translate):
    table = alltypes

    expr = table.filter(
        [
            table.timestamp_col
            < (ibis.timestamp('2010-01-01') + ibis.interval(weeks=3)),
            table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
        ]
    ).count()
    expr.execute()
def test_literal_equality_interval():
    a = ibis.interval(seconds=1).op()
    b = ibis.interval(minutes=1).op()
    assert a != b

    # Currently these don't compare equal, but perhaps they should?
    c = ibis.interval(seconds=60).op()
    d = ibis.interval(minutes=1).op()
    assert c != d
def test_decimal_timestamp_builtins(con):
    table = con.table('tpch_lineitem')

    dc = table.l_quantity
    ts = table.l_receiptdate.cast('timestamp')

    exprs = [
        dc % 10,
        dc + 5,
        dc + dc,
        dc / 2,
        dc * 2,
        dc**2,
        dc.cast('double'),
        api.where(table.l_discount > 0, dc * table.l_discount, api.NA),
        dc.fillna(0),
        ts < (ibis.now() + ibis.interval(months=3)),
        ts < (ibis.timestamp('2005-01-01') + ibis.interval(months=3)),
        # hashing
        dc.hash(),
        ts.hash(),
        # truncate
        ts.truncate('y'),
        ts.truncate('q'),
        ts.truncate('month'),
        ts.truncate('d'),
        ts.truncate('w'),
        ts.truncate('h'),
        ts.truncate('minute'),
    ]

    timestamp_fields = [
        'years',
        'months',
        'days',
        'hours',
        'minutes',
        'seconds',
        'weeks',
    ]
    for field in timestamp_fields:
        if hasattr(ts, field):
            exprs.append(getattr(ts, field)())

        offset = ibis.interval(**{field: 2})
        exprs.append(ts + offset)
        exprs.append(ts - offset)

    proj_exprs = [expr.name('e%d' % i) for i, expr in enumerate(exprs)]

    projection = table[proj_exprs].limit(10)
    projection.execute()
def setup(self):
    n = 30 * int(2e5)
    data = pd.DataFrame({
        'key': np.random.choice(16000, size=n),
        'low_card_key': np.random.choice(30, size=n),
        'value': np.random.rand(n),
        'timestamps': pd.date_range(
            start='now', periods=n, freq='s'
        ).values,
        'timestamp_strings': pd.date_range(
            start='now', periods=n, freq='s'
        ).values.astype(str),
        'repeated_timestamps': pd.date_range(
            start='2018-09-01', periods=30
        ).repeat(int(n / 30)),
    })

    t = ibis.pandas.connect({'df': data}).table('df')

    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean()
    )

    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    self.multikey_group_by_with_mutate = (
        t.mutate(dates=t.timestamps.cast('date'))
        .groupby(['low_card_key', 'dates'])
        .aggregate(avg_value=lambda t: t.value.mean())
    )

    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    self.multikey_sort_projection = t[
        ['low_card_key', 'key', 'value']
    ].sort_by(['low_card_key', 'key'])

    low_card_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.low_card_key,
    )
    self.low_card_grouped_rolling = t.value.mean().over(low_card_window)

    high_card_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    self.high_card_grouped_rolling = t.value.mean().over(high_card_window)
def test_setting_timecontext_in_scope(time_table, time_df3):
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')
    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    """
    In the following expression, the Selection node is executed first and
    fetches the table in the context ('20170105', '20170111'). During window
    execution the table is executed again with a larger context, adjusted by
    the window's preceding days: ('20170102', '20170111'). To get the correct
    result, the cached table result computed with the smaller context must be
    discarded and replaced by the one computed over the larger time range.
    """
    expr = time_table.mutate(value=time_table['value'].mean().over(window1))
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result["value"], expected_win_1)
def test_window_rows_with_max_lookback(con):
    t = con.table('alltypes')
    mlb = rows_with_max_lookback(3, ibis.interval(days=3))
    w = ibis.trailing_window(mlb, order_by=t.i)
    expr = t.a.sum().over(w)
    with pytest.raises(NotImplementedError):
        ImpalaCompiler.to_sql(expr)
def test_context_adjustment_asof_join(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1),
    )[time_keyed_left, time_keyed_right.other_value]
    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # compare with asof_join of manually trimmed tables
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]
    ]
    trimmed_df2 = time_keyed_df2[
        time_keyed_df2['time'] >= context[0] - Timedelta(days=4)
    ][time_keyed_df2['time'] < context[1]]
    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
def test_context_adjustment_window_groupby_id(time_table, time_df3):
    """Exercise the trim_window_result method in dask/execution/window.py
    to check that it trims a Series correctly when groupby parameters are
    present.
    """
    expected = (
        time_df3.compute()
        .set_index('time')
        .groupby('id')
        .value.rolling('3d', closed='both')
        .mean()
    )
    # This is a MultiIndexed Series
    expected = expected.reset_index()
    expected = expected[expected.time >= Timestamp('20170105')].reset_index(
        drop=True
    )['value']

    context = Timestamp('20170105'), Timestamp('20170111')

    # expected.index.name = None
    window = ibis.trailing_window(
        3 * ibis.interval(days=1), group_by='id', order_by=time_table.time
    )
    expr = time_table['value'].mean().over(window)
    # result should adjust time context accordingly
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
def test_udaf_window_interval():
    df = pd.DataFrame(
        collections.OrderedDict([
            (
                "time",
                pd.date_range(start='20190105', end='20190101', freq='-1D'),
            ),
            ("key", [1, 2, 1, 2, 1]),
            ("value", np.arange(5)),
        ])
    )

    con = connect({'df': df})
    t = con.table('df')
    window = ibis.trailing_range_window(
        ibis.interval(days=2), order_by='time', group_by='key'
    )

    expr = t.mutate(rolled=my_mean(t.value).over(window))

    result = (
        expr.execute().sort_values(['time', 'key']).reset_index(drop=True)
    )
    expected = (
        df.sort_values(['time', 'key'])
        .set_index('time')
        .assign(
            rolled=lambda df: df.groupby('key')
            .value.rolling('2D', closed='both')
            .mean()
            .reset_index(level=0, drop=True)
        )
    ).reset_index(drop=False)

    tm.assert_frame_equal(result, expected)
def test_window_with_mlb():
    index = pd.date_range('20170501', '20170507')
    data = np.random.randn(len(index), 3)
    df = (
        pd.DataFrame(data, columns=list('abc'), index=index)
        .rename_axis('time')
        .reset_index(drop=False)
    )
    client = Backend().connect({'df': df})
    t = client.table('df')
    rows_with_mlb = rows_with_max_lookback(5, ibis.interval(days=10))
    expr = t.mutate(
        sum=lambda df: df.a.sum().over(
            ibis.trailing_window(rows_with_mlb, order_by='time', group_by='b')
        )
    )
    result = expr.execute()

    expected = df.set_index('time')
    gb_df = (
        expected.groupby(['b'])['a']
        .rolling('10d', closed='both')
        .apply(lambda s: s.iloc[-5:].sum(), raw=False)
        .sort_index(level=['time'])
        .reset_index(drop=True)
    )
    expected = expected.reset_index(drop=False).assign(sum=gb_df)
    tm.assert_frame_equal(result, expected)

    rows_with_mlb = rows_with_max_lookback(5, 10)
    with pytest.raises(com.IbisInputError):
        t.mutate(
            sum=lambda df: df.a.sum().over(
                ibis.trailing_window(rows_with_mlb, order_by='time')
            )
        )
def prep_311_data(file):
    catalog = intake_civis.open_redshift_catalog()
    expr = catalog.public.import311.to_ibis()
    recent_srs = expr[
        (expr.createddate > (ibis.now() - ibis.interval(months=6)))
        & (expr.requesttype != "Homeless Encampment")
    ]
    df = recent_srs.execute()
    df.to_csv(file, index=False)
def test_where_analyze_scalar_op(functional_alltypes):
    # root cause of #310
    table = functional_alltypes

    expr = table.filter([
        table.timestamp_col < (ibis.timestamp('2010-01-01') +
                               ibis.interval(months=3)),
        table.timestamp_col < (ibis.now() + ibis.interval(days=10)),
    ]).count()

    result = Compiler.to_sql(expr)
    expected = """\
SELECT count(*) AS `count`
FROM functional_alltypes
WHERE (`timestamp_col` < date_add(cast({} as timestamp), INTERVAL 3 MONTH)) AND
      (`timestamp_col` < date_add(cast(now() as timestamp), INTERVAL 10 DAY))"""  # noqa: E501
    assert result == expected.format("'2010-01-01 00:00:00'")
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Test `adjust_context` function that completely shifts the context.

    This results in an adjusted context that is NOT a subset of the
    original context. This is unlike an `adjust_context` function that
    only expands the context. See #3104
    """

    # Create a contrived `adjust_context` function for
    # CustomAsOfJoin to mock this.
    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        timecontext: TimeContext,
        scope: Optional[Scope] = None,
    ) -> TimeContext:
        """Shifts both the begin and end in the same direction."""
        begin, end = timecontext
        timedelta = execute(op.tolerance)
        return (begin - timedelta, end - timedelta)

    expr = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    ).to_expr()
    expr = expr[time_keyed_left, time_keyed_right.other_value]
    context = (pd.Timestamp('20170101'), pd.Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables
    # Left table: No shift for context
    # Right table: Shift both begin and end of context by 4 days
    trimmed_df1 = time_keyed_df1[time_keyed_df1['time'] >= context[0]][
        time_keyed_df1['time'] < context[1]
    ]
    trimmed_df2 = time_keyed_df2[
        time_keyed_df2['time'] >= context[0] - pd.Timedelta(days=4)
    ][time_keyed_df2['time'] < context[1] - pd.Timedelta(days=4)]
    expected = pd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('4D'),
    )

    tm.assert_frame_equal(result, expected)
def test_rolling_window_with_mlb(alltypes):
    t = alltypes
    window = ibis.trailing_window(
        preceding=rows_with_max_lookback(3, ibis.interval(days=5)),
        order_by=t.timestamp_col,
    )
    expr = t['double_col'].sum().over(window)
    with pytest.raises(NotImplementedError):
        expr.execute()
def test_complex_window(client):
    """Test a mix of window sizes: window ops that require context
    adjustment (each with its own adjusted context) alongside a non-window
    op that does not adjust the context.
    """
    table = client.table('time_indexed_table')
    context = (
        pd.Timestamp('20170102 07:00:00', tz='UTC'),
        pd.Timestamp('20170105', tz='UTC'),
    )
    window = ibis.trailing_window(
        preceding=ibis.interval(hours=1), order_by='time', group_by='key'
    )
    window2 = ibis.trailing_window(
        preceding=ibis.interval(hours=2), order_by='time', group_by='key'
    )
    window_cum = ibis.cumulative_window(order_by='time', group_by='key')
    # context should be adjusted accordingly for each window
    result_pd = (
        table.mutate(
            count_1h=table['value'].count().over(window),
            count_2h=table['value'].count().over(window2),
            count_cum=table['value'].count().over(window_cum),
        )
        .mutate(count=table['value'].count())
        .execute(timecontext=context)
    )

    df = table.execute()
    expected_win_1h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('1h', closed='both')
        .count()
        .rename('count_1h')
        .astype(int)
    )
    expected_win_2h = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('2h', closed='both')
        .count()
        .rename('count_2h')
        .astype(int)
    )
    expected_cum_win = (
        df.set_index('time')
        .groupby('key')
        .value.expanding()
        .count()
        .rename('count_cum')
        .astype(int)
    )
    df = df.set_index('time')
    df = df.assign(
        count_1h=expected_win_1h.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df = df.assign(
        count_2h=expected_win_2h.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df = df.assign(
        count_cum=expected_cum_win.sort_index(
            level=['time', 'key']
        ).reset_index(level='key', drop=True)
    )
    df['count'] = df.groupby(['key'])['value'].transform('count')
    df = df.reset_index()
    expected = (
        df[df.time.between(*(t.tz_convert(None) for t in context))]
        .sort_values(['key'])
        .reset_index(drop=True)
    )
    tm.assert_frame_equal(result_pd, expected)
def timeunit(transform: dict, expr: ibis.Expr) -> ibis.Expr:
    """
    Apply a vega time unit transform to an ibis expression.

    https://vega.github.io/vega/docs/transforms/timeunit/

    The transform is implemented with the Ibis truncate expression.

    https://docs.ibis-project.org/generated/ibis.expr.api.TimestampValue.truncate.html

    Parameters
    ----------
    transform: dict
        A JSON-able dictionary representing the vega transform.
    expr: ibis.Expr
        The expression to which to apply the transform.

    Returns
    -------
    transformed_expr: the transformed expression
    """
    assert transform.pop("type") == "timeunit"
    field = expr[transform.pop("field")]
    as_start, as_end = transform.pop("as")
    units = transform.pop("units")
    if transform:
        raise NotImplementedError(
            f"timeunit transform: {list(transform)} keys are not supported"
        )
    if units == ["year"]:
        start = field.truncate("Y")
        delta = ibis.interval(years=1)
    elif units == ["year", "month"]:
        start = field.truncate("M")
        delta = ibis.interval(months=1)
    elif units == ["year", "month", "date"]:
        start = field.truncate("D")
        delta = ibis.interval(days=1)
    elif units == ["year", "month", "date", "hours"]:
        start = field.truncate("h")
        delta = ibis.interval(hours=1)
    elif units == ["year", "month", "date", "hours", "minutes"]:
        start = field.truncate("m")
        delta = ibis.interval(minutes=1)
    elif units == ["year", "month", "date", "hours", "minutes", "seconds"]:
        start = field.truncate("s")
        delta = ibis.interval(seconds=1)
    elif units == [
        "year",
        "month",
        "date",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
    ]:
        start = field.truncate("ms")
        delta = ibis.interval(milliseconds=1)
    else:
        raise NotImplementedError(
            f"timeunit transform: {units} units are not supported"
        )
    return expr.mutate([start.name(as_start), (start + delta).name(as_end)])
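# Illustrative only: a minimal usage sketch for the timeunit() helper above,
# assuming it is importable in scope. The table name ('events'), column name
# ('created'), and output field names ('unit_start', 'unit_end') are
# hypothetical placeholders, not part of the original code.
events = ibis.table(
    [('created', 'timestamp'), ('value', 'double')], name='events'
)
transform = {
    "type": "timeunit",
    "field": "created",
    "units": ["year", "month"],
    "as": ["unit_start", "unit_end"],
}
# Adds two columns: the month the timestamp falls in ('unit_start') and the
# start of the following month ('unit_end').
events_with_units = timeunit(transform, events)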
def test_multiple_windows(client):
    table = client.table('time_indexed_table')
    window1 = ibis.trailing_window(
        preceding=ibis.interval(hours=1), order_by='time', group_by='key'
    )
    window2 = ibis.trailing_window(
        preceding=ibis.interval(hours=2), order_by='time', group_by='key'
    )
    result = table.mutate(
        mean_1h=table['value'].mean().over(window1),
        mean_2h=table['value'].mean().over(window2),
    ).compile()
    result_pd = result.toPandas()

    df = table.compile().toPandas()
    expected_win_1 = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('1h', closed='both')
        .mean()
        .rename('mean_1h')
    ).reset_index(drop=True)
    expected_win_2 = (
        df.set_index('time')
        .groupby('key')
        .value.rolling('2h', closed='both')
        .mean()
        .rename('mean_2h')
    ).reset_index(drop=True)
    tm.assert_series_equal(result_pd['mean_1h'], expected_win_1)
    tm.assert_series_equal(result_pd['mean_2h'], expected_win_2)
def test_window_equals(alltypes):
    t = alltypes
    w1 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.b)
    w2 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.b)
    assert w1.equals(w2)

    w3 = ibis.window(preceding=1, following=2, group_by=t.a, order_by=t.c)
    assert not w1.equals(w3)

    w4 = ibis.range_window(preceding=ibis.interval(hours=3), group_by=t.d)
    w5 = ibis.range_window(preceding=ibis.interval(hours=3), group_by=t.d)
    assert w4.equals(w5)

    w6 = ibis.range_window(preceding=ibis.interval(hours=1), group_by=t.d)
    assert not w4.equals(w6)

    w7 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5)),
        group_by=t.a,
        order_by=t.b,
    )
    w8 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(days=5)),
        group_by=t.a,
        order_by=t.b,
    )
    assert w7.equals(w8)

    w9 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=5)),
        group_by=t.a,
        order_by=t.b,
    )
    assert not w7.equals(w9)
def test_keyed_asof_join_with_tolerance(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=2 * ibis.interval(days=1)
    )[time_keyed_left, time_keyed_right.other_value]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('2D'),
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def test_context_adjustment_filter_before_window(alltypes, context, ctx_col):
    window = ibis.trailing_window(
        ibis.interval(days=3), order_by=ORDER_BY_COL
    )

    expr = alltypes[alltypes['bool_col']]
    expr = expr.mutate(v1=expr[TARGET_COL].count().over(window))

    result = expr.execute(timecontext=context)

    expected = expr.execute()
    expected = filter_by_time_context(expected, context)
    expected = expected.reset_index(drop=True)

    tm.assert_frame_equal(result, expected)
def test_context_adjustment_multi_window(time_table, time_df3):
    expected_win_1 = (
        time_df3.compute()
        .set_index('time')
        .rename(columns={'value': 'v1'})['v1']
        .rolling('3d', closed='both')
        .mean()
    )
    expected_win_1 = expected_win_1[
        expected_win_1.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    expected_win_2 = (
        time_df3.compute()
        .set_index('time')
        .rename(columns={'value': 'v2'})['v2']
        .rolling('2d', closed='both')
        .mean()
    )
    expected_win_2 = expected_win_2[
        expected_win_2.index >= Timestamp('20170105')
    ].reset_index(drop=True)

    context = Timestamp('20170105'), Timestamp('20170111')
    window1 = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    window2 = ibis.trailing_window(
        2 * ibis.interval(days=1), order_by=time_table.time
    )
    expr = time_table.mutate(
        v1=time_table['value'].mean().over(window1),
        v2=time_table['value'].mean().over(window2),
    )
    result = expr.execute(timecontext=context)

    tm.assert_series_equal(result["v1"], expected_win_1)
    tm.assert_series_equal(result["v2"], expected_win_2)
def test_timestamp_deltas(table, unit, compiled_unit):
    f = '`i`'
    K = 5

    offset = ibis.interval(**{unit: K})

    add_expr = table.i + offset
    result = translate(add_expr)
    assert result == f'date_add({f}, INTERVAL {K} {compiled_unit})'

    sub_expr = table.i - offset
    result = translate(sub_expr)
    assert result == f'date_sub({f}, INTERVAL {K} {compiled_unit})'
def test_replace_window(alltypes):
    t = alltypes
    w1 = ibis.window(preceding=5, following=1, group_by=t.a, order_by=t.b)
    w2 = w1.group_by(t.c)
    expected = ibis.window(
        preceding=5, following=1, group_by=[t.a, t.c], order_by=t.b
    )
    assert_equal(w2, expected)

    w3 = w1.order_by(t.d)
    expected = ibis.window(
        preceding=5, following=1, group_by=t.a, order_by=[t.b, t.d]
    )
    assert_equal(w3, expected)

    w4 = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=3))
    )
    w5 = w4.group_by(t.a)
    expected = ibis.trailing_window(
        rows_with_max_lookback(3, ibis.interval(months=3)), group_by=t.a
    )
    assert_equal(w5, expected)
def test_keyed_asof_join_with_tolerance(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    expr = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=2 * ibis.interval(days=1)
    )[time_keyed_left, time_keyed_right.other_value]
    result = expr.execute()
    expected = pd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=pd.Timedelta('2D'),
    )
    tm.assert_frame_equal(result[expected.columns], expected)
def test_window_with_preceding_expr(index):
    time = pd.date_range('20180101', '20180110')
    start = 2
    data = np.arange(start, start + len(time))
    df = pd.DataFrame({'value': data, 'time': time}, index=index(time))
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    expected = (
        df.set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
        .reset_index(drop=True)
    )
    expected.index.name = None
    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    expr = t.value.mean().over(window)
    result = expr.execute()
    tm.assert_series_equal(result, expected)
def test_context_adjustment_window(time_table, time_df3):
    # trim data manually
    expected = (
        time_df3.set_index('time').value.rolling('3d', closed='both').mean()
    )
    expected = expected[
        expected.index >= pd.Timestamp('20170105')
    ].reset_index(drop=True)

    context = pd.Timestamp('20170105'), pd.Timestamp('20170111')

    # expected.index.name = None
    window = ibis.trailing_window(
        3 * ibis.interval(days=1), order_by=time_table.time
    )
    expr = time_table['value'].mean().over(window)
    # result should adjust time context accordingly
    result = expr.execute(timecontext=context)
    tm.assert_series_equal(result, expected)
def test_adjust_context_scope(client):
    """Test that `adjust_context` has access to `scope` by default."""
    table = client.table('time_indexed_table')

    # WindowOp is the only context-adjusted node that the PySpark backend
    # can compile. Ideally we would test the context adjustment logic for
    # WindowOp itself, but building this test like that would unfortunately
    # affect other tests that involve WindowOp.
    # To avoid that, we'll create a dummy subclass of WindowOp and build the
    # test around that.

    class CustomWindowOp(ops.WindowOp):
        pass

    # Tell the Spark backend compiler it should compile CustomWindowOp just
    # like WindowOp
    compiles(CustomWindowOp)(compile_window_op)

    # Create an `adjust_context` function for this subclass that simply
    # checks that `scope` is passed in.
    @adjust_context.register(CustomWindowOp)
    def adjust_context_window_check_scope(
        op: CustomWindowOp,
        scope: Scope,
        timecontext: TimeContext,
    ) -> TimeContext:
        """Confirms that `scope` is passed in."""
        assert scope is not None
        return timecontext

    # Do an operation that will trigger context adjustment
    # on a CustomWindowOp
    value_count = table['value'].count()
    win = ibis.window(
        ibis.interval(hours=1),
        0,
        order_by='time',
        group_by='key',
    )
    # the argument needs to be pulled out from the alias;
    # any extensions must do the same
    value_count_over_win = CustomWindowOp(value_count.op().arg, win).to_expr()

    expr = table.mutate(value_count_over_win.name('value_count_over_win'))

    context = (pd.Timestamp('20170105'), pd.Timestamp('20170111'))
    expr.execute(timecontext=context)
def test_max_rows_with_lookback_validate(alltypes):
    t = alltypes
    mlb = rows_with_max_lookback(3, ibis.interval(days=5))
    window = ibis.trailing_window(mlb, order_by=t.i)
    t.f.lag().over(window)

    window = ibis.trailing_window(mlb)
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)

    window = ibis.trailing_window(mlb, order_by=t.a)
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)

    window = ibis.trailing_window(mlb, order_by=[t.i, t.a])
    with pytest.raises(com.IbisInputError):
        t.f.lag().over(window)
def test_window_with_preceding_expr():
    index = pd.date_range('20180101', '20180110')
    start = 2
    data = np.arange(start, start + len(index))
    df = pd.DataFrame({'value': data, 'time': index}, index=index)
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    expected = (
        df.set_index('time')
        .value.rolling('3d', closed='both')
        .mean()
        .reset_index(drop=True)
    )
    expected.index.name = None
    day = ibis.interval(days=1)
    window = ibis.trailing_window(3 * day, order_by=t.time)
    expr = t.value.mean().over(window)
    result = expr.execute()
    tm.assert_series_equal(result, expected)
    expr = t.mutate(win_avg=t.float_col.mean().over(w3))
    result = expr.compile()
    expected = """\
SELECT *, avg(`float_col`) OVER (PARTITION BY `year` ORDER BY UNIX_MICROS(`timestamp_col`) RANGE BETWEEN 4 PRECEDING AND 2 PRECEDING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected


@pytest.mark.parametrize(
    ('preceding', 'value'),
    [
        (5, 5),
        (ibis.interval(nanoseconds=1), 0.001),
        (ibis.interval(microseconds=1), 1),
        (ibis.interval(seconds=1), 1000000),
        (ibis.interval(minutes=1), 1000000 * 60),
        (ibis.interval(hours=1), 1000000 * 60 * 60),
        (ibis.interval(days=1), 1000000 * 60 * 60 * 24),
        (2 * ibis.interval(days=1), 1000000 * 60 * 60 * 24 * 2),
        (ibis.interval(weeks=1), 1000000 * 60 * 60 * 24 * 7),
    ],
)
def test_trailing_range_window(alltypes, preceding, value, project_id):
    t = alltypes
    w = ibis.trailing_range_window(
        preceding=preceding, order_by=t.timestamp_col
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w))
    [
        (rlz.list_of(rlz.double, min_length=2), [1]),
        (rlz.list_of(rlz.integer), 1.1),
        (rlz.list_of(rlz.string), 'asd'),
        (rlz.list_of(identity), 3),
    ],
)
def test_invalid_list_of(validator, values):
    with pytest.raises(IbisTypeError):
        validator(values)


@pytest.mark.parametrize(
    ('units', 'value', 'expected'),
    [
        ({'H', 'D'}, ibis.interval(days=3), ibis.interval(days=3)),
        (['Y'], ibis.interval(years=3), ibis.interval(years=3)),
    ],
)
def test_valid_interval(units, value, expected):
    result = rlz.interval(value, units=units)
    assert result.equals(expected)


@pytest.mark.parametrize(
    ('units', 'value', 'expected'),
    [
        ({'Y'}, ibis.interval(hours=1), IbisTypeError),
        ({'Y', 'M', 'D'}, ibis.interval(hours=1), IbisTypeError),
        ({'Q', 'W', 'D'}, ibis.interval(seconds=1), IbisTypeError),
    ],
pytestmark = pytest.mark.pandas


@pytest.fixture(scope='session')
def sort_kind():
    return 'mergesort'


default = pytest.mark.parametrize('default', [ibis.NA, ibis.literal('a')])
row_offset = pytest.mark.parametrize(
    'row_offset', list(map(ibis.literal, [-1, 1, 0]))
)
range_offset = pytest.mark.parametrize(
    'range_offset',
    [
        ibis.interval(days=1),
        2 * ibis.interval(days=1),
        -2 * ibis.interval(days=1),
    ],
)


@pytest.fixture
def row_window():
    return ibis.window(following=0, order_by='plain_int64')


@pytest.fixture
def range_window():
    return ibis.window(following=0, order_by='plain_datetimes_naive')
    array = alltypes.date_string_col.split('/')
    month, day, year = array[0], array[1], array[2]
    date_col = ibis.literal('-').join(['20' + year, month, day]).cast('date')

    with pytest.raises(TypeError):
        date_col + interval


date_value = pd.Timestamp('2017-12-31')
timestamp_value = pd.Timestamp('2018-01-01 18:18:18')


@pytest.mark.parametrize(
    ('expr_fn', 'expected_fn'),
    [
        param(
            lambda t, be: t.timestamp_col + ibis.interval(days=4),
            lambda t, be: t.timestamp_col + pd.Timedelta(days=4),
            id='timestamp-add-interval',
        ),
        param(
            lambda t, be: t.timestamp_col - ibis.interval(days=17),
            lambda t, be: t.timestamp_col - pd.Timedelta(days=17),
            id='timestamp-subtract-interval',
        ),
        param(
            lambda t, be: t.timestamp_col.date() + ibis.interval(days=4),
            lambda t, be: t.timestamp_col.dt.floor('d') + pd.Timedelta(days=4),
            id='date-add-interval',
        ),
        param(
            lambda t, be: t.timestamp_col.date() - ibis.interval(days=14),
def test_comparison_timestamp(alltypes):
    expr = alltypes.i > alltypes.i.min() + ibis.interval(days=3)
    assert isinstance(expr, ir.BooleanColumn)
    s = t.a + t.d
    assert s.type().nullable is True

    s = t.b + t.d
    assert s.type().nullable is True

    s = t.b + t.f
    assert s.type().nullable is False


@pytest.mark.parametrize(
    'base_expr',
    [
        ibis.table([('interval_col', dt.Interval(unit='D'))]).interval_col,
        ibis.interval(seconds=42),
    ],
)
def test_interval_negate(base_expr):
    expr = -base_expr
    expr2 = base_expr.negate()
    expr3 = ibis.negate(base_expr)
    assert isinstance(expr.op(), ops.Negate)
    assert expr.equals(expr2)
    assert expr.equals(expr3)


def test_large_timestamp():
    expr = ibis.timestamp('4567-02-03')
    expected = datetime(year=4567, month=2, day=3)
    result = expr.op().value