def test_window_bind_to_table(t):
    w = ibis.window(group_by="g", order_by=ibis.desc("f"))
    w2 = w.bind(t)
    expected = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))

    assert_equal(w2, expected)
def test_analytic_functions(self):
    t = self.alltypes.limit(1000)

    g = t.group_by("string_col").order_by("double_col")
    f = t.float_col

    exprs = [
        f.lag(),
        f.lead(),
        f.rank(),
        f.dense_rank(),
        f.first(),
        f.last(),
        f.first().over(ibis.window(preceding=10)),
        f.first().over(ibis.window(following=10)),
        ibis.row_number(),
        f.cumsum(),
        f.cummean(),
        f.cummin(),
        f.cummax(),
        # boolean cumulative reductions
        (f == 0).cumany(),
        (f == 0).cumall(),
        f.sum(),
        f.mean(),
        f.min(),
        f.max(),
    ]

    proj_exprs = [expr.name("e%d" % i) for i, expr in enumerate(exprs)]

    proj_table = g.mutate(proj_exprs)
    proj_table.execute()
def test_window_bind_to_table(self):
    w = ibis.window(group_by='g', order_by=ibis.desc('f'))
    w2 = w.bind(self.t)
    expected = ibis.window(group_by=self.t.g, order_by=ibis.desc(self.t.f))

    assert_equal(w2, expected)
def test_window_bind_to_table(alltypes):
    t = alltypes
    w = ibis.window(group_by='g', order_by=ibis.desc('f'))
    w2 = w.bind(alltypes)
    expected = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))

    assert_equal(w2, expected)
def test_window_function_bind(alltypes):
    # GH #532
    t = alltypes
    w = ibis.window(group_by=lambda x: x.g, order_by=lambda x: x.f)
    expr = t.f.lag().over(w)

    actual_window = expr.op().args[1]
    expected = ibis.window(group_by=t.g, order_by=t.f)

    assert_equal(actual_window, expected)
def test_over_auto_bind(self):
    # GH #542
    t = self.t
    w = ibis.window(group_by='g', order_by='f')
    expr = t.f.lag().over(w)

    actual_window = expr.op().args[1]
    expected = ibis.window(group_by=t.g, order_by=t.f)

    assert_equal(actual_window, expected)
def test_over_auto_bind(t):
    # GH #542
    w = ibis.window(group_by="g", order_by="f")
    expr = t.f.lag().over(w)

    actual_window = expr.op().args[1]
    expected = ibis.window(group_by=t.g, order_by=t.f)

    assert_equal(actual_window, expected)
def test_combine_windows(t):
    w1 = ibis.window(group_by=t.g, order_by=t.f)
    w2 = ibis.window(preceding=5, following=5)

    w3 = w1.combine(w2)
    expected = ibis.window(
        group_by=t.g, order_by=t.f, preceding=5, following=5
    )
    assert_equal(w3, expected)

    w4 = ibis.window(group_by=t.a, order_by=t.e)
    w5 = w3.combine(w4)
    expected = ibis.window(
        group_by=[t.g, t.a], order_by=[t.f, t.e], preceding=5, following=5
    )
    assert_equal(w5, expected)
def test_unsupported_aggregate_functions(alltypes, column, op):
    t = alltypes
    w = ibis.window(order_by=t.d)
    expr = getattr(t[column], op)()
    proj = t.projection([expr.over(w).name('foo')])
    with pytest.raises(com.TranslationError):
        to_sql(proj)
def test_partitioned_window(self):
    t = self.alltypes
    df = t.execute()

    window = ibis.window(
        group_by=t.string_col,
        order_by=t.timestamp_col,
        preceding=6,
        following=0,
    )

    def roller(func):
        def rolled(df):
            torder = df.sort_values('timestamp_col')
            rolling = torder.double_col.rolling(7, min_periods=0)
            return getattr(rolling, func)()

        return rolled

    for func in 'mean sum min max'.split():
        f = getattr(t.double_col, func)
        expr = f().over(window).name('double_col')
        result = t.projection([expr]).execute().double_col
        expected = (
            df.groupby('string_col')
            .apply(roller(func))
            .reset_index(drop=True)
        )
        tm.assert_series_equal(result, expected)
def test_window_frame_specs(self):
    t = self.con.table('alltypes')

    ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

    cases = [
        (window(preceding=0),
         'range between current row and unbounded following'),
        (window(following=0),
         'range between unbounded preceding and current row'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
        (window(preceding=5, following=0),
         'rows between 5 preceding and current row'),
        (window(preceding=5, following=2),
         'rows between 5 preceding and 2 following'),
        (window(following=2),
         'rows between unbounded preceding and 2 following'),
        (window(following=2, preceding=0),
         'rows between current row and 2 following'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
        (window(following=[5, 10]),
         'rows between 5 following and 10 following'),
        (window(preceding=[10, 5]),
         'rows between 10 preceding and 5 preceding'),

        # cumulative windows
        (ibis.cumulative_window(),
         'range between unbounded preceding and current row'),

        # trailing windows
        (ibis.trailing_window(10),
         'rows between 10 preceding and current row'),
    ]

    for w, frame in cases:
        w2 = w.order_by(t.f)
        expr = t.projection([t.d.sum().over(w2).name('foo')])
        expected = ex_template.format(frame.upper())
        self._check_sql(expr, expected)
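# A small companion sketch, hedged and based only on the frame strings asserted
# in the test above: the convenience constructors produce the same frames that
# explicit preceding/following arguments spell out. Variable names here are
# illustrative only.
import ibis

w_cum = ibis.cumulative_window()    # RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
w_trail = ibis.trailing_window(10)  # ROWS BETWEEN 10 PRECEDING AND CURRENT ROW
w_explicit = ibis.window(preceding=10, following=0)  # same trailing frame, spelled out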
def test_row_number_properly_composes_with_arithmetic(con):
    t = con.table('alltypes')
    w = ibis.window(order_by=t.f)
    expr = t.mutate(new=ibis.row_number().over(w) / 2)

    expected = """\
SELECT *, (row_number() OVER (ORDER BY `f`) - 1) / 2 AS `new`
FROM alltypes"""
    assert_sql_equal(expr, expected)
def test_window_with_arithmetic(self):
    t = self.alltypes
    w = ibis.window(order_by=t.timestamp_col)
    expr = t.mutate(new_col=ibis.row_number().over(w) / 2)

    df = t.projection(['timestamp_col']).sort_by('timestamp_col').execute()
    expected = df.assign(new_col=[x / 2. for x in range(len(df))])
    result = expr['timestamp_col', 'new_col'].execute()
    tm.assert_frame_equal(result, expected)
def test_row_number_properly_composes_with_arithmetic(alltypes):
    t = alltypes
    w = ibis.window(order_by=t.f)
    expr = t.mutate(new=ibis.row_number().over(w) / 2)

    expected = """\
SELECT *, (row_number() OVER (ORDER BY `f`) - 1) / 2 AS `new`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(expr, expected)
def test_nested_analytic_function(con):
    t = con.table('alltypes')

    w = window(order_by=t.f)
    expr = (t.f - t.f.lag()).lag().over(w).name('foo')
    result = t.projection([expr])
    expected = """\
SELECT lag(`f` - lag(`f`) OVER (ORDER BY `f`)) \
OVER (ORDER BY `f`) AS `foo`
FROM alltypes"""
    assert_sql_equal(result, expected)
def test_nested_analytic_function(alltypes):
    t = alltypes

    w = window(order_by=t.f)
    expr = (t.f - t.f.lag()).lag().over(w).name('foo')
    result = t.projection([expr])
    expected = """\
SELECT lag(`f` - lag(`f`) OVER (ORDER BY `f`)) \
OVER (ORDER BY `f`) AS `foo`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(result, expected)
def test_nested_analytic_function(self):
    t = self.con.table('alltypes')

    w = window(order_by=t.f)
    expr = (t.f - t.f.lag()).lag().over(w).name('foo')
    result = t.projection([expr])
    expected = """\
SELECT lag(f - lag(f) OVER (ORDER BY f)) \
OVER (ORDER BY f) AS `foo`
FROM alltypes"""
    self._check_sql(result, expected)
def test_udaf_analytic_group_by(con, t, df):
    expr = zscore(t.c).over(ibis.window(group_by=t.key))
    assert isinstance(expr, ir.ColumnExpr)

    result = expr.execute()

    def f(s):
        return s.sub(s.mean()).div(s.std())

    expected = df.groupby('key').c.transform(f)
    tm.assert_series_equal(result, expected)
def test_multiple_windows(self):
    t = self.con.table('alltypes')

    w = window(group_by=t.g)

    expr = t.f.sum().over(w) - t.f.sum()
    proj = t.projection([t.g, expr.name('result')])

    expected = """\
SELECT g, sum(f) OVER (PARTITION BY g) - sum(f) OVER () AS `result`
FROM alltypes"""
    self._check_sql(proj, expected)
def test_multiple_windows(alltypes):
    t = alltypes

    w = window(group_by=t.g)

    expr = t.f.sum().over(w) - t.f.sum()
    proj = t.projection([t.g, expr.name('result')])

    expected = """\
SELECT `g`, sum(`f`) OVER (PARTITION BY `g`) - sum(`f`) OVER () AS `result`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)
def test_cumulative_functions(alltypes, cumulative, static):
    t = alltypes
    w = ibis.window(order_by=t.d)

    actual = cumulative(t, w).name('foo')
    expected = static(t, w).over(ibis.cumulative_window()).name('foo')

    expr1 = t.projection(actual)
    expr2 = t.projection(expected)

    assert to_sql(expr1) == to_sql(expr2)
def test_multiple_windows(con):
    t = con.table('alltypes')

    w = window(group_by=t.g)

    expr = t.f.sum().over(w) - t.f.sum()
    proj = t.projection([t.g, expr.name('result')])

    expected = """\
SELECT `g`, sum(`f`) OVER (PARTITION BY `g`) - sum(`f`) OVER () AS `result`
FROM alltypes"""
    assert_sql_equal(proj, expected)
def test_window_function(alltypes, project_id):
    t = alltypes

    w1 = ibis.window(
        preceding=1, following=0, group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w1))
    result = expr.compile()
    expected = """\
SELECT *,
  avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected

    w2 = ibis.window(
        preceding=0, following=2, group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w2))
    result = expr.compile()
    expected = """\
SELECT *,
  avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected

    w3 = ibis.window(
        preceding=(4, 2), group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w3))
    result = expr.compile()
    expected = """\
SELECT *,
  avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN 4 PRECEDING AND 2 PRECEDING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected
def test_window_unbounded(kind, begin, end, expected):
    t = ibis.table([('a', 'int64')], name='t')
    kwargs = {kind: (begin, end)}
    expr = t.a.sum().over(ibis.window(**kwargs))
    result = ibis.bigquery.compile(expr)
    assert (
        result
        == """\
SELECT sum(`a`) OVER (ROWS BETWEEN {}) AS `tmp`
FROM t""".format(
            expected
        )
    )
def test_combine_windows(alltypes):
    t = alltypes
    w1 = ibis.window(group_by=t.g, order_by=t.f)
    w2 = ibis.window(preceding=5, following=5)

    w3 = w1.combine(w2)
    expected = ibis.window(
        group_by=t.g, order_by=t.f, preceding=5, following=5
    )
    assert_equal(w3, expected)

    w4 = ibis.window(group_by=t.a, order_by=t.e)
    w5 = w3.combine(w4)
    expected = ibis.window(
        group_by=[t.g, t.a], order_by=[t.f, t.e], preceding=5, following=5
    )
    assert_equal(w5, expected)

    # Cannot combine windows of varying types.
    w6 = ibis.range_window(preceding=5, following=5)
    with pytest.raises(ibis.common.IbisInputError):
        w1.combine(w6)
def test_rolling_window(self):
    t = self.alltypes
    df = (
        t[['double_col', 'timestamp_col']]
        .execute()
        .sort_values('timestamp_col')
        .reset_index(drop=True)
    )
    window = ibis.window(
        order_by=t.timestamp_col, preceding=6, following=0
    )
    for func in 'mean sum min max'.split():
        f = getattr(t.double_col, func)
        df_f = getattr(df.double_col.rolling(7, min_periods=0), func)
        result = (
            t.projection([f().over(window).name('double_col')])
            .execute()
            .double_col
        )
        expected = df_f()
        tm.assert_series_equal(result, expected)
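# A minimal pandas-only sketch, independent of the test above, illustrating why
# an ibis trailing window with preceding=6 and following=0 is compared against
# pandas' rolling(7, min_periods=0): the frame covers the current row plus the
# six ordered rows before it, i.e. seven rows in total. Names are illustrative.
import pandas as pd

values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
trailing_mean = values.rolling(7, min_periods=0).mean()
print(trailing_mean)  # each entry averages the current row and up to 6 prior rows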
def test_unsupported_aggregate_functions(self):
    t = self.con.table('alltypes')
    w = ibis.window(order_by=t.d)
    exprs = [
        t.f.approx_nunique(),
        t.f.approx_median(),
        t.g.group_concat(),
    ]
    for expr in exprs:
        with self.assertRaises(com.TranslationError):
            proj = t.projection([expr.over(w).name('foo')])
            to_sql(proj)
def test_auto_windowize_analysis_bug(con):
    # GH #544
    t = con.table("airlines")

    def metric(x):
        return x.arrdelay.mean().name("avg_delay")

    annual_delay = (
        t[t.dest.isin(["JFK", "SFO"])]
        .group_by(["dest", "year"])
        .aggregate(metric)
    )
    what = annual_delay.group_by("dest")
    enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean())

    expr = (
        annual_delay.avg_delay.mean()
        .name("grand_avg")
        .over(ibis.window(group_by=annual_delay.dest))
    )
    expected = annual_delay[annual_delay, expr]

    assert_equal(enriched, expected)
def test_auto_windowize_analysis_bug(self):
    # GH #544
    t = self.con.table('airlines')

    annual_delay = (t[t.dest.isin(['JFK', 'SFO'])]
                    .group_by(['dest', 'year'])
                    .aggregate(t.arrdelay.mean().name('avg_delay')))
    what = annual_delay.group_by('dest')
    enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean())

    expr = (annual_delay.avg_delay.mean().name('grand_avg')
            .over(ibis.window(group_by=annual_delay.dest)))
    expected = annual_delay[annual_delay, expr]

    assert_equal(enriched, expected)
def test_compose_group_by_apis(self):
    t = self.t
    w = ibis.window(group_by=t.g, order_by=t.f)
    diff = t.d - t.d.lag()
    grouped = t.group_by('g').order_by('f')

    expr = grouped[t, diff.name('diff')]
    expr2 = grouped.mutate(diff=diff)
    expr3 = grouped.mutate([diff.name('diff')])

    window_expr = (t.d - t.d.lag().over(w)).name('diff')
    expected = t.projection([t, window_expr])

    assert_equal(expr, expected)
    assert_equal(expr, expr2)
    assert_equal(expr, expr3)
def test_partitioned_window(alltypes, func, df):
    t = alltypes
    window = ibis.window(
        group_by=t.string_col,
        order_by=t.timestamp_col,
        preceding=6,
        following=0,
    )

    def roller(func):
        def rolled(df):
            torder = df.sort_values('timestamp_col')
            rolling = torder.double_col.rolling(7, min_periods=0)
            return getattr(rolling, func)()

        return rolled

    f = getattr(t.double_col, func)
    expr = f().over(window).name('double_col')
    result = t.projection([expr]).execute().double_col
    expected = (
        df.groupby('string_col').apply(roller(func)).reset_index(drop=True)
    )
    tm.assert_series_equal(result, expected)
def test_window_has_pre_execute_scope():
    called = [0]

    @pre_execute.register(ops.Lag, Backend)
    def test_pre_execute(op, client, **kwargs):
        called[0] += 1
        return Scope()

    data = {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')}
    df = pd.DataFrame(data, columns=['key', 'value', 'dup'])
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    window = ibis.window(order_by='value')
    expr = t.key.lag(1).over(window).name('foo')

    result = expr.execute()
    assert result is not None

    # once in window op at the top to pickup any scope changes before computing
    # twice in window op when calling execute on the ops.Lag node at the
    # beginning of execute and once before the actual computation
    #
    # this process happens twice because of the pre_execute call on the Alias
    # operation
    assert called[0] == 3 + 3
def test_project_scope_does_not_override(t, df):
    col = t.plain_int64
    expr = t[
        [
            col.name('new_col'),
            col.sum()
            .over(ibis.window(group_by='dup_strings'))
            .name('grouped'),
        ]
    ]
    result = expr.execute()
    expected = pd.concat(
        [
            df[['plain_int64', 'dup_strings']].rename(
                columns={'plain_int64': 'new_col'}
            ),
            df.groupby('dup_strings')
            .plain_int64.transform('sum')
            .reset_index(drop=True)
            .rename('grouped'),
        ],
        axis=1,
    )[['new_col', 'grouped']]
    tm.assert_frame_equal(result, expected)
def test_window(backend, alltypes, df, con, result_fn, expected_fn):
    if not backend.supports_window_operations:
        pytest.skip(
            'Backend {} does not support window operations'.format(backend)
        )

    expr = alltypes.mutate(
        val=result_fn(
            alltypes,
            win=ibis.window(
                following=0,
                group_by=[alltypes.string_col],
                order_by=[alltypes.id],
            ),
        )
    )

    result = expr.execute().set_index('id').sort_index()
    column = expected_fn(df.sort_values('id').groupby('string_col'))
    expected = df.assign(val=column).set_index('id').sort_index()

    left, right = result.val, expected.val

    backend.assert_series_equal(left, right)
    columns = [group_by, order_by, 'G']
    expected = (
        batting_df[columns]
        .set_index(order_by)
        .groupby(group_by)
        .G.rolling(4, min_periods=1)
        .sum()
        .rename('rolled')
    )

    tm.assert_series_equal(
        result.set_index([group_by, order_by]).sort_index().rolled,
        expected.sort_index().astype("int64"),
    )


@pytest.mark.parametrize(
    'window',
    [
        ibis.window(order_by='yearID'),
        ibis.window(order_by='yearID', group_by='playerID'),
    ],
)
def test_window_failure_mode(batting, batting_df, window):
    # can't have order by without a following value of 0
    expr = batting.mutate(more_values=batting.G.sum().over(window))
    with pytest.raises(ibis.common.exceptions.OperationNotDefinedError):
        expr.execute()


def test_scalar_broadcasting(batting, batting_df):
    expr = batting.mutate(demeaned=batting.G - batting.G.mean())
    result = expr.execute()
    expected = batting_df.assign(demeaned=batting_df.G - batting_df.G.mean())
    tm.assert_frame_equal(result, expected)
def row_window():
    return ibis.window(following=0, order_by='plain_int64')
def range_window():
    return ibis.window(following=0, order_by='plain_datetimes_naive')
import pytest

import ibis
from ibis.backends.impala.tests.conftest import translate


@pytest.fixture(scope="module")
def table(mockcon):
    return mockcon.table("functional_alltypes")


@pytest.mark.parametrize(
    ("expr_fn", "expected"),
    [
        pytest.param(
            lambda t: ibis.row_number().over(
                ibis.window(order_by=t.float_col)
            ),
            '(row_number() OVER (ORDER BY `float_col`) - 1)',
        ),
        pytest.param(
            lambda t: t.string_col.lag(),
            'lag(`string_col`)',
            id="lag_default",
        ),
        pytest.param(
            lambda t: t.string_col.lag(2),
            'lag(`string_col`, 2)',
            id="lag_arg",
        ),
        pytest.param(
            lambda t: t.string_col.lag(default=0),
            'lag(`string_col`, 1, 0)',
            id="lag_explicit_default",
        ),
        pytest.param(
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name
        ).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
        )
        table_import = omnisci_server_worker.database(database_name).table(
            table_name
        )

        t0 = timer()
        table_import.read_csv(filename, delimiter=",")
        etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus
    # avoiding nested sql requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
    proj = grouped.mutate([lag, diff, first, last, lag2])
    expected = """\
SELECT *, lag(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `lag`,
       lead(`f`) OVER (PARTITION BY `g` ORDER BY `f`) - `f` AS `fwd_diff`,
       first_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `first`,
       last_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `last`,
       lag(`f`) OVER (PARTITION BY `g` ORDER BY `d`) AS `lag2`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)


@pytest.mark.impala
@pytest.mark.parametrize(
    ['window', 'frame'],
    [
        (window(preceding=0),
         'range between current row and unbounded following'),
        (window(following=0),
         'range between unbounded preceding and current row'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
        (window(preceding=5, following=0),
         'rows between 5 preceding and current row'),
        (window(preceding=5, following=2),
         'rows between 5 preceding and 2 following'),
        (window(following=2),
         'rows between unbounded preceding and 2 following'),
        (window(following=2, preceding=0),
         'rows between current row and 2 following'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
def test_array_return_type_reduction_window(con, t, df, qs):
    expr = quantiles(t.b, quantiles=qs).over(ibis.window())
    result = expr.execute()
    expected_raw = df.b.quantile(qs).tolist()
    expected = pd.Series([expected_raw] * len(df))
    tm.assert_series_equal(result, expected)
def test_window_unbounded_invalid(kind, begin, end):
    kwargs = {kind: (begin, end)}
    with pytest.raises(com.IbisInputError):
        ibis.window(**kwargs)
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first(),
            lambda t: t.float_col.head(1),
            id='first',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.last(),
            lambda t: t.float_col.tail(1),
            id='last',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first().over(ibis.window(preceding=10)),
            lambda t: t,
            id='first_preceding',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first().over(ibis.window(following=10)),
            lambda t: t,
            id='first_following',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: ibis.row_number(),
            lambda t: pd.Series(np.arange(len(t))),
            id='row_number',
            marks=pytest.mark.xfail,
            level=1).reset_index(drop=True)).set_index('id').sort_index())

    # discard first 2 rows of each group to account for the shift
    n = len(gdf) * 2
    left, right = result.val.shift(-n), expected.val.shift(-n)

    backend.assert_series_equal(left, right)


@pytest.mark.parametrize(
    'window_fn',
    [
        param(
            lambda t: ibis.window(
                preceding=2,
                following=0,
                group_by=[t.string_col],
                order_by=[t.id],
            ),
            id='preceding-2-following-0',
        ),
        param(
            lambda t: ibis.trailing_window(
                preceding=2, group_by=[t.string_col], order_by=[t.id]
            ),
            id='trailing-2',
        ),
    ],
)
@pytest.mark.xfail_unsupported
def test_grouped_bounded_preceding_windows(
    backend, alltypes, df, con, window_fn
):
    if not backend.supports_window_operations:
def setup(self):
    n = 30 * int(2e5)
    self.data = pd.DataFrame(
        {
            'key': np.random.choice(16000, size=n),
            'low_card_key': np.random.choice(30, size=n),
            'value': np.random.rand(n),
            'timestamps': pd.date_range(
                start='now', periods=n, freq='s'
            ).values,
            'timestamp_strings': pd.date_range(
                start='now', periods=n, freq='s'
            ).values.astype(str),
            'repeated_timestamps': pd.date_range(
                start='2018-09-01', periods=30
            ).repeat(int(n / 30)),
        }
    )

    t = ibis.pandas.connect({'df': self.data}).table('df')

    self.high_card_group_by = t.groupby(t.key).aggregate(
        avg_value=t.value.mean()
    )

    self.cast_to_dates = t.timestamps.cast(dt.date)
    self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

    self.multikey_group_by_with_mutate = (
        t.mutate(dates=t.timestamps.cast('date'))
        .groupby(['low_card_key', 'dates'])
        .aggregate(avg_value=lambda t: t.value.mean())
    )

    self.simple_sort = t.sort_by([t.key])
    self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

    self.multikey_sort = t.sort_by(['low_card_key', 'key'])
    self.multikey_sort_projection = t[
        ['low_card_key', 'key', 'value']
    ].sort_by(['low_card_key', 'key'])

    low_card_rolling_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.low_card_key,
    )
    self.low_card_grouped_rolling = t.value.mean().over(
        low_card_rolling_window
    )

    high_card_rolling_window = ibis.trailing_range_window(
        ibis.interval(days=2),
        order_by=t.repeated_timestamps,
        group_by=t.key,
    )
    self.high_card_grouped_rolling = t.value.mean().over(
        high_card_rolling_window
    )

    @udf.reduction(['double'], 'double')
    def my_mean(series):
        return series.mean()

    self.low_card_grouped_rolling_udf_mean = my_mean(t.value).over(
        low_card_rolling_window
    )
    self.high_card_grouped_rolling_udf_mean = my_mean(t.value).over(
        high_card_rolling_window
    )

    @udf.analytic(['double'], 'double')
    def my_zscore(series):
        return (series - series.mean()) / series.std()

    low_card_window = ibis.window(group_by=t.low_card_key)
    high_card_window = ibis.window(group_by=t.key)

    self.low_card_window_analytics_udf = my_zscore(t.value).over(
        low_card_window
    )
    self.high_card_window_analytics_udf = my_zscore(t.value).over(
        high_card_window
    )

    @udf.reduction(['double', 'double'], 'double')
    def my_wm(v, w):
        return np.average(v, weights=w)

    self.low_card_grouped_rolling_udf_wm = my_wm(t.value, t.value).over(
        low_card_rolling_window
    )
    # use the high-cardinality rolling window here (the original used the
    # low-cardinality window, which contradicts the attribute name)
    self.high_card_grouped_rolling_udf_wm = my_wm(t.value, t.value).over(
        high_card_rolling_window
    )
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
    import_mode,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name
        ).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if import_mode == "copy-from":
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(
                database_name
            ).table(table_name)

            t0 = timer()
            table_import.read_csv(
                filename, header=True, quotechar="", delimiter=","
            )
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)

        elif import_mode == "pandas":
            # Datafiles import
            columns_types_converted = [
                "float64" if (x.startswith("decimal")) else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types_converted,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000
            )

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/santander-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table
                )
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus
    # avoiding nested sql requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
def etl_ibis(
    args, run_import_queries, columns_names, columns_types, validation=False
):
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {
        "t_groupby_merge_where": 0.0,
        "t_train_test_split": 0.0,
        "t_etl": 0.0,
    }

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(
            database_name
        ).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )
        table_import = omnisci_server_worker.database(database_name).table(
            table_name
        )
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus
    # avoiding nested sql requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0

    etl_times["t_etl"] = (
        etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    )

    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']

    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
def test_preceding_following_validate(alltypes):
    # these all work
    [
        ibis.window(preceding=0),
        ibis.window(following=0),
        ibis.window(preceding=0, following=0),
        ibis.window(preceding=(None, 4)),
        ibis.window(preceding=(10, 4)),
        ibis.window(following=(4, None)),
        ibis.window(following=(4, 10)),
    ]

    # these are ill-specified
    error_cases = [
        lambda: ibis.window(preceding=(1, 3)),
        lambda: ibis.window(preceding=(3, 1), following=2),
        lambda: ibis.window(preceding=(3, 1), following=(2, 4)),
        lambda: ibis.window(preceding=-1),
        lambda: ibis.window(following=-1),
        lambda: ibis.window(preceding=(-1, 2)),
        lambda: ibis.window(following=(2, -1)),
    ]

    for case in error_cases:
        with pytest.raises(Exception):
            case()
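# A minimal sketch of the tuple semantics exercised above (hedged, inferred from
# the valid/invalid cases and from the frame strings in the frame-spec test): a
# two-element tuple for ``preceding`` is read as (furthest, nearest) rows before
# the current row, so the furthest bound must be the larger one, None means
# unbounded, and negative bounds are rejected. Variable names are illustrative.
import ibis

w_bounded = ibis.window(preceding=(10, 4))    # ROWS BETWEEN 10 PRECEDING AND 4 PRECEDING
w_open = ibis.window(preceding=(None, 4))     # presumably: UNBOUNDED PRECEDING AND 4 PRECEDING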
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(
        fragments_size, count_table=1, import_mode=import_mode
    )

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)

        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(
                database_name
            ).table(table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(
                filename, header=True, quotechar="", delimiter=","
            )
            etl_times["t_readcsv"] = timer() - t0

        elif import_mode == "pandas":
            # decimal(8, 4) is converted to decimal(9, 6) in order to provide
            # better data conversion accuracy during import from Pandas into
            # OmniSciDB for proper results validation
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times["t_connect"] += (
                omnisci_server_worker.get_conn_creation_time()
            )

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times["t_connect"] += (
                    omnisci_server_worker.get_conn_creation_time()
                )

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus
    # avoiding nested sql requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name(col_gt1)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
def test_window_group_by():
    t = ibis.table(dict(a="int64", b="string"), name="t")
    expr = t.a.mean().over(ibis.window(group_by=t.b))

    result = repr(expr)

    assert "preceding=0" not in result
    assert "group_by=[r0.b]" in result
def low_card_window(t):
    return ibis.window(group_by=t.low_card_key)
    proj = grouped.mutate([lag, diff, first, last, lag2])
    expected = """\
SELECT *, lag(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `lag`,
       lead(`f`) OVER (PARTITION BY `g` ORDER BY `f`) - `f` AS `fwd_diff`,
       first_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `first`,
       last_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `last`,
       lag(`f`) OVER (PARTITION BY `g` ORDER BY `d`) AS `lag2`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)


@pytest.mark.parametrize(
    ['window', 'frame'],
    [
        (
            window(preceding=0),
            'range between current row and unbounded following',
        ),
        (
            window(following=0),
            'range between unbounded preceding and current row',
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(preceding=5, following=0),
            'rows between 5 preceding and current row',
        ),
        (
def high_card_window(t):
    return ibis.window(group_by=t.key)