def test_window_rows_with_max_lookback(con):
    """Compiling a max-lookback trailing window must raise.

    A trailing window is built from ``rows_with_max_lookback`` (3 rows,
    3-day interval) ordered by column ``i``; compiling a sum over that
    window is expected to raise ``NotImplementedError``.
    """
    alltypes = con.table('alltypes')
    lookback = rows_with_max_lookback(3, ibis.interval(days=3))
    window = ibis.trailing_window(lookback, order_by=alltypes.i)
    windowed_sum = alltypes.a.sum().over(window)

    with pytest.raises(NotImplementedError):
        ImpalaCompiler.to_sql(windowed_sum)
def test_unsupported_aggregate_functions(alltypes, column, op):
    """Compiling the aggregate named ``op`` over a window must fail.

    The method ``op`` is looked up by name on ``alltypes[column]``, applied
    over a window ordered by ``d`` and projected as ``foo``; compiling that
    projection is expected to raise ``com.TranslationError``.
    """
    window = ibis.window(order_by=alltypes.d)
    aggregate = getattr(alltypes[column], op)
    windowed = aggregate().over(window).name('foo')
    projection = alltypes.projection([windowed])

    with pytest.raises(com.TranslationError):
        ImpalaCompiler.to_sql(projection)
def test_cumulative_functions(alltypes, cumulative, static):
    """A cumulative helper compiles like its static form, cumulatively windowed.

    ``cumulative`` and ``static`` are callables taking ``(table, window)``;
    the static result is additionally placed over
    ``ibis.cumulative_window()``.  Both projections must compile to the
    same SQL text.
    """
    window = ibis.window(order_by=alltypes.d)

    cumulative_expr = cumulative(alltypes, window).name('foo')
    static_expr = (
        static(alltypes, window).over(ibis.cumulative_window()).name('foo')
    )

    cumulative_proj = alltypes.projection(cumulative_expr)
    static_proj = alltypes.projection(static_expr)

    cumulative_sql = ImpalaCompiler.to_sql(cumulative_proj)
    static_sql = ImpalaCompiler.to_sql(static_proj)
    assert cumulative_sql == static_sql
def test_isin_notin_in_select(table):
    """``isin`` / ``notin`` filters compile to ``IN`` / ``NOT IN`` predicates."""
    values = {'foo', 'bar'}
    # The expected literal list is the repr of a tuple built from the same
    # set, so it follows that set's iteration order.
    values_formatted = tuple(values)

    # NOTE(review): the line breaks inside both expected literals were
    # restored from a flattened copy; confirm against the compiler output.
    filtered = table[table.g.isin(values)]
    result = ImpalaCompiler.to_sql(filtered)
    expected = f"""SELECT *
FROM alltypes
WHERE `g` IN {values_formatted}"""
    assert result == expected

    filtered = table[table.g.notin(values)]
    result = ImpalaCompiler.to_sql(filtered)
    expected = f"""SELECT *
FROM alltypes
WHERE `g` NOT IN {values_formatted}"""
    assert result == expected
def test_identical_to(con):
    """``identical_to`` between two columns compiles to IS NOT DISTINCT FROM."""
    t = con.table('functional_alltypes')
    expr = t.tinyint_col.identical_to(t.double_col)
    result = ImpalaCompiler.to_sql(expr)
    # NOTE(review): the line breaks inside this literal were restored from a
    # flattened copy; confirm against the compiler output.
    expected = """\
SELECT `tinyint_col` IS NOT DISTINCT FROM `double_col` AS `tmp`
FROM ibis_testing.`functional_alltypes`"""
    assert result == expected
def test_is_parens_identical_to():
    """Each ``identical_to(None)`` operand of ``==`` is parenthesized."""
    t = ibis.table([('a', 'string'), ('b', 'string')], 'table')
    expr = t[t.a.identical_to(None) == t.b.identical_to(None)]

    result = ImpalaCompiler.to_sql(expr)
    # NOTE(review): the line breaks inside this literal were restored from a
    # flattened copy; confirm against the compiler output.
    expected = """\
SELECT *
FROM `table`
WHERE (`a` IS NOT DISTINCT FROM NULL) = (`b` IS NOT DISTINCT FROM NULL)"""
    assert result == expected
def test_join_aliasing():
    """Check table aliases in a join of two aggregations over one base table.

    Both join operands derive from ``test_table`` after the same ``mutate``
    step; the expected SQL pins the CTE names (``t0``-``t2``) and subquery
    aliases (``t3``, ``t4``, ``t5``, ``t8``) the compiler assigns.
    """
    test = ibis.table(
        [('a', 'int64'), ('b', 'int64'), ('c', 'int64')],
        name='test_table',
    )
    test = test.mutate(d=test.a + 20)
    test2 = test[test.d, test.c]
    idx = (test2.d / 15).cast('int64').name('idx')
    test3 = test2.groupby([test2.d, idx, test2.c]).aggregate(
        row_count=test2.count()
    )
    test3_totals = test3.groupby(test3.d).aggregate(
        total=test3.row_count.sum()
    )
    test4 = test3.join(test3_totals, test3.d == test3_totals.d)[
        test3, test3_totals.total
    ]
    test5 = test4[test4.row_count < test4.total / 2]
    agg = (
        test.groupby([test.d, test.b])
        .aggregate(count=test.count(), unique=test.c.nunique())
        .view()
    )
    joined = agg.join(test5, agg.d == test5.d)[agg, test5.total]
    result = ImpalaCompiler.to_sql(joined)
    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
WITH t0 AS (
  SELECT `d`, `c`
  FROM t2
),
t1 AS (
  SELECT `d`, CAST(`d` / 15 AS bigint) AS `idx`, `c`, count(*) AS `row_count`
  FROM t0
  GROUP BY 1, 2, 3
),
t2 AS (
  SELECT *, `a` + 20 AS `d`
  FROM test_table
)
SELECT t3.*, t4.`total`
FROM (
  SELECT `d`, `b`, count(*) AS `count`, count(DISTINCT `c`) AS `unique`
  FROM t2
  GROUP BY 1, 2
) t3
  INNER JOIN (
    SELECT t5.*
    FROM (
      SELECT t1.*, t8.`total`
      FROM t1
        INNER JOIN (
          SELECT `d`, sum(`row_count`) AS `total`
          FROM t1
          GROUP BY 1
        ) t8
          ON t1.`d` = t8.`d`
    ) t5
    WHERE t5.`row_count` < (t5.`total` / 2)
  ) t4
    ON t3.`d` = t4.`d`"""
    assert result == expected
def test_relabel_projection():
    """``relabel`` compiles to ``AS`` aliases on the renamed columns only."""
    # GH #551
    types = ['int32', 'string', 'double']
    table = ibis.table(zip(['foo', 'bar', 'baz'], types), name='table')
    relabeled = table.relabel({'foo': 'one', 'baz': 'three'})

    result = ImpalaCompiler.to_sql(relabeled)
    # NOTE(review): the line breaks inside this literal were restored from a
    # flattened copy; confirm against the compiler output.
    expected = """\
SELECT `foo` AS `one`, `bar`, `baz` AS `three`
FROM `table`"""
    assert result == expected
def test_is_parens(method, sql):
    """Each null-test operand of ``==`` is parenthesized in the SQL.

    ``method`` names a no-argument column method applied to both ``a`` and
    ``b``; ``sql`` is the operator text expected between each column and
    ``NULL``.
    """
    t = ibis.table([('a', 'string'), ('b', 'string')], 'table')
    func = operator.methodcaller(method)
    expr = t[func(t.a) == func(t.b)]

    result = ImpalaCompiler.to_sql(expr)
    # NOTE(review): the line breaks inside this literal were restored from a
    # flattened copy; confirm against the compiler output.
    expected = f"""\
SELECT *
FROM `table`
WHERE (`a` {sql} NULL) = (`b` {sql} NULL)"""
    assert result == expected
def test_join_no_predicates_for_impala(con):
    """Joins written without predicates all compile to ``CROSS JOIN``."""
    # Impala requires that joins without predicates be written explicitly
    # as CROSS JOIN, since result sets can accidentally get too large if a
    # query is executed before predicates are written
    t1 = con.table('star1')
    t2 = con.table('star2')

    joined2 = t1.cross_join(t2)[[t1]]

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
SELECT t0.*
FROM star1 t0
  CROSS JOIN star2 t1"""
    result2 = ImpalaCompiler.to_sql(joined2)
    assert result2 == expected

    # The other join flavours, given no predicates, must produce the same SQL.
    for jtype in ['inner_join', 'left_join', 'outer_join']:
        joined = getattr(t1, jtype)(t2)[[t1]]
        result = ImpalaCompiler.to_sql(joined)
        assert result == expected
def test_sql_extract(table):
    """``year()`` / ``month()`` / ``day()`` compile to ``extract`` calls."""
    # integration with SQL translation
    expr = table[
        table.i.year().name('year'),
        table.i.month().name('month'),
        table.i.day().name('day'),
    ]

    result = ImpalaCompiler.to_sql(expr)
    # NOTE(review): the line breaks and the wrap point of the select list
    # were restored from a flattened copy; confirm against the compiler
    # output.
    expected = """\
SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`,
       extract(`i`, 'day') AS `day`
FROM alltypes"""
    assert result == expected
def test_join_with_nested_xor_condition():
    """An XOR join predicate is emitted as ``(A OR B) AND NOT (A AND B)``."""
    t1 = ibis.table([('a', 'string'), ('b', 'string')], 't')
    t2 = t1.view()

    joined = t1.join(t2, [t1.a == t2.a, (t1.a != t2.b) ^ (t1.b != t2.a)])
    expr = joined[t1]

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
SELECT t0.*
FROM t t0
  INNER JOIN t t1
    ON (t0.`a` = t1.`a`) AND
       (((t0.`a` != t1.`b`) OR (t0.`b` != t1.`a`)) AND NOT ((t0.`a` != t1.`b`) AND (t0.`b` != t1.`a`)))"""  # noqa: E501
    assert ImpalaCompiler.to_sql(expr) == expected
def test_logically_negate_complex_boolean_expr():
    """Negating a compound predicate wraps the whole conjunction in NOT."""
    t = ibis.table(
        [('a', 'string'), ('b', 'double'), ('c', 'int64'), ('d', 'string')],
        name='t',
    )

    expr = t.a.isin(['foo']) & t.c.notnull()

    result = ImpalaCompiler.to_sql(~expr)
    # NOTE(review): the line breaks inside this literal were restored from a
    # flattened copy; confirm against the compiler output.
    expected = """\
SELECT NOT (`a` IN ('foo') AND (`c` IS NOT NULL)) AS `tmp`
FROM t"""
    assert result == expected
def _check_impala_output_types_match(con, table):
    """Fail the test if running ``table``'s compiled SQL changes a column type.

    ``table`` is compiled to SQL, the query text is passed to ``con.sql``,
    and the schema of that result is compared position by position with
    ``table.schema()``.  ``Category`` types on either side are converted
    with ``to_integer_type()`` before the comparison.  The first mismatch
    ends the test through ``pytest.fail``.
    """
    query = ImpalaCompiler.to_sql(table)
    roundtripped = con.sql(query)

    def _clean_type(x):
        if isinstance(x, Category):
            x = x.to_integer_type()
        return x

    left_schema, right_schema = roundtripped.schema(), table.schema()
    # Column names come from the left (query result) schema; zip pairs the
    # types positionally and stops at the shorter schema.
    columns = zip(left_schema.names, left_schema.types, right_schema.types)
    for name, left_type, right_type in columns:
        left_type = _clean_type(left_type)
        right_type = _clean_type(right_type)

        if left_type != right_type:
            pytest.fail(
                'Value for {} had left type {}'
                ' and right type {}'.format(name, left_type, right_type)
            )
def test_multiple_filters():
    """Filtering a filtered table on its own max repeats the first filter.

    The expected SQL applies ``a < 100`` both in the outer subquery and
    inside the scalar ``max`` subquery.
    """
    t = ibis.table([('a', 'int64'), ('b', 'string')], name='t0')
    filt = t[t.a < 100]
    expr = filt[filt.a == filt.a.max()]

    result = ImpalaCompiler.to_sql(expr)
    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
SELECT *
FROM (
  SELECT *
  FROM t0
  WHERE `a` < 100
) t0
WHERE `a` = (
  SELECT max(`a`) AS `max`
  FROM t0
  WHERE `a` < 100
)"""
    assert result == expected
def test_nested_joins_single_cte():
    """Nested left joins sharing one aggregation emit a single CTE.

    The per-``uuid`` count is referenced twice (as a join operand and as
    the source of ``max_count``); the expected SQL defines it once as
    ``t0``.
    """
    t = ibis.table([('uuid', 'string'), ('ts', 'timestamp')], name='t')

    counts = t.group_by('uuid').size()

    last_visit = t.group_by('uuid').aggregate(last_visit=t.ts.max())

    max_counts = counts.group_by('uuid').aggregate(
        max_count=counts['count'].max()
    )

    main_kw = max_counts.left_join(
        counts, ['uuid', max_counts.max_count == counts['count']]
    ).projection([counts])

    result = main_kw.left_join(last_visit, 'uuid').projection(
        [main_kw, last_visit.last_visit]
    )

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
WITH t0 AS (
  SELECT `uuid`, count(*) AS `count`
  FROM t
  GROUP BY 1
)
SELECT t1.*, t2.`last_visit`
FROM (
  SELECT t0.*
  FROM (
    SELECT `uuid`, max(`count`) AS `max_count`
    FROM t0
    GROUP BY 1
  ) t3
    LEFT OUTER JOIN t0
      ON (t3.`uuid` = t0.`uuid`) AND
         (t3.`max_count` = t0.`count`)
) t1
  LEFT OUTER JOIN (
    SELECT `uuid`, max(`ts`) AS `last_visit`
    FROM t
    GROUP BY 1
  ) t2
    ON t1.`uuid` = t2.`uuid`"""
    compiled_result = ImpalaCompiler.to_sql(result)
    assert compiled_result == expected
def test_bucket_assign_labels(table):
    """Labelling bucket ids compiles to a simple CASE over the bucket CASE.

    Also checks that ``label`` raises ``ValueError`` when given three or
    five labels for the four buckets.
    """
    buckets = [0, 10, 25, 50]
    bucket = table.f.bucket(buckets, include_under=True)

    size = table.group_by(bucket.name('tier')).size()
    labelled = size.tier.label(
        ['Under 0', '0 to 10', '10 to 25', '25 to 50'], nulls='error'
    ).name('tier2')
    expr = size[labelled, size['count']]

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
SELECT
  CASE `tier`
    WHEN 0 THEN 'Under 0'
    WHEN 1 THEN '0 to 10'
    WHEN 2 THEN '10 to 25'
    WHEN 3 THEN '25 to 50'
    ELSE 'error'
  END AS `tier2`, `count`
FROM (
  SELECT
    CASE
      WHEN `f` < 0 THEN 0
      WHEN (0 <= `f`) AND (`f` < 10) THEN 1
      WHEN (10 <= `f`) AND (`f` < 25) THEN 2
      WHEN (25 <= `f`) AND (`f` <= 50) THEN 3
      ELSE CAST(NULL AS tinyint)
    END AS `tier`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""
    result = ImpalaCompiler.to_sql(expr)
    assert result == expected

    # Too few labels.
    with pytest.raises(ValueError):
        size.tier.label(list("abc"))

    # Too many labels.
    with pytest.raises(ValueError):
        size.tier.label(list("abcde"))
def test_nested_join_base():
    """An aggregation joined back to its own source reuses one CTE."""
    t = ibis.table([('uuid', 'string'), ('ts', 'timestamp')], name='t')
    counts = t.group_by('uuid').size()
    max_counts = counts.group_by('uuid').aggregate(
        max_count=lambda x: x['count'].max()
    )
    result = max_counts.left_join(counts, 'uuid').projection([counts])
    compiled_result = ImpalaCompiler.to_sql(result)

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
WITH t0 AS (
  SELECT `uuid`, count(*) AS `count`
  FROM t
  GROUP BY 1
)
SELECT t0.*
FROM (
  SELECT `uuid`, max(`count`) AS `max_count`
  FROM t0
  GROUP BY 1
) t1
  LEFT OUTER JOIN t0
    ON t1.`uuid` = t0.`uuid`"""
    assert compiled_result == expected
def assert_sql_equal(expr, expected):
    """Assert that ``expr`` compiles to exactly the SQL text ``expected``."""
    compiled = ImpalaCompiler.to_sql(expr)
    assert compiled == expected
def test_decimal_builtins(con, expr, expected):
    """Executing ``expr`` on ``con`` must return ``expected``.

    The compiled SQL of ``expr`` is used as the assertion message, so it is
    shown when the comparison fails.
    """
    actual = con.execute(expr)
    assert actual == expected, ImpalaCompiler.to_sql(expr)
def test_nested_join_multiple_ctes():
    """A filter using ``isin`` on a derived table reuses the shared CTEs.

    The projected ratings (``t0``) and their join with movies (``t1``) are
    referenced by both the outer filter and the ``IN`` subquery; the
    expected SQL defines each once.
    """
    ratings = ibis.table(
        [
            ('userid', 'int64'),
            ('movieid', 'int64'),
            ('rating', 'int8'),
            ('timestamp', 'string'),
        ],
        name='ratings',
    )
    movies = ibis.table(
        [('movieid', 'int64'), ('title', 'string')], name='movies'
    )

    expr = ratings.timestamp.cast('timestamp')
    ratings2 = ratings['userid', 'movieid', 'rating', expr.name('datetime')]
    joined2 = ratings2.join(movies, ['movieid'])[ratings2, movies['title']]
    joined3 = joined2.filter(
        [joined2.userid == 118205, joined2.datetime.year() > 2001]
    )
    top_user_old_movie_ids = joined3.filter(
        [joined3.userid == 118205, joined3.datetime.year() < 2009]
    )[['movieid']]
    # projection from a filter was hiding an insidious bug, so we're disabling
    # that for now see issue #1295
    cond = joined3.movieid.isin(top_user_old_movie_ids.movieid)
    result = joined3[cond]

    # NOTE(review): the line breaks and indentation inside this literal were
    # restored from a flattened copy; confirm against the compiler output.
    expected = """\
WITH t0 AS (
  SELECT `userid`, `movieid`, `rating`,
         CAST(`timestamp` AS timestamp) AS `datetime`
  FROM ratings
),
t1 AS (
  SELECT t0.*, t5.`title`
  FROM t0
    INNER JOIN movies t5
      ON t0.`movieid` = t5.`movieid`
)
SELECT t2.*
FROM (
  SELECT t1.*
  FROM t1
  WHERE (t1.`userid` = 118205) AND
        (extract(t1.`datetime`, 'year') > 2001)
) t2
WHERE t2.`movieid` IN (
  SELECT `movieid`
  FROM (
    SELECT `movieid`
    FROM (
      SELECT t1.*
      FROM t1
      WHERE (t1.`userid` = 118205) AND
            (extract(t1.`datetime`, 'year') > 2001) AND
            (t1.`userid` = 118205) AND
            (extract(t1.`datetime`, 'year') < 2009)
    ) t5
  ) t4
)"""
    compiled = ImpalaCompiler.to_sql(result)
    assert compiled == expected
def test_identical_to_special_case():
    """``identical_to`` between two int64-cast NULLs compiles to ``TRUE``."""
    lhs = ibis.NA.cast('int64')
    rhs = ibis.NA.cast('int64')

    compiled = ImpalaCompiler.to_sql(lhs.identical_to(rhs))
    assert compiled == 'SELECT TRUE AS `tmp`'