class TestInteractiveUse(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_interactive_execute_on_repr(self): table = self.con.table('functional_alltypes') expr = table.bigint_col.sum() with config.option_context('interactive', True): repr(expr) assert len(self.con.executed_queries) > 0 def test_default_limit(self): table = self.con.table('functional_alltypes') with config.option_context('interactive', True): repr(table) expected = """\ SELECT * FROM functional_alltypes LIMIT {0}""".format(config.options.sql.default_limit) assert self.con.executed_queries[0] == expected def test_disable_query_limit(self): table = self.con.table('functional_alltypes') with config.option_context('interactive', True): with config.option_context('sql.default_limit', None): repr(table) expected = """\ SELECT * FROM functional_alltypes""" assert self.con.executed_queries[0] == expected def test_interactive_non_compilable_repr_not_fail(self): # #170 table = self.con.table('functional_alltypes') expr = table.string_col.topk(3) # it works! with config.option_context('interactive', True): repr(expr) def test_histogram_repr_no_query_execute(self): t = self.con.table('functional_alltypes') tier = t.double_col.histogram(10).name('bucket') expr = t.group_by(tier).size() with config.option_context('interactive', True): expr._repr() assert self.con.executed_queries == []
class TestDistinct(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_simple_table_distinct(self): t = self.con.table('functional_alltypes') expr = t[t.string_col, t.int_col].distinct() result = to_sql(expr) expected = """SELECT DISTINCT `string_col`, `int_col` FROM functional_alltypes""" assert result == expected def test_array_distinct(self): t = self.con.table('functional_alltypes') expr = t.string_col.distinct() result = to_sql(expr) expected = """SELECT DISTINCT `string_col` FROM functional_alltypes""" assert result == expected def test_count_distinct(self): t = self.con.table('functional_alltypes') metric = t.int_col.nunique().name('nunique') expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric]) result = to_sql(expr) expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique` FROM functional_alltypes WHERE `bigint_col` > 0 GROUP BY 1""" assert result == expected def test_multiple_count_distinct(self): # Impala and some other databases will not execute multiple # count-distincts in a single aggregation query. This error reporting # will be left to the database itself, for now. t = self.con.table('functional_alltypes') metrics = [ t.int_col.nunique().name('int_card'), t.smallint_col.nunique().name('smallint_card') ] expr = t.group_by('string_col').aggregate(metrics) result = to_sql(expr) expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`, COUNT(DISTINCT `smallint_col`) AS `smallint_card` FROM functional_alltypes GROUP BY 1""" assert result == expected
class TestDistinct(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_simple_table_distinct(self): t = self.con.table('functional_alltypes') expr = t[t.string_col, t.int_col].distinct() result = to_sql(expr) expected = """SELECT DISTINCT `string_col`, `int_col` FROM functional_alltypes""" assert result == expected def test_array_distinct(self): t = self.con.table('functional_alltypes') expr = t.string_col.distinct() result = to_sql(expr) expected = """SELECT DISTINCT `string_col` FROM functional_alltypes""" assert result == expected def test_count_distinct(self): t = self.con.table('functional_alltypes') metric = t.int_col.nunique().name('nunique') expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric]) result = to_sql(expr) expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique` FROM functional_alltypes WHERE `bigint_col` > 0 GROUP BY 1""" assert result == expected def test_multiple_count_distinct(self): # Impala and some other databases will not execute multiple # count-distincts in a single aggregation query. This error reporting # will be left to the database itself, for now. t = self.con.table('functional_alltypes') metrics = [t.int_col.nunique().name('int_card'), t.smallint_col.nunique().name('smallint_card')] expr = t.group_by('string_col').aggregate(metrics) result = to_sql(expr) expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`, COUNT(DISTINCT `smallint_col`) AS `smallint_card` FROM functional_alltypes GROUP BY 1""" assert result == expected
class TestAnalytics(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('functional_alltypes') def test_category_project(self): t = self.alltypes tier = t.double_col.bucket([0, 50, 100]).name('tier') expr = t[tier, t] assert isinstance(expr.tier, ir.CategoryArray) def test_bucket(self): d = self.alltypes.double_col bins = [0, 10, 50, 100] expr = d.bucket(bins) assert isinstance(expr, ir.CategoryArray) assert expr.op().nbuckets == 3 expr = d.bucket(bins, include_over=True) assert expr.op().nbuckets == 4 expr = d.bucket(bins, include_over=True, include_under=True) assert expr.op().nbuckets == 5 def test_bucket_error_cases(self): d = self.alltypes.double_col self.assertRaises(ValueError, d.bucket, []) self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo') # it works! d.bucket([10], include_under=True, include_over=True) self.assertRaises(ValueError, d.bucket, [10]) self.assertRaises(ValueError, d.bucket, [10], include_under=True) self.assertRaises(ValueError, d.bucket, [10], include_over=True) def test_histogram(self): d = self.alltypes.double_col self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5) self.assertRaises(ValueError, d.histogram) self.assertRaises(ValueError, d.histogram, 10, closed='foo') def test_topk_analysis_bug(self): # GH #398 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) filtered = t.filter([delay_filter]) # predicate is unmodified by analysis post_pred = filtered.op().predicates[1] assert delay_filter.equals(post_pred)
def test_ctas_ddl(self): con = MockConnection() select = build_ast(con.table('test1')).queries[0] statement = ksupport.CTASKudu( 'another_table', 'kudu_name', ['dom.d.com:7051'], select, ['string_col'], external=True, can_exist=False, database='foo', ) result = statement.compile() expected = """\ CREATE EXTERNAL TABLE foo.`another_table` TBLPROPERTIES ( 'kudu.key_columns'='string_col', 'kudu.master_addresses'='dom.d.com:7051', 'kudu.table_name'='kudu_name', 'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler' ) AS SELECT * FROM test1""" assert result == expected
class TestCoalesceGreaterLeast(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_coalesce(self): t = self.table cases = [ (ibis.coalesce(t.string_col, 'foo'), "coalesce(`string_col`, 'foo')"), (ibis.coalesce(t.int_col, t.bigint_col), 'coalesce(`int_col`, `bigint_col`)'), ] self._check_expr_cases(cases) def test_greatest(self): t = self.table cases = [ (ibis.greatest(t.string_col, 'foo'), "greatest(`string_col`, 'foo')"), (ibis.greatest(t.int_col, t.bigint_col), 'greatest(`int_col`, `bigint_col`)'), ] self._check_expr_cases(cases) def test_least(self): t = self.table cases = [ (ibis.least(t.string_col, 'foo'), "least(`string_col`, 'foo')"), (ibis.least(t.int_col, t.bigint_col), 'least(`int_col`, `bigint_col`)'), ] self._check_expr_cases(cases)
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_analytic_exprs(self): t = self.table w = ibis.window(order_by=t.float_col) cases = [ (ibis.row_number().over(w), '(row_number() OVER (ORDER BY `float_col`) - 1)'), (t.string_col.lag(), 'lag(`string_col`)'), (t.string_col.lag(2), 'lag(`string_col`, 2)'), (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'), (t.string_col.lead(), 'lead(`string_col`)'), (t.string_col.lead(2), 'lead(`string_col`, 2)'), (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'), (t.double_col.first(), 'first_value(`double_col`)'), (t.double_col.last(), 'last_value(`double_col`)'), # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))') ] self._check_expr_cases(cases)
class TestInNotIn(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table("alltypes") def test_field_in_literals(self): cases = [ (self.table.g.isin(["foo", "bar", "baz"]), "`g` IN ('foo', 'bar', 'baz')"), (self.table.g.notin(["foo", "bar", "baz"]), "`g` NOT IN ('foo', 'bar', 'baz')"), ] self._check_expr_cases(cases) def test_literal_in_list(self): cases = [ (L(2).isin([self.table.a, self.table.b, self.table.c]), "2 IN (`a`, `b`, `c`)"), (L(2).notin([self.table.a, self.table.b, self.table.c]), "2 NOT IN (`a`, `b`, `c`)"), ] self._check_expr_cases(cases) def test_isin_notin_in_select(self): filtered = self.table[self.table.g.isin(["foo", "bar"])] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE `g` IN ('foo', 'bar')""" assert result == expected filtered = self.table[self.table.g.notin(["foo", "bar"])] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE `g` NOT IN ('foo', 'bar')""" assert result == expected
class TestUnions(unittest.TestCase): def setUp(self): self.con = MockConnection() table = self.con.table('functional_alltypes') self.t1 = (table[table.int_col > 0][ table.string_col.name('key'), table.float_col.cast('double').name('value')]) self.t2 = (table[table.int_col <= 0][table.string_col.name('key'), table.double_col.name('value')]) self.union1 = self.t1.union(self.t2) def test_union(self): result = to_sql(self.union1) expected = """\ SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value` FROM functional_alltypes WHERE `int_col` > 0 UNION ALL SELECT `string_col` AS `key`, `double_col` AS `value` FROM functional_alltypes WHERE `int_col` <= 0""" assert result == expected def test_union_distinct(self): union = self.t1.union(self.t2, distinct=True) result = to_sql(union) expected = """\ SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value` FROM functional_alltypes WHERE `int_col` > 0 UNION SELECT `string_col` AS `key`, `double_col` AS `value` FROM functional_alltypes WHERE `int_col` <= 0""" assert result == expected def test_union_project_column(self): # select a column, get a subquery expr = self.union1[[self.union1.key]] result = to_sql(expr) expected = """SELECT `key` FROM ( SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value` FROM functional_alltypes WHERE `int_col` > 0 UNION ALL SELECT `string_col` AS `key`, `double_col` AS `value` FROM functional_alltypes WHERE `int_col` <= 0 ) t0""" assert result == expected def test_union_extract_with_block(self): pass def test_union_in_subquery(self): pass
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_analytic_exprs(self): t = self.table w = ibis.window(order_by=t.float_col) cases = [ (ibis.row_number().over(w), 'row_number() OVER (ORDER BY `float_col`) - 1'), (t.string_col.lag(), 'lag(`string_col`)'), (t.string_col.lag(2), 'lag(`string_col`, 2)'), (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'), (t.string_col.lead(), 'lead(`string_col`)'), (t.string_col.lead(2), 'lead(`string_col`, 2)'), (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'), (t.double_col.first(), 'first_value(`double_col`)'), (t.double_col.last(), 'last_value(`double_col`)'), # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))') ] self._check_expr_cases(cases)
class TestInsert(unittest.TestCase): def setUp(self): self.con = MockConnection() self.t = self.con.table('functional_alltypes') def test_select_basics(self): name = 'testing123456' expr = self.t.limit(10) select, _ = _get_select(expr) stmt = ddl.InsertSelect(name, select, database='foo') result = stmt.compile() expected = """\ INSERT INTO foo.`testing123456` SELECT * FROM functional_alltypes LIMIT 10""" assert result == expected stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True) result = stmt.compile() expected = """\ INSERT OVERWRITE foo.`testing123456` SELECT * FROM functional_alltypes LIMIT 10""" assert result == expected def test_select_overwrite(self): pass
class TestInNotIn(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_field_in_literals(self): cases = [(self.table.g.isin(["foo", "bar", "baz"]), "g IN ('foo', 'bar', 'baz')"), (self.table.g.notin(["foo", "bar", "baz"]), "g NOT IN ('foo', 'bar', 'baz')")] self._check_expr_cases(cases) def test_literal_in_list(self): cases = [ (ibis.literal(2).isin([self.table.a, self.table.b, self.table.c]), '2 IN (a, b, c)'), (ibis.literal(2).notin([self.table.a, self.table.b, self.table.c]), '2 NOT IN (a, b, c)') ] self._check_expr_cases(cases) def test_isin_notin_in_select(self): filtered = self.table[self.table.g.isin(["foo", "bar"])] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE g IN ('foo', 'bar')""" assert result == expected filtered = self.table[self.table.g.notin(["foo", "bar"])] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE g NOT IN ('foo', 'bar')""" assert result == expected
class TestDistinct(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_distinct_basic(self): expr = self.table.distinct() assert isinstance(expr.op(), ops.Distinct) assert isinstance(expr, ir.TableExpr) assert expr.op().table is self.table expr = self.table.string_col.distinct() assert isinstance(expr.op(), ops.DistinctArray) assert isinstance(expr, ir.StringArray) # def test_distinct_array_interactions(self): # TODO # array cardinalities / shapes are likely to be different. # a = self.table.int_col.distinct() # b = self.table.bigint_col # self.assertRaises(ir.RelationError, a.__add__, b) def test_distinct_count(self): result = self.table.string_col.distinct().count() expected = self.table.string_col.nunique().name('count') assert_equal(result, expected) assert isinstance(result.op(), ops.CountDistinct) def test_distinct_unnamed_array_expr(self): table = ibis.table([('year', 'int32'), ('month', 'int32'), ('day', 'int32')], 'foo') # it works! expr = (ibis.literal('-') .join([table.year.cast('string'), table.month.cast('string'), table.day.cast('string')]) .distinct()) repr(expr) def test_distinct_count_numeric_types(self): table = self.table metric = (table.bigint_col.distinct().count() .name('unique_bigints')) table.group_by('string_col').aggregate(metric) def test_nunique(self): expr = self.table.string_col.nunique() assert isinstance(expr.op(), ops.CountDistinct) def test_project_with_distinct(self): pass
def test_memoize_database_table(self): con = MockConnection() table = con.table('test1') table2 = con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) met1 = (table3['f'] - table2['value']).mean().name('foo') result = joined.aggregate([met1, table3['f'].sum().name('bar')], by=[table3['g'], table2['key']]) formatted = repr(result) assert formatted.count('test1') == 1 assert formatted.count('test2') == 1
class TestInNotIn(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_field_in_literals(self): values = ['foo', 'bar', 'baz'] values_formatted = tuple(set(values)) cases = [ (self.table.g.isin(values), "`g` IN {}".format(values_formatted)), (self.table.g.notin(values), "`g` NOT IN {}".format(values_formatted)) ] self._check_expr_cases(cases) def test_literal_in_list(self): cases = [ (L(2).isin([self.table.a, self.table.b, self.table.c]), '2 IN (`a`, `b`, `c`)'), (L(2).notin([self.table.a, self.table.b, self.table.c]), '2 NOT IN (`a`, `b`, `c`)') ] self._check_expr_cases(cases) def test_isin_notin_in_select(self): values = ['foo', 'bar'] values_formatted = tuple(set(values)) filtered = self.table[self.table.g.isin(values)] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE `g` IN {}""" assert result == expected.format(values_formatted) filtered = self.table[self.table.g.notin(values)] result = to_sql(filtered) expected = """SELECT * FROM alltypes WHERE `g` NOT IN {}""" assert result == expected.format(values_formatted)
class TestStringOps(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_lower_upper(self): lresult = self.table.g.lower() uresult = self.table.g.upper() assert isinstance(lresult, ir.StringArray) assert isinstance(uresult, ir.StringArray) assert isinstance(lresult.op(), ops.Lowercase) assert isinstance(uresult.op(), ops.Uppercase) lit = literal('FoO') lresult = lit.lower() uresult = lit.upper() assert isinstance(lresult, ir.StringScalar) assert isinstance(uresult, ir.StringScalar) def test_substr(self): lit = literal('FoO') result = self.table.g.substr(2, 4) lit_result = lit.substr(0, 2) assert isinstance(result, ir.StringArray) assert isinstance(lit_result, ir.StringScalar) op = result.op() assert isinstance(op, ops.Substring) start, length = op.args[1:] assert start.equals(literal(2)) assert length.equals(literal(4)) def test_left_right(self): result = self.table.g.left(5) expected = self.table.g.substr(0, 5) assert result.equals(expected) result = self.table.g.right(5) op = result.op() assert isinstance(op, ops.StrRight) assert op.args[1].equals(literal(5)) def test_length(self): lit = literal('FoO') result = self.table.g.length() lit_result = lit.length() assert isinstance(result, ir.Int32Array) assert isinstance(lit_result, ir.Int32Scalar) assert isinstance(result.op(), ops.StringLength) def test_join(self): dash = literal('-') expr = dash.join([self.table.f.cast('string'), self.table.g]) assert isinstance(expr, ir.StringArray) expr = dash.join([literal('ab'), literal('cd')]) assert isinstance(expr, ir.StringScalar) def test_contains(self): expr = self.table.g.contains('foo') expected = self.table.g.like('%foo%') assert_equal(expr, expected) self.assertRaises(Exception, lambda: 'foo' in self.table.g) def test_getitem_slice(self): cases = [ (self.table.g[:3], self.table.g.substr(0, 3)), (self.table.g[2:6], self.table.g.substr(2, 4)), ] for case, expected in cases: assert_equal(case, expected)
class TestWrapping(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') self.i8 = self.table.tinyint_col self.i16 = self.table.smallint_col self.i32 = self.table.int_col self.i64 = self.table.bigint_col self.d = self.table.double_col self.f = self.table.float_col self.s = self.table.string_col self.b = self.table.bool_col self.t = self.table.timestamp_col self.dec = self.con.table('tpch_customer').c_acctbal self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d, self.f, self.dec, self.s, self.b, self.t] def test_sql_generation(self): func = api.scalar_function(['string'], 'string', name='Tester') func.register('identity', 'udf_testing') result = func('hello world') assert (ibis.impala.compile(result) == "SELECT udf_testing.identity('hello world') AS `tmp`") def test_sql_generation_from_infoclass(self): func = api.wrap_udf('test.so', ['string'], 'string', 'info_test') repr(func) func.register('info_test', 'udf_testing') result = func('hello world') assert (ibis.impala.compile(result) == "SELECT udf_testing.info_test('hello world') AS `tmp`") def test_udf_primitive_output_types(self): types = [ ('boolean', True, self.b), ('int8', 1, self.i8), ('int16', 1, self.i16), ('int32', 1, self.i32), ('int64', 1, self.i64), ('float', 1.0, self.f), ('double', 1.0, self.d), ('string', '1', self.s), ('timestamp', ibis.timestamp('1961-04-10'), self.t) ] for t, sv, av in types: func = self._register_udf([t], t, 'test') ibis_type = validate_type(t) expr = func(sv) assert type(expr) == type(ibis_type.scalar_type()(expr.op())) # noqa: E501, E721 expr = func(av) assert type(expr) == type(ibis_type.array_type()(expr.op())) # noqa: E501, E721 def test_uda_primitive_output_types(self): types = [ ('boolean', True, self.b), ('int8', 1, self.i8), ('int16', 1, self.i16), ('int32', 1, self.i32), ('int64', 1, self.i64), ('float', 1.0, self.f), ('double', 1.0, self.d), ('string', '1', self.s), ('timestamp', ibis.timestamp('1961-04-10'), self.t) ] for t, sv, av in types: func = self._register_uda([t], t, 'test') ibis_type = validate_type(t) expr1 = func(sv) expr2 = func(sv) expected_type1 = type(ibis_type.scalar_type()(expr1.op())) expected_type2 = type(ibis_type.scalar_type()(expr2.op())) assert isinstance(expr1, expected_type1) assert isinstance(expr2, expected_type2) def test_decimal(self): func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test') expr = func(1.0) assert type(expr) == ir.DecimalScalar expr = func(self.dec) assert type(expr) == ir.DecimalColumn def test_udf_invalid_typecasting(self): cases = [ ('int8', self.all_cols[:1], self.all_cols[1:]), ('int16', self.all_cols[:2], self.all_cols[2:]), ('int32', self.all_cols[:3], self.all_cols[3:]), ('int64', self.all_cols[:4], self.all_cols[4:]), ('boolean', [], self.all_cols[:8] + self.all_cols[9:]), # allowing double here for now ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]), ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]), ('string', [], self.all_cols[:7] + self.all_cols[8:]), ('timestamp', [], self.all_cols[:-1]), ('decimal', [], self.all_cols[:4] + self.all_cols[7:]) ] for t, valid_casts, invalid_casts in cases: func = self._register_udf([t], 'int32', 'typecast') for expr in valid_casts: func(expr) for expr in invalid_casts: self.assertRaises(IbisTypeError, func, expr) def test_mult_args(self): func = self._register_udf(['int32', 'double', 'string', 'boolean', 'timestamp'], 'int64', 'mult_types') expr = func(self.i32, self.d, self.s, self.b, self.t) assert issubclass(type(expr), ir.ColumnExpr) expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10')) assert issubclass(type(expr), ir.ScalarExpr) def _register_udf(self, inputs, output, name): func = api.scalar_function(inputs, output, name=name) func.register(name, 'ibis_testing') return func def _register_uda(self, inputs, output, name): func = api.aggregate_function(inputs, output, name=name) func.register(name, 'ibis_testing') return func
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table("functional_alltypes") def test_numeric_unary_builtins(self): # No argument functions functions = [ "abs", "ceil", "floor", "exp", "sqrt", "sign", ("log", "ln"), ("approx_median", "appx_median"), ("approx_nunique", "ndv"), "ln", "log2", "log10", "nullifzero", "zeroifnull", ] cases = [] for what in functions: if isinstance(what, tuple): ibis_name, sql_name = what else: ibis_name = sql_name = what for cname in ["double_col", "int_col"]: expr = getattr(self.table[cname], ibis_name)() cases.append((expr, "{0}({1})".format(sql_name, "`{0}`".format(cname)))) self._check_expr_cases(cases) def test_log_other_bases(self): cases = [(self.table.double_col.log(5), "log(`double_col`, 5)")] self._check_expr_cases(cases) def test_round(self): cases = [ (self.table.double_col.round(), "round(`double_col`)"), (self.table.double_col.round(0), "round(`double_col`, 0)"), (self.table.double_col.round(2), "round(`double_col`, 2)"), (self.table.double_col.round(self.table.tinyint_col), "round(`double_col`, `tinyint_col`)"), ] self._check_expr_cases(cases) def test_hash(self): expr = self.table.int_col.hash() assert isinstance(expr, ir.Int64Array) assert isinstance(self.table.int_col.sum().hash(), ir.Int64Scalar) cases = [(self.table.int_col.hash(), "fnv_hash(`int_col`)")] self._check_expr_cases(cases) def test_reduction_where(self): cond = self.table.bigint_col < 70 c = self.table.double_col tmp = "{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` " "ELSE NULL END)" cases = [ (c.sum(where=cond), tmp.format("sum")), (c.count(where=cond), tmp.format("count")), (c.mean(where=cond), tmp.format("avg")), (c.max(where=cond), tmp.format("max")), (c.min(where=cond), tmp.format("min")), (c.std(where=cond), tmp.format("stddev")), (c.std(where=cond, how="pop"), tmp.format("stddev_pop")), (c.var(where=cond), tmp.format("variance")), (c.var(where=cond, how="pop"), tmp.format("variance_pop")), ] self._check_expr_cases(cases) def test_reduction_invalid_where(self): condbad_literal = L("T") c = self.table.double_col for reduction in [c.sum, c.count, c.mean, c.max, c.min]: with self.assertRaises(TypeError): reduction(where=condbad_literal)
class TestBucketHistogram(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_bucket_to_case(self): buckets = [0, 10, 25, 50] expr1 = self.table.f.bucket(buckets) expected1 = """\ CASE WHEN (`f` >= 0) AND (`f` < 10) THEN 0 WHEN (`f` >= 10) AND (`f` < 25) THEN 1 WHEN (`f` >= 25) AND (`f` <= 50) THEN 2 ELSE NULL END""" expr2 = self.table.f.bucket(buckets, close_extreme=False) expected2 = """\ CASE WHEN (`f` >= 0) AND (`f` < 10) THEN 0 WHEN (`f` >= 10) AND (`f` < 25) THEN 1 WHEN (`f` >= 25) AND (`f` < 50) THEN 2 ELSE NULL END""" expr3 = self.table.f.bucket(buckets, closed='right') expected3 = """\ CASE WHEN (`f` >= 0) AND (`f` <= 10) THEN 0 WHEN (`f` > 10) AND (`f` <= 25) THEN 1 WHEN (`f` > 25) AND (`f` <= 50) THEN 2 ELSE NULL END""" expr4 = self.table.f.bucket(buckets, closed='right', close_extreme=False) expected4 = """\ CASE WHEN (`f` > 0) AND (`f` <= 10) THEN 0 WHEN (`f` > 10) AND (`f` <= 25) THEN 1 WHEN (`f` > 25) AND (`f` <= 50) THEN 2 ELSE NULL END""" expr5 = self.table.f.bucket(buckets, include_under=True) expected5 = """\ CASE WHEN `f` < 0 THEN 0 WHEN (`f` >= 0) AND (`f` < 10) THEN 1 WHEN (`f` >= 10) AND (`f` < 25) THEN 2 WHEN (`f` >= 25) AND (`f` <= 50) THEN 3 ELSE NULL END""" expr6 = self.table.f.bucket(buckets, include_under=True, include_over=True) expected6 = """\ CASE WHEN `f` < 0 THEN 0 WHEN (`f` >= 0) AND (`f` < 10) THEN 1 WHEN (`f` >= 10) AND (`f` < 25) THEN 2 WHEN (`f` >= 25) AND (`f` <= 50) THEN 3 WHEN `f` > 50 THEN 4 ELSE NULL END""" expr7 = self.table.f.bucket(buckets, close_extreme=False, include_under=True, include_over=True) expected7 = """\ CASE WHEN `f` < 0 THEN 0 WHEN (`f` >= 0) AND (`f` < 10) THEN 1 WHEN (`f` >= 10) AND (`f` < 25) THEN 2 WHEN (`f` >= 25) AND (`f` < 50) THEN 3 WHEN `f` >= 50 THEN 4 ELSE NULL END""" expr8 = self.table.f.bucket(buckets, closed='right', close_extreme=False, include_under=True) expected8 = """\ CASE WHEN `f` <= 0 THEN 0 WHEN (`f` > 0) AND (`f` <= 10) THEN 1 WHEN (`f` > 10) AND (`f` <= 25) THEN 2 WHEN (`f` > 25) AND (`f` <= 50) THEN 3 ELSE NULL END""" expr9 = self.table.f.bucket([10], closed='right', include_over=True, include_under=True) expected9 = """\ CASE WHEN `f` <= 10 THEN 0 WHEN `f` > 10 THEN 1 ELSE NULL END""" expr10 = self.table.f.bucket([10], include_over=True, include_under=True) expected10 = """\ CASE WHEN `f` < 10 THEN 0 WHEN `f` >= 10 THEN 1 ELSE NULL END""" cases = [ (expr1, expected1), (expr2, expected2), (expr3, expected3), (expr4, expected4), (expr5, expected5), (expr6, expected6), (expr7, expected7), (expr8, expected8), (expr9, expected9), (expr10, expected10), ] self._check_expr_cases(cases) def test_cast_category_to_int_noop(self): # Because the bucket result is an integer, no explicit cast is # necessary expr = (self.table.f.bucket([10], include_over=True, include_under=True) .cast('int32')) expected = """\ CASE WHEN `f` < 10 THEN 0 WHEN `f` >= 10 THEN 1 ELSE NULL END""" expr2 = (self.table.f.bucket([10], include_over=True, include_under=True) .cast('double')) expected2 = """\ CAST(CASE WHEN `f` < 10 THEN 0 WHEN `f` >= 10 THEN 1 ELSE NULL END AS double)""" self._check_expr_cases([(expr, expected), (expr2, expected2)]) def test_bucket_assign_labels(self): buckets = [0, 10, 25, 50] bucket = self.table.f.bucket(buckets, include_under=True) size = self.table.group_by(bucket.name('tier')).size() labelled = size.tier.label(['Under 0', '0 to 10', '10 to 25', '25 to 50'], nulls='error').name('tier2') expr = size[labelled, size['count']] expected = """\ SELECT CASE `tier` WHEN 0 THEN 'Under 0' WHEN 1 THEN '0 to 10' WHEN 2 THEN '10 to 25' WHEN 3 THEN '25 to 50' ELSE 'error' END AS `tier2`, `count` FROM ( SELECT CASE WHEN `f` < 0 THEN 0 WHEN (`f` >= 0) AND (`f` < 10) THEN 1 WHEN (`f` >= 10) AND (`f` < 25) THEN 2 WHEN (`f` >= 25) AND (`f` <= 50) THEN 3 ELSE NULL END AS `tier`, count(*) AS `count` FROM alltypes GROUP BY 1 ) t0""" result = to_sql(expr) assert result == expected self.assertRaises(ValueError, size.tier.label, ['a', 'b', 'c']) self.assertRaises(ValueError, size.tier.label, ['a', 'b', 'c', 'd', 'e'])
class TestSelectSQL(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_nameless_table(self): # Ensure that user gets some kind of sensible error nameless = api.table([('key', 'string')]) self.assertRaises(com.RelationError, to_sql, nameless) with_name = api.table([('key', 'string')], name='baz') result = to_sql(with_name) assert result == 'SELECT *\nFROM baz' def test_physical_table_reference_translate(self): # If an expression's table leaves all reference database tables, verify # we translate correctly table = self.con.table('alltypes') query = _get_query(table) sql_string = query.compile() expected = "SELECT *\nFROM alltypes" assert sql_string == expected def test_simple_join_formatting(self): t1 = self.con.table('star1') t2 = self.con.table('star2') pred = t1['foo_id'] == t2['foo_id'] pred2 = t1['bar_id'] == t2['foo_id'] cases = [ (t1.inner_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), (t1.left_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 LEFT OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), (t1.outer_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 FULL OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), # multiple predicates (t1.inner_join(t2, [pred, pred2])[[t1]], """SELECT t0.* FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` AND t0.`bar_id` = t1.`foo_id`"""), ] for expr, expected_sql in cases: result_sql = to_sql(expr) assert result_sql == expected_sql def test_multiple_join_cases(self): t1 = self.con.table('star1') t2 = self.con.table('star2') t3 = self.con.table('star3') predA = t1['foo_id'] == t2['foo_id'] predB = t1['bar_id'] == t3['bar_id'] what = (t1.left_join(t2, [predA]).inner_join(t3, [predB]).projection( [t1, t2['value1'], t3['value2']])) result_sql = to_sql(what) expected_sql = """SELECT t0.*, t1.`value1`, t2.`value2` FROM star1 t0 LEFT OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` INNER JOIN star3 t2 ON t0.`bar_id` = t2.`bar_id`""" assert result_sql == expected_sql def test_join_between_joins(self): t1 = api.table([ ('key1', 'string'), ('key2', 'string'), ('value1', 'double'), ], 'first') t2 = api.table([ ('key1', 'string'), ('value2', 'double'), ], 'second') t3 = api.table([ ('key2', 'string'), ('key3', 'string'), ('value3', 'double'), ], 'third') t4 = api.table([('key3', 'string'), ('value4', 'double')], 'fourth') left = t1.inner_join(t2, [('key1', 'key1')])[t1, t2.value2] right = t3.inner_join(t4, [('key3', 'key3')])[t3, t4.value4] joined = left.inner_join(right, [('key2', 'key2')]) # At one point, the expression simplification was resulting in bad refs # here (right.value3 referencing the table inside the right join) exprs = [left, right.value3, right.value4] projected = joined.projection(exprs) result = to_sql(projected) expected = """SELECT t0.*, t1.`value3`, t1.`value4` FROM ( SELECT t2.*, t3.`value2` FROM `first` t2 INNER JOIN second t3 ON t2.`key1` = t3.`key1` ) t0 INNER JOIN ( SELECT t2.*, t3.`value4` FROM third t2 INNER JOIN fourth t3 ON t2.`key3` = t3.`key3` ) t1 ON t0.`key2` = t1.`key2`""" assert result == expected def test_join_just_materialized(self): t1 = self.con.table('tpch_nation') t2 = self.con.table('tpch_region') t3 = self.con.table('tpch_customer') # GH #491 joined = (t1.inner_join(t2, t1.n_regionkey == t2.r_regionkey).inner_join( t3, t1.n_nationkey == t3.c_nationkey)) result = to_sql(joined) expected = """SELECT * FROM tpch_nation t0 INNER JOIN tpch_region t1 ON t0.`n_regionkey` = t1.`r_regionkey` INNER JOIN tpch_customer t2 ON t0.`n_nationkey` = t2.`c_nationkey`""" assert result == expected result = to_sql(joined.materialize()) assert result == expected def test_join_no_predicates_for_impala(self): # Impala requires that joins without predicates be written explicitly # as CROSS JOIN, since result sets can accidentally get too large if a # query is executed before predicates are written t1 = self.con.table('star1') t2 = self.con.table('star2') joined2 = t1.cross_join(t2)[[t1]] expected = """SELECT t0.* FROM star1 t0 CROSS JOIN star2 t1""" result2 = to_sql(joined2) assert result2 == expected for jtype in ['inner_join', 'left_join', 'outer_join']: joined = getattr(t1, jtype)(t2)[[t1]] result = to_sql(joined) assert result == expected def test_semi_anti_joins(self): t1 = self.con.table('star1') t2 = self.con.table('star2') joined = t1.semi_join(t2, [t1.foo_id == t2.foo_id])[[t1]] result = to_sql(joined) expected = """SELECT t0.* FROM star1 t0 LEFT SEMI JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected joined = t1.anti_join(t2, [t1.foo_id == t2.foo_id])[[t1]] result = to_sql(joined) expected = """SELECT t0.* FROM star1 t0 LEFT ANTI JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_self_reference_simple(self): t1 = self.con.table('star1') result_sql = to_sql(t1.view()) expected_sql = "SELECT *\nFROM star1" assert result_sql == expected_sql def test_join_self_reference(self): t1 = self.con.table('star1') t2 = t1.view() result = t1.inner_join(t2, [t1.foo_id == t2.bar_id])[[t1]] result_sql = to_sql(result) expected_sql = """SELECT t0.* FROM star1 t0 INNER JOIN star1 t1 ON t0.`foo_id` = t1.`bar_id`""" assert result_sql == expected_sql def test_join_projection_subquery_broken_alias(self): # From an observed bug, derived from tpch tables geo = (nation.inner_join( region, [('n_regionkey', 'r_regionkey')])[nation.n_nationkey, nation.n_name.name('nation'), region.r_name.name('region')]) expr = (geo.inner_join(customer, [('n_nationkey', 'c_nationkey')])[customer, geo]) result = to_sql(expr) expected = """SELECT t1.*, t0.* FROM ( SELECT t2.`n_nationkey`, t2.`n_name` AS `nation`, t3.`r_name` AS `region` FROM nation t2 INNER JOIN region t3 ON t2.`n_regionkey` = t3.`r_regionkey` ) t0 INNER JOIN customer t1 ON t0.`n_nationkey` = t1.`c_nationkey`""" assert result == expected def test_where_simple_comparisons(self): t1 = self.con.table('star1') what = t1.filter([t1.f > 0, t1.c < t1.f * 2]) result = to_sql(what) expected = """SELECT * FROM star1 WHERE `f` > 0 AND `c` < (`f` * 2)""" assert result == expected def test_where_in_array_literal(self): # e.g. # where string_col in (v1, v2, v3) raise unittest.SkipTest def test_where_with_join(self): t1 = self.con.table('star1') t2 = self.con.table('star2') # This also tests some cases of predicate pushdown what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]).projection( [t1, t2.value1, t2.value3]).filter([t1.f > 0, t2.value3 < 1000])) what2 = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]).filter( [t1.f > 0, t2.value3 < 1000]).projection([t1, t2.value1, t2.value3])) expected_sql = """SELECT t0.*, t1.`value1`, t1.`value3` FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` WHERE t0.`f` > 0 AND t1.`value3` < 1000""" result_sql = to_sql(what) assert result_sql == expected_sql result2_sql = to_sql(what2) assert result2_sql == expected_sql def test_where_no_pushdown_possible(self): t1 = self.con.table('star1') t2 = self.con.table('star2') joined = (t1.inner_join( t2, [t1.foo_id == t2.foo_id])[t1, (t1.f - t2.value1).name('diff')]) filtered = joined[joined.diff > 1] # TODO: I'm not sure if this is exactly what we want expected_sql = """SELECT * FROM ( SELECT t0.*, t0.`f` - t1.`value1` AS `diff` FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` WHERE t0.`f` > 0 AND t1.`value3` < 1000 ) WHERE `diff` > 1""" raise unittest.SkipTest result_sql = to_sql(filtered) assert result_sql == expected_sql def test_where_with_between(self): t = self.con.table('alltypes') what = t.filter([t.a > 0, t.f.between(0, 1)]) result = to_sql(what) expected = """SELECT * FROM alltypes WHERE `a` > 0 AND `f` BETWEEN 0 AND 1""" assert result == expected def test_where_analyze_scalar_op(self): # root cause of #310 table = self.con.table('functional_alltypes') expr = (table.filter([ table.timestamp_col < (ibis.timestamp('2010-01-01') + ibis.month(3)), table.timestamp_col < (ibis.now() + ibis.day(10)) ]).count()) result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND `timestamp_col` < days_add(now(), 10)""" assert result == expected def test_simple_aggregate_query(self): t1 = self.con.table('star1') cases = [(t1.aggregate([t1['f'].sum().name('total')], [t1['foo_id']]), """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1"""), (t1.aggregate([t1['f'].sum().name('total')], ['foo_id', 'bar_id']), """SELECT `foo_id`, `bar_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1, 2""")] for expr, expected_sql in cases: result_sql = to_sql(expr) assert result_sql == expected_sql def test_aggregate_having(self): # Filtering post-aggregation predicate t1 = self.con.table('star1') total = t1.f.sum().name('total') metrics = [total] expr = t1.aggregate(metrics, by=['foo_id'], having=[total > 10]) result = to_sql(expr) expected = """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 HAVING sum(`f`) > 10""" assert result == expected expr = t1.aggregate(metrics, by=['foo_id'], having=[t1.count() > 100]) result = to_sql(expr) expected = """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 HAVING count(*) > 100""" assert result == expected def test_aggregate_table_count_metric(self): expr = self.con.table('star1').count() result = to_sql(expr) expected = """SELECT count(*) AS `tmp` FROM star1""" assert result == expected # count on more complicated table region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') join_expr = region.r_regionkey == nation.n_regionkey joined = region.inner_join(nation, join_expr) table_ref = joined[nation, region.r_name.name('region')] expr = table_ref.count() result = to_sql(expr) expected = """SELECT count(*) AS `tmp` FROM ( SELECT t2.*, t1.`r_name` AS `region` FROM tpch_region t1 INNER JOIN tpch_nation t2 ON t1.`r_regionkey` = t2.`n_regionkey` ) t0""" assert result == expected def test_expr_template_field_name_binding(self): # Given an expression with no concrete links to actual database tables, # indicate a mapping between the distinct unbound table leaves of the # expression and some database tables with compatible schemas but # potentially different column names pass def test_no_aliases_needed(self): table = api.table([('key1', 'string'), ('key2', 'string'), ('value', 'double')]) expr = table.aggregate([table['value'].sum().name('total')], by=['key1', 'key2']) query = _get_query(expr) context = query.context assert not context.need_aliases() def test_table_names_overlap_default_aliases(self): # see discussion in #104; this actually is not needed for query # correctness, and only makes the generated SQL nicer raise unittest.SkipTest t0 = api.table([('key', 'string'), ('v1', 'double')], 't1') t1 = api.table([('key', 'string'), ('v2', 'double')], 't0') expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2] result = to_sql(expr) expected = """\ SELECT t2.`key`, t2.`v1`, t3.`v2` FROM t0 t2 INNER JOIN t1 t3 ON t2.`key` = t3.`key`""" assert result == expected def test_context_aliases_multiple_join(self): t1 = self.con.table('star1') t2 = self.con.table('star2') t3 = self.con.table('star3') expr = (t1.left_join(t2, [t1['foo_id'] == t2['foo_id']]).inner_join( t3, [t1['bar_id'] == t3['bar_id']])[[t1, t2['value1'], t3['value2']]]) query = _get_query(expr) context = query.context assert context.get_alias(t1) == 't0' assert context.get_alias(t2) == 't1' assert context.get_alias(t3) == 't2' def test_fuse_projections(self): table = api.table([('foo', 'int32'), ('bar', 'int64'), ('value', 'double')], name='tbl') # Cases where we project in both cases using the base table reference f1 = (table['foo'] + table['bar']).name('baz') pred = table['value'] > 0 table2 = table[table, f1] table2_filtered = table2[pred] f2 = (table2['foo'] * 2).name('qux') f3 = (table['foo'] * 2).name('qux') table3 = table2.projection([table2, f2]) # fusion works even if there's a filter table3_filtered = table2_filtered.projection([table2, f2]) expected = table[table, f1, f3] expected2 = table[pred][table, f1, f3] assert table3.equals(expected) assert table3_filtered.equals(expected2) ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux` FROM tbl""" ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux` FROM tbl WHERE `value` > 0""" table3_sql = to_sql(table3) table3_filt_sql = to_sql(table3_filtered) assert table3_sql == ex_sql assert table3_filt_sql == ex_sql2 # Use the intermediate table refs table3 = table2.projection([table2, f2]) # fusion works even if there's a filter table3_filtered = table2_filtered.projection([table2, f2]) expected = table[table, f1, f3] expected2 = table[pred][table, f1, f3] assert table3.equals(expected) assert table3_filtered.equals(expected2) def test_bug_project_multiple_times(self): # 108 customer = self.con.table('tpch_customer') nation = self.con.table('tpch_nation') region = self.con.table('tpch_region') joined = (customer.inner_join( nation, [customer.c_nationkey == nation.n_nationkey]).inner_join( region, [nation.n_regionkey == region.r_regionkey])) proj1 = [customer, nation.n_name, region.r_name] step1 = joined[proj1] topk_by = step1.c_acctbal.cast('double').sum() pred = step1.n_name.topk(10, by=topk_by) proj_exprs = [step1.c_name, step1.r_name, step1.n_name] step2 = step1[pred] expr = step2.projection(proj_exprs) # it works! result = to_sql(expr) expected = """\ SELECT `c_name`, `r_name`, `n_name` FROM ( SELECT t1.*, t2.`n_name`, t3.`r_name` FROM tpch_customer t1 INNER JOIN tpch_nation t2 ON t1.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_region t3 ON t2.`n_regionkey` = t3.`r_regionkey` LEFT SEMI JOIN ( SELECT t2.`n_name`, sum(CAST(t1.`c_acctbal` AS double)) AS `sum` FROM tpch_customer t1 INNER JOIN tpch_nation t2 ON t1.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_region t3 ON t2.`n_regionkey` = t3.`r_regionkey` GROUP BY 1 ORDER BY `sum` DESC LIMIT 10 ) t4 ON t2.`n_name` = t4.`n_name` ) t0""" assert result == expected def test_aggregate_projection_subquery(self): t = self.con.table('alltypes') proj = t[t.f > 0][t, (t.a + t.b).name('foo')] def agg(x): return x.aggregate([x.foo.sum().name('foo total')], by=['g']) # predicate gets pushed down filtered = proj[proj.g == 'bar'] result = to_sql(filtered) expected = """SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 AND `g` = 'bar'""" assert result == expected agged = agg(filtered) result = to_sql(agged) expected = """SELECT `g`, sum(`foo`) AS `foo total` FROM ( SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 AND `g` = 'bar' ) t0 GROUP BY 1""" assert result == expected # Pushdown is not possible (in Impala, Postgres, others) agged2 = agg(proj[proj.foo < 10]) result = to_sql(agged2) expected = """SELECT t0.`g`, sum(t0.`foo`) AS `foo total` FROM ( SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 ) t0 WHERE t0.`foo` < 10 GROUP BY 1""" assert result == expected def test_subquery_aliased(self): t1 = self.con.table('star1') t2 = self.con.table('star2') agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id']) what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id])[agged, t2.value1]) result = to_sql(what) expected = """SELECT t0.*, t1.`value1` FROM ( SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 ) t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_double_nested_subquery_no_aliases(self): # We don't require any table aliasing anywhere t = api.table([('key1', 'string'), ('key2', 'string'), ('key3', 'string'), ('value', 'double')], 'foo_table') agg1 = t.aggregate([t.value.sum().name('total')], by=['key1', 'key2', 'key3']) agg2 = agg1.aggregate([agg1.total.sum().name('total')], by=['key1', 'key2']) agg3 = agg2.aggregate([agg2.total.sum().name('total')], by=['key1']) result = to_sql(agg3) expected = """SELECT `key1`, sum(`total`) AS `total` FROM ( SELECT `key1`, `key2`, sum(`total`) AS `total` FROM ( SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total` FROM foo_table GROUP BY 1, 2, 3 ) t1 GROUP BY 1, 2 ) t0 GROUP BY 1""" assert result == expected def test_aggregate_projection_alias_bug(self): # Observed in use t1 = self.con.table('star1') t2 = self.con.table('star2') what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])[[t1, t2.value1]]) what = what.aggregate([what.value1.sum().name('total')], by=[what.foo_id]) # TODO: Not fusing the aggregation with the projection yet result = to_sql(what) expected = """SELECT `foo_id`, sum(`value1`) AS `total` FROM ( SELECT t1.*, t2.`value1` FROM star1 t1 INNER JOIN star2 t2 ON t1.`foo_id` = t2.`foo_id` ) t0 GROUP BY 1""" assert result == expected def test_aggregate_fuse_with_projection(self): # see above test case pass def test_subquery_used_for_self_join(self): # There could be cases that should look in SQL like # WITH t0 as (some subquery) # select ... # from t0 t1 # join t0 t2 # on t1.kind = t2.subkind # ... # However, the Ibis code will simply have an expression (projection or # aggregation, say) built on top of the subquery expression, so we need # to extract the subquery unit (we see that it appears multiple times # in the tree). t = self.con.table('alltypes') agged = t.aggregate([t.f.sum().name('total')], by=['g', 'a', 'b']) view = agged.view() metrics = [(agged.total - view.total).max().name('metric')] reagged = (agged.inner_join(view, [agged.a == view.b]).aggregate( metrics, by=[agged.g])) result = to_sql(reagged) expected = """WITH t0 AS ( SELECT `g`, `a`, `b`, sum(`f`) AS `total` FROM alltypes GROUP BY 1, 2, 3 ) SELECT t0.`g`, max(t0.`total` - t1.`total`) AS `metric` FROM t0 INNER JOIN t0 t1 ON t0.`a` = t1.`b` GROUP BY 1""" assert result == expected def test_subquery_factor_correlated_subquery(self): # #173, #183 and other issues region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') customer = self.con.table('tpch_customer') orders = self.con.table('tpch_orders') fields_of_interest = [ customer, region.r_name.name('region'), orders.o_totalprice.name('amount'), orders.o_orderdate.cast('timestamp').name('odate') ] tpch = (region.join( nation, region.r_regionkey == nation.n_regionkey).join( customer, customer.c_nationkey == nation.n_nationkey).join( orders, orders.o_custkey == customer.c_custkey)[fields_of_interest] ) # Self-reference + correlated subquery complicates things t2 = tpch.view() conditional_avg = t2[t2.region == tpch.region].amount.mean() amount_filter = tpch.amount > conditional_avg expr = tpch[amount_filter].limit(10) result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT t5.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`, CAST(t3.`o_orderdate` AS timestamp) AS `odate` FROM tpch_region t1 INNER JOIN tpch_nation t2 ON t1.`r_regionkey` = t2.`n_regionkey` INNER JOIN tpch_customer t5 ON t5.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_orders t3 ON t3.`o_custkey` = t5.`c_custkey` ) SELECT t0.* FROM t0 WHERE t0.`amount` > ( SELECT avg(t4.`amount`) AS `tmp` FROM t0 t4 WHERE t4.`region` = t0.`region` ) LIMIT 10""" assert result == expected def test_self_join_subquery_distinct_equal(self): region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') j1 = (region.join(nation, region.r_regionkey == nation.n_regionkey)[region, nation]) j2 = (region.join( nation, region.r_regionkey == nation.n_regionkey)[region, nation].view()) expr = (j1.join(j2, j1.r_regionkey == j2.r_regionkey)[j1.r_name, j2.n_name]) result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT t2.*, t3.* FROM tpch_region t2 INNER JOIN tpch_nation t3 ON t2.`r_regionkey` = t3.`n_regionkey` ) SELECT t0.`r_name`, t1.`n_name` FROM t0 INNER JOIN t0 t1 ON t0.`r_regionkey` = t1.`r_regionkey`""" assert result == expected def test_limit_with_self_join(self): t = self.con.table('functional_alltypes') t2 = t.view() expr = t.join(t2, t.tinyint_col < t2.timestamp_col.minute()).count() # it works result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes t0 INNER JOIN functional_alltypes t1 ON t0.`tinyint_col` < extract(t1.`timestamp_col`, 'minute')""" assert result == expected def test_cte_factor_distinct_but_equal(self): t = self.con.table('alltypes') tt = self.con.table('alltypes') expr1 = t.group_by('g').aggregate(t.f.sum().name('metric')) expr2 = tt.group_by('g').aggregate(tt.f.sum().name('metric')).view() expr = expr1.join(expr2, expr1.g == expr2.g)[[expr1]] result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT `g`, sum(`f`) AS `metric` FROM alltypes GROUP BY 1 ) SELECT t0.* FROM t0 INNER JOIN t0 t1 ON t0.`g` = t1.`g`""" assert result == expected def test_tpch_self_join_failure(self): # duplicating the integration test here region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') customer = self.con.table('tpch_customer') orders = self.con.table('tpch_orders') fields_of_interest = [ region.r_name.name('region'), nation.n_name.name('nation'), orders.o_totalprice.name('amount'), orders.o_orderdate.cast('timestamp').name('odate') ] joined_all = (region.join( nation, region.r_regionkey == nation.n_regionkey).join( customer, customer.c_nationkey == nation.n_nationkey).join( orders, orders.o_custkey == customer.c_custkey)[fields_of_interest] ) year = joined_all.odate.year().name('year') total = joined_all.amount.sum().cast('double').name('total') annual_amounts = (joined_all.group_by(['region', year]).aggregate(total)) current = annual_amounts prior = annual_amounts.view() yoy_change = (current.total - prior.total).name('yoy_change') yoy = (current.join(prior, current.year == (prior.year - 1))[current.region, current.year, yoy_change]) to_sql(yoy) def test_extract_subquery_nested_lower(self): # We may have a join between two tables requiring subqueries, and # buried inside these there may be a common subquery. Let's test that # we find it and pull it out to the top level to avoid repeating # ourselves. pass def test_subquery_in_filter_predicate(self): # E.g. comparing against some scalar aggregate value. See Ibis #43 t1 = self.con.table('star1') pred = t1.f > t1.f.mean() expr = t1[pred] # This brought out another expression rewriting bug, since the filtered # table isn't found elsewhere in the expression. pred2 = t1.f > t1[t1.foo_id == 'foo'].f.mean() expr2 = t1[pred2] result = to_sql(expr) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT avg(`f`) AS `tmp` FROM star1 )""" assert result == expected result = to_sql(expr2) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT avg(`f`) AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected def test_filter_subquery_derived_reduction(self): t1 = self.con.table('star1') # Reduction can be nested inside some scalar expression pred3 = t1.f > t1[t1.foo_id == 'foo'].f.mean().log() pred4 = t1.f > (t1[t1.foo_id == 'foo'].f.mean().log() + 1) expr3 = t1[pred3] result = to_sql(expr3) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT ln(avg(`f`)) AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected expr4 = t1[pred4] result = to_sql(expr4) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT ln(avg(`f`)) + 1 AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected def test_topk_operation_to_semi_join(self): # TODO: top K with filter in place table = api.table([ ('foo', 'string'), ('bar', 'string'), ('city', 'string'), ('v1', 'double'), ('v2', 'double'), ], 'tbl') what = table.city.topk(10, by=table.v2.mean()) filtered = table[what] query = to_sql(filtered) expected = """SELECT t0.* FROM tbl t0 LEFT SEMI JOIN ( SELECT `city`, avg(`v2`) AS `mean` FROM tbl GROUP BY 1 ORDER BY `mean` DESC LIMIT 10 ) t1 ON t0.`city` = t1.`city`""" assert query == expected # Test the default metric (count) what = table.city.topk(10) filtered2 = table[what] query = to_sql(filtered2) expected = """SELECT t0.* FROM tbl t0 LEFT SEMI JOIN ( SELECT `city`, count(`city`) AS `count` FROM tbl GROUP BY 1 ORDER BY `count` DESC LIMIT 10 ) t1 ON t0.`city` = t1.`city`""" assert query == expected def test_topk_predicate_pushdown_bug(self): # Observed on TPCH data cplusgeo = (customer.inner_join( nation, [customer.c_nationkey == nation.n_nationkey]).inner_join( region, [nation.n_regionkey == region.r_regionkey])[customer, nation.n_name, region.r_name]) pred = cplusgeo.n_name.topk(10, by=cplusgeo.c_acctbal.sum()) expr = cplusgeo.filter([pred]) result = to_sql(expr) expected = """\ SELECT t0.*, t1.`n_name`, t2.`r_name` FROM customer t0 INNER JOIN nation t1 ON t0.`c_nationkey` = t1.`n_nationkey` INNER JOIN region t2 ON t1.`n_regionkey` = t2.`r_regionkey` LEFT SEMI JOIN ( SELECT t1.`n_name`, sum(t0.`c_acctbal`) AS `sum` FROM customer t0 INNER JOIN nation t1 ON t0.`c_nationkey` = t1.`n_nationkey` INNER JOIN region t2 ON t1.`n_regionkey` = t2.`r_regionkey` GROUP BY 1 ORDER BY `sum` DESC LIMIT 10 ) t3 ON t1.`n_name` = t3.`n_name`""" assert result == expected def test_topk_analysis_bug(self): # GH #398 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) expr = t[delay_filter].group_by('origin').size() result = to_sql(expr) expected = """\ SELECT t0.`origin`, count(*) AS `count` FROM airlines t0 LEFT SEMI JOIN ( SELECT `dest`, avg(`arrdelay`) AS `mean` FROM airlines WHERE `dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1 ORDER BY `mean` DESC LIMIT 10 ) t1 ON t0.`dest` = t1.`dest` WHERE t0.`dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1""" assert result == expected def test_topk_to_aggregate(self): t = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') top = t.dest.topk(10, by=t.arrdelay.mean()) result = to_sql(top) expected = to_sql(top.to_aggregation()) assert result == expected def test_bottomk(self): pass def test_topk_antijoin(self): # Get the "other" category somehow pass def test_case_in_projection(self): t = self.con.table('alltypes') expr = (t.g.case().when('foo', 'bar').when('baz', 'qux').else_('default').end()) expr2 = (api.case().when(t.g == 'foo', 'bar').when(t.g == 'baz', t.g).end()) proj = t[expr.name('col1'), expr2.name('col2'), t] result = to_sql(proj) expected = """SELECT CASE `g` WHEN 'foo' THEN 'bar' WHEN 'baz' THEN 'qux' ELSE 'default' END AS `col1`, CASE WHEN `g` = 'foo' THEN 'bar' WHEN `g` = 'baz' THEN `g` ELSE NULL END AS `col2`, * FROM alltypes""" assert result == expected def test_identifier_quoting(self): data = api.table([('date', 'int32'), ('explain', 'string')], 'table') expr = data[data.date.name('else'), data.explain.name('join')] result = to_sql(expr) expected = """SELECT `date` AS `else`, `explain` AS `join` FROM `table`""" assert result == expected
class TestASTBuilder(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_ast_with_projection_join_filter(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) result = joined[[table3, table2['value']]] ast = build_ast(result) stmt = ast.queries[0] def foo(): table3 = table[filter_pred] joined = table2.inner_join(table3, [join_pred]) result = joined[[table3, table2['value']]] return result assert len(stmt.select_set) == 2 assert len(stmt.where) == 1 assert stmt.where[0] is filter_pred # Check that the join has been rebuilt to only include the root tables tbl = stmt.table_set tbl_node = tbl.op() assert isinstance(tbl_node, ops.InnerJoin) assert tbl_node.left is table2 assert tbl_node.right is table # table expression substitution has been made in the predicate assert tbl_node.predicates[0].equals(table['g'] == table2['key']) def test_ast_with_aggregation_join_filter(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) met1 = (table3['f'] - table2['value']).mean().name('foo') result = joined.aggregate([met1, table3['f'].sum().name('bar')], by=[table3['g'], table2['key']]) ast = build_ast(result) stmt = ast.queries[0] # hoisted metrics ex_metrics = [(table['f'] - table2['value']).mean().name('foo'), table['f'].sum().name('bar')] ex_by = [table['g'], table2['key']] # hoisted join and aggregate expected_table_set = \ table2.inner_join(table, [table['g'] == table2['key']]) assert stmt.table_set.equals(expected_table_set) # Check various exprs for res, ex in zip(stmt.select_set, ex_by + ex_metrics): assert res.equals(ex) for res, ex in zip(stmt.group_by, ex_by): assert stmt.select_set[res].equals(ex) # Check we got the filter assert len(stmt.where) == 1 assert stmt.where[0].equals(filter_pred) def test_sort_by(self): table = self.con.table('star1') what = table.sort_by('f') result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `f`""" assert result == expected what = table.sort_by(('f', 0)) result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `f` DESC""" assert result == expected what = table.sort_by(['c', ('f', 0)]) result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `c`, `f` DESC""" assert result == expected def test_limit(self): table = self.con.table('star1').limit(10) result = to_sql(table) expected = """SELECT * FROM star1 LIMIT 10""" assert result == expected table = self.con.table('star1').limit(10, offset=5) result = to_sql(table) expected = """SELECT * FROM star1 LIMIT 10 OFFSET 5""" assert result == expected # Put the limit in a couple places in the stack table = self.con.table('star1') table = table[table.f > 0].limit(10) result = to_sql(table) expected = """SELECT * FROM star1 WHERE `f` > 0 LIMIT 10""" assert result == expected table = self.con.table('star1') # Semantically, this should produce a subquery table = table.limit(10) table = table[table.f > 0] result2 = to_sql(table) expected2 = """SELECT * FROM ( SELECT * FROM star1 LIMIT 10 ) t0 WHERE `f` > 0""" assert result2 == expected2 def test_join_with_limited_table(self): t1 = self.con.table('star1') t2 = self.con.table('star2') limited = t1.limit(100) joined = (limited.inner_join(t2, [limited.foo_id == t2.foo_id]) [[limited]]) result = to_sql(joined) expected = """SELECT t0.* FROM ( SELECT * FROM star1 LIMIT 100 ) t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_sort_by_on_limit_yield_subquery(self): # x.limit(...).sort_by(...) # is semantically different from # x.sort_by(...).limit(...) # and will often yield different results t = self.con.table('functional_alltypes') expr = (t.group_by('string_col') .aggregate([t.count().name('nrows')]) .limit(5) .sort_by('string_col')) result = to_sql(expr) expected = """SELECT * FROM ( SELECT `string_col`, count(*) AS `nrows` FROM functional_alltypes GROUP BY 1 LIMIT 5 ) t0 ORDER BY `string_col`""" assert result == expected def test_multiple_limits(self): t = self.con.table('functional_alltypes') expr = t.limit(20).limit(10) stmt = build_ast(expr).queries[0] assert stmt.limit['n'] == 10 def test_top_convenience(self): # x.top(10, by=field) # x.top(10, by=[field1, field2]) pass def test_self_aggregate_in_predicate(self): # Per ibis #43 pass
class TestWrapping(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') self.i8 = self.table.tinyint_col self.i16 = self.table.smallint_col self.i32 = self.table.int_col self.i64 = self.table.bigint_col self.d = self.table.double_col self.f = self.table.float_col self.s = self.table.string_col self.b = self.table.bool_col self.t = self.table.timestamp_col self.dec = self.con.table('tpch_customer').c_acctbal self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d, self.f, self.dec, self.s, self.b, self.t] def test_sql_generation(self): func = api.scalar_function(['string'], 'string', name='Tester') func.register('identity', 'udf_testing') result = func('hello world') assert result == "SELECT udf_testing.identity('hello world')" def test_sql_generation_from_infoclass(self): func = api.wrap_udf('test.so', ['string'], 'string', 'info_test') repr(func) func.register('info_test', 'udf_testing') result = func('hello world') assert result == "SELECT udf_testing.info_test('hello world')" def test_udf_primitive_output_types(self): types = [ ('boolean', True, self.b), ('int8', 1, self.i8), ('int16', 1, self.i16), ('int32', 1, self.i32), ('int64', 1, self.i64), ('float', 1.0, self.f), ('double', 1.0, self.d), ('string', '1', self.s), ('timestamp', ibis.timestamp('1961-04-10'), self.t) ] for t, sv, av in types: func = self._register_udf([t], t, 'test') ibis_type = validate_type(t) expr = func(sv) assert type(expr) == ibis_type.scalar_type() expr = func(av) assert type(expr) == ibis_type.array_type() def test_uda_primitive_output_types(self): types = [ ('boolean', True, self.b), ('int8', 1, self.i8), ('int16', 1, self.i16), ('int32', 1, self.i32), ('int64', 1, self.i64), ('float', 1.0, self.f), ('double', 1.0, self.d), ('string', '1', self.s), ('timestamp', ibis.timestamp('1961-04-10'), self.t) ] for t, sv, av in types: func = self._register_uda([t], t, 'test') ibis_type = validate_type(t) expr1 = func(sv) expr2 = func(sv) assert isinstance(expr1, ibis_type.scalar_type()) assert isinstance(expr2, ibis_type.scalar_type()) def test_decimal(self): func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test') expr = func(1.0) assert type(expr) == ir.DecimalScalar expr = func(self.dec) assert type(expr) == ir.DecimalArray def test_udf_invalid_typecasting(self): cases = [ ('int8', self.all_cols[:1], self.all_cols[1:]), ('int16', self.all_cols[:2], self.all_cols[2:]), ('int32', self.all_cols[:3], self.all_cols[3:]), ('int64', self.all_cols[:4], self.all_cols[4:]), ('boolean', [], self.all_cols[:8] + self.all_cols[9:]), # allowing double here for now ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]), ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]), ('string', [], self.all_cols[:7] + self.all_cols[8:]), ('timestamp', [], self.all_cols[:-1]), ('decimal', [], self.all_cols[:4] + self.all_cols[7:]) ] for t, valid_casts, invalid_casts in cases: func = self._register_udf([t], 'int32', 'typecast') for expr in valid_casts: func(expr) for expr in invalid_casts: self.assertRaises(IbisTypeError, func, expr) def test_mult_args(self): func = self._register_udf(['int32', 'double', 'string', 'boolean', 'timestamp'], 'int64', 'mult_types') expr = func(self.i32, self.d, self.s, self.b, self.t) assert issubclass(type(expr), ir.ArrayExpr) expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10')) assert issubclass(type(expr), ir.ScalarExpr) def _register_udf(self, inputs, output, name): func = api.scalar_function(inputs, output, name=name) func.register(name, 'ibis_testing') return func def _register_uda(self, inputs, output, name): func = api.aggregate_function(inputs, output, name=name) func.register(name, 'ibis_testing') return func
class TestDecimal(unittest.TestCase): def setUp(self): self.con = MockConnection() self.lineitem = self.con.table('tpch_lineitem') def test_type_metadata(self): col = self.lineitem.l_extendedprice assert isinstance(col, ir.DecimalArray) assert col._precision == 12 assert col._scale == 2 def test_cast_scalar_to_decimal(self): val = api.literal('1.2345') casted = val.cast('decimal(15,5)') assert isinstance(casted, ir.DecimalScalar) assert casted._precision == 15 assert casted._scale == 5 def test_decimal_aggregate_function_behavior(self): # From the Impala documentation: "The result of an aggregate function # such as MAX(), SUM(), or AVG() on DECIMAL values is promoted to a # scale of 38, with the same precision as the underlying column. Thus, # the result can represent the largest possible value at that # particular precision." col = self.lineitem.l_extendedprice functions = ['sum', 'mean', 'max', 'min'] for func_name in functions: result = getattr(col, func_name)() assert isinstance(result, ir.DecimalScalar) assert result._precision == col._precision assert result._scale == 38 def test_where(self): table = self.lineitem q = table.l_quantity expr = api.where(table.l_discount > 0, q * table.l_discount, api.null) assert isinstance(expr, ir.DecimalArray) expr = api.where(table.l_discount > 0, (q * table.l_discount).sum(), api.null) assert isinstance(expr, ir.DecimalArray) expr = api.where(table.l_discount.sum() > 0, (q * table.l_discount).sum(), api.null) assert isinstance(expr, ir.DecimalScalar) def test_fillna(self): expr = self.lineitem.l_extendedprice.fillna(0) assert isinstance(expr, ir.DecimalArray) expr = self.lineitem.l_extendedprice.fillna(self.lineitem.l_quantity) assert isinstance(expr, ir.DecimalArray) def test_precision_scale(self): col = self.lineitem.l_extendedprice p = col.precision() s = col.scale() assert isinstance(p, ir.IntegerValue) assert isinstance(p.op(), ops.DecimalPrecision) assert isinstance(s, ir.IntegerValue) assert isinstance(s.op(), ops.DecimalScale) def test_invalid_precision_scale_combo(self): pass
class TestCreateTable(unittest.TestCase): def setUp(self): self.con = MockConnection() self.t = t = self.con.table('functional_alltypes') self.expr = t[t.bigint_col > 0] def test_create_external_table_as(self): path = '/path/to/table' select = build_ast(self.con.table('test1')).queries[0] statement = ddl.CTAS('another_table', select, external=True, can_exist=False, path=path, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE foo.`another_table` STORED AS PARQUET LOCATION '{0}' AS SELECT * FROM test1""".format(path) assert result == expected def test_create_table_with_location(self): path = '/path/to/table' schema = ibis.schema([('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]) statement = ddl.CreateTableWithSchema('another_table', schema, ddl.NoFormat(), can_exist=False, path=path, database='foo') result = statement.compile() expected = """\ CREATE TABLE foo.`another_table` (`foo` string, `bar` tinyint, `baz` smallint) LOCATION '{0}'""".format(path) assert result == expected def test_create_table_like_parquet(self): directory = '/path/to/' path = '/path/to/parquetfile' statement = ddl.CreateTableParquet('new_table', directory, example_file=path, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE PARQUET '{0}' STORED AS PARQUET LOCATION '{1}'""".format(path, directory) assert result == expected def test_create_table_parquet_like_other(self): # alternative to "LIKE PARQUET" directory = '/path/to/' example_table = 'db.other' statement = ddl.CreateTableParquet('new_table', directory, example_table=example_table, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE {0} STORED AS PARQUET LOCATION '{1}'""".format(example_table, directory) assert result == expected def test_create_table_parquet_with_schema(self): directory = '/path/to/' schema = ibis.schema([('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]) statement = ddl.CreateTableParquet('new_table', directory, schema=schema, external=True, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`foo` string, `bar` tinyint, `baz` smallint) STORED AS PARQUET LOCATION '{0}'""".format(directory) assert result == expected def test_create_table_delimited(self): path = '/path/to/files/' schema = ibis.schema([('a', 'string'), ('b', 'int32'), ('c', 'double'), ('d', 'decimal(12,2)')]) stmt = ddl.CreateTableDelimited('new_table', path, schema, delimiter='|', escapechar='\\', lineterminator='\0', database='foo', can_exist=True) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`a` string, `b` int, `c` double, `d` decimal(12,2)) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\' LINES TERMINATED BY '\0' LOCATION '{0}'""".format(path) assert result == expected def test_create_external_table_avro(self): path = '/path/to/files/' avro_schema = { 'fields': [{ 'name': 'a', 'type': 'string' }, { 'name': 'b', 'type': 'int' }, { 'name': 'c', 'type': 'double' }, { "type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2, 'name': 'd' }], 'name': 'my_record', 'type': 'record' } stmt = ddl.CreateTableAvro('new_table', path, avro_schema, database='foo', can_exist=True) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` STORED AS AVRO LOCATION '%s' TBLPROPERTIES ( 'avro.schema.literal'='{ "fields": [ { "name": "a", "type": "string" }, { "name": "b", "type": "int" }, { "name": "c", "type": "double" }, { "logicalType": "decimal", "name": "d", "precision": 4, "scale": 2, "type": "bytes" } ], "name": "my_record", "type": "record" }' )""" % path assert result == expected def test_create_table_parquet(self): statement = _create_table('some_table', self.expr, database='bar', can_exist=False) result = statement.compile() expected = """\ CREATE TABLE bar.`some_table` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_no_overwrite(self): statement = _create_table('tname', self.expr, can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_avro_other_formats(self): statement = _create_table('tname', self.t, format='avro', can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS AVRO AS SELECT * FROM functional_alltypes""" assert result == expected self.assertRaises(ValueError, _create_table, 'tname', self.t, format='foo') def test_partition_by(self): pass
class TestStringBuiltins(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_unary_ops(self): s = self.table.string_col cases = [ (s.lower(), 'lower(`string_col`)'), (s.upper(), 'upper(`string_col`)'), (s.reverse(), 'reverse(`string_col`)'), (s.strip(), 'trim(`string_col`)'), (s.lstrip(), 'ltrim(`string_col`)'), (s.rstrip(), 'rtrim(`string_col`)'), (s.capitalize(), 'initcap(`string_col`)'), (s.length(), 'length(`string_col`)'), (s.ascii_str(), 'ascii(`string_col`)') ] self._check_expr_cases(cases) def test_substr(self): # Database numbers starting from 1 cases = [ (self.table.string_col.substr(2), 'substr(`string_col`, 2 + 1)'), (self.table.string_col.substr(0, 3), 'substr(`string_col`, 0 + 1, 3)') ] self._check_expr_cases(cases) def test_strright(self): cases = [ (self.table.string_col.right(4), 'strright(`string_col`, 4)') ] self._check_expr_cases(cases) def test_like(self): cases = [ (self.table.string_col.like('foo%'), "`string_col` LIKE 'foo%'") ] self._check_expr_cases(cases) def test_rlike(self): ex = "`string_col` RLIKE '[\d]+'" cases = [ (self.table.string_col.rlike('[\d]+'), ex), (self.table.string_col.re_search('[\d]+'), ex), ] self._check_expr_cases(cases) def test_re_extract(self): sql = "regexp_extract(`string_col`, '[\d]+', 0)" cases = [ (self.table.string_col.re_extract('[\d]+', 0), sql) ] self._check_expr_cases(cases) def test_re_replace(self): sql = "regexp_replace(`string_col`, '[\d]+', 'aaa')" cases = [ (self.table.string_col.re_replace('[\d]+', 'aaa'), sql) ] self._check_expr_cases(cases) def test_parse_url(self): sql = "parse_url(`string_col`, 'HOST')" cases = [ (self.table.string_col.parse_url('HOST'), sql) ] self._check_expr_cases(cases) def test_repeat(self): cases = [ (self.table.string_col.repeat(2), 'repeat(`string_col`, 2)') ] self._check_expr_cases(cases) def test_translate(self): cases = [ (self.table.string_col.translate('a', 'b'), "translate(`string_col`, 'a', 'b')") ] self._check_expr_cases(cases) def test_find(self): s = self.table.string_col i1 = self.table.tinyint_col cases = [ (s.find('a'), "locate('a', `string_col`) - 1"), (s.find('a', 2), "locate('a', `string_col`, 3) - 1"), (s.find('a', start=i1), "locate('a', `string_col`, `tinyint_col` + 1) - 1") ] self._check_expr_cases(cases) def test_lpad(self): cases = [ (self.table.string_col.lpad(1, 'a'), "lpad(`string_col`, 1, 'a')"), (self.table.string_col.lpad(25), "lpad(`string_col`, 25, ' ')") ] self._check_expr_cases(cases) def test_rpad(self): cases = [ (self.table.string_col.rpad(1, 'a'), "rpad(`string_col`, 1, 'a')"), (self.table.string_col.rpad(25), "rpad(`string_col`, 25, ' ')") ] self._check_expr_cases(cases) def test_find_in_set(self): cases = [ (self.table.string_col.find_in_set(['a']), "find_in_set(`string_col`, 'a') - 1"), (self.table.string_col.find_in_set(['a', 'b']), "find_in_set(`string_col`, 'a,b') - 1") ] self._check_expr_cases(cases) def test_string_join(self): cases = [ (L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')") ] self._check_expr_cases(cases)
class TestInsertLoadData(unittest.TestCase): def setUp(self): self.con = MockConnection() self.t = self.con.table('functional_alltypes') def test_select_basics(self): name = 'testing123456' expr = self.t.limit(10) select, _ = _get_select(expr) stmt = ddl.InsertSelect(name, select, database='foo') result = stmt.compile() expected = """\ INSERT INTO foo.`testing123456` SELECT * FROM functional_alltypes LIMIT 10""" assert result == expected stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True) result = stmt.compile() expected = """\ INSERT OVERWRITE foo.`testing123456` SELECT * FROM functional_alltypes LIMIT 10""" assert result == expected def test_load_data_unpartitioned(self): path = '/path/to/data' stmt = ddl.LoadData('functional_alltypes', path, database='foo') result = stmt.compile() expected = ("LOAD DATA INPATH '/path/to/data' " "INTO TABLE foo.`functional_alltypes`") assert result == expected stmt.overwrite = True result = stmt.compile() expected = ("LOAD DATA INPATH '/path/to/data' " "OVERWRITE INTO TABLE foo.`functional_alltypes`") assert result == expected def test_load_data_partitioned(self): path = '/path/to/data' part = {'year': 2007, 'month': 7} part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')]) stmt = ddl.LoadData('functional_alltypes', path, database='foo', partition=part, partition_schema=part_schema) result = stmt.compile() expected = """\ LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes` PARTITION (year=2007, month=7)""" assert result == expected stmt.overwrite = True result = stmt.compile() expected = """\ LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes` PARTITION (year=2007, month=7)""" assert result == expected def test_select_overwrite(self): pass
class TestAnalytics(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('functional_alltypes') def test_category_project(self): t = self.alltypes tier = t.double_col.bucket([0, 50, 100]).name('tier') expr = t[tier, t] assert isinstance(expr.tier, ir.CategoryArray) def test_bucket(self): d = self.alltypes.double_col bins = [0, 10, 50, 100] expr = d.bucket(bins) assert isinstance(expr, ir.CategoryArray) assert expr.op().nbuckets == 3 expr = d.bucket(bins, include_over=True) assert expr.op().nbuckets == 4 expr = d.bucket(bins, include_over=True, include_under=True) assert expr.op().nbuckets == 5 def test_bucket_error_cases(self): d = self.alltypes.double_col self.assertRaises(ValueError, d.bucket, []) self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo') # it works! d.bucket([10], include_under=True, include_over=True) self.assertRaises(ValueError, d.bucket, [10]) self.assertRaises(ValueError, d.bucket, [10], include_under=True) self.assertRaises(ValueError, d.bucket, [10], include_over=True) def test_histogram(self): d = self.alltypes.double_col self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5) self.assertRaises(ValueError, d.histogram) self.assertRaises(ValueError, d.histogram, 10, closed='foo') def test_topk_analysis_bug(self): # GH #398 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.origin.topk(10, by=t.arrdelay.mean()) filtered = t.filter([delay_filter]) post_pred = filtered.op().predicates[1] assert delay_filter.to_filter().equals(post_pred) def test_topk_function_late_bind(self): # GH #520 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') expr1 = airlines.dest.topk(5, by=lambda x: x.arrdelay.mean()) expr2 = airlines.dest.topk(5, by=airlines.arrdelay.mean()) assert_equal(expr1.to_aggregation(), expr2.to_aggregation())
class UDFTest(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table("functional_alltypes") self.i8 = self.table.tinyint_col self.i16 = self.table.smallint_col self.i32 = self.table.int_col self.i64 = self.table.bigint_col self.d = self.table.double_col self.f = self.table.float_col self.s = self.table.string_col self.b = self.table.bool_col self.t = self.table.timestamp_col self.dec = self.con.table("tpch_customer").c_acctbal self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d, self.f, self.dec, self.s, self.b, self.t] def test_sql_generation(self): op = udf.scalar_function(["string"], "string", name="Tester") udf.add_operation(op, "identity", "udf_testing") def _identity_test(value): return op(value).to_expr() result = _identity_test("hello world") assert result == "SELECT udf_testing.identity('hello world')" def test_sql_generation_from_infoclass(self): udf_info = udf.UDFCreator("test.so", ["string"], "string", "info_test") repr(udf_info) op = udf_info.to_operation() udf.add_operation(op, "info_test", "udf_testing") assert op in _operation_registry def _infoclass_test(value): return op(value).to_expr() result = _infoclass_test("hello world") assert result == "SELECT udf_testing.info_test('hello world')" def test_boolean(self): func = self._udf_registration_single_input("boolean", "boolean", "test") expr = func(True) assert type(expr) == ir.BooleanScalar expr = func(self.b) assert type(expr) == ir.BooleanArray def test_tinyint(self): func = self._udf_registration_single_input("int8", "int8", "test") expr = func(1) assert type(expr) == ir.Int8Scalar expr = func(self.i8) assert type(expr) == ir.Int8Array def test_smallint(self): func = self._udf_registration_single_input("int16", "int16", "test") expr = func(1) assert type(expr) == ir.Int16Scalar expr = func(self.i16) assert type(expr) == ir.Int16Array def test_int(self): func = self._udf_registration_single_input("int32", "int32", "test") expr = func(1) assert type(expr) == ir.Int32Scalar expr = func(self.i32) assert type(expr) == ir.Int32Array def test_bigint(self): func = self._udf_registration_single_input("int64", "int64", "test") expr = func(1) assert type(expr) == ir.Int64Scalar expr = func(self.i64) assert type(expr) == ir.Int64Array def test_float(self): func = self._udf_registration_single_input("float", "float", "test") expr = func(1.0) assert type(expr) == ir.FloatScalar expr = func(self.f) assert type(expr) == ir.FloatArray def test_double(self): func = self._udf_registration_single_input("double", "double", "test") expr = func(1.0) assert type(expr) == ir.DoubleScalar expr = func(self.d) assert type(expr) == ir.DoubleArray def test_decimal(self): func = self._udf_registration_single_input("decimal(9,0)", "decimal(9,0)", "test") expr = func(1.0) assert type(expr) == ir.DecimalScalar expr = func(self.dec) assert type(expr) == ir.DecimalArray def test_string(self): func = self._udf_registration_single_input("string", "string", "test") expr = func("1") assert type(expr) == ir.StringScalar expr = func(self.s) assert type(expr) == ir.StringArray def test_timestamp(self): func = self._udf_registration_single_input("timestamp", "timestamp", "test") expr = func(ibis.timestamp("1961-04-10")) assert type(expr) == ir.TimestampScalar expr = func(self.t) assert type(expr) == ir.TimestampArray def test_invalid_typecasting_tinyint(self): self._invalid_typecasts("int8", self.all_cols[1:]) def test_invalid_typecasting_smallint(self): self._invalid_typecasts("int16", self.all_cols[2:]) def test_invalid_typecasting_int(self): self._invalid_typecasts("int32", self.all_cols[3:]) def test_invalid_typecasting_bigint(self): self._invalid_typecasts("int64", self.all_cols[4:]) def test_invalid_typecasting_boolean(self): self._invalid_typecasts("boolean", self.all_cols[:8] + self.all_cols[9:]) def test_invalid_typecasting_float(self): self._invalid_typecasts("float", self.all_cols[:4] + self.all_cols[6:]) def test_invalid_typecasting_double(self): self._invalid_typecasts("double", self.all_cols[:4] + self.all_cols[6:]) def test_invalid_typecasting_string(self): self._invalid_typecasts("string", self.all_cols[:7] + self.all_cols[8:]) def test_invalid_typecasting_timestamp(self): self._invalid_typecasts("timestamp", self.all_cols[:-1]) def test_invalid_typecasting_decimal(self): self._invalid_typecasts("decimal", self.all_cols[:4] + self.all_cols[7:]) def test_mult_args(self): op = self._udf_registration(["int32", "double", "string", "boolean", "timestamp"], "int64", "mult_types") def _func(integer, double, string, boolean, timestamp): return op(integer, double, string, boolean, timestamp).to_expr() expr = _func(self.i32, self.d, self.s, self.b, self.t) assert issubclass(type(expr), ir.ArrayExpr) expr = _func(1, 1.0, "a", True, ibis.timestamp("1961-04-10")) assert issubclass(type(expr), ir.ScalarExpr) def _udf_registration_single_input(self, inputs, output, name): op = self._udf_registration([inputs], output, name) def _test_func(value): return op(value).to_expr() return _test_func def _udf_registration(self, inputs, output, name): op = udf.scalar_function(inputs, output, name=name) assert issubclass(op, ValueOp) udf.add_operation(op, name, "ibis_testing") return op def _invalid_typecasts(self, inputs, invalid_casts): func = self._udf_registration_single_input(inputs, "int32", "typecast") for in_type in invalid_casts: self.assertRaises(IbisTypeError, func, in_type)
class TestFixedOffsets(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_upconvert(self): cases = [ (T.day(14), 'w', T.week(2)), (T.hour(72), 'd', T.day(3)), (T.minute(240), 'h', T.hour(4)), (T.second(360), 'm', T.minute(6)), (T.second(3 * 86400), 'd', T.day(3)), (T.millisecond(5000), 's', T.second(5)), (T.microsecond(5000000), 's', T.second(5)), (T.nanosecond(5000000000), 's', T.second(5)), ] for offset, unit, expected in cases: result = offset.to_unit(unit) assert result.equals(expected) def test_multiply(self): offset = T.day(2) assert (offset * 2).equals(T.day(4)) assert (offset * (-2)).equals(T.day(-4)) assert (3 * offset).equals(T.day(6)) assert ((-3) * offset).equals(T.day(-6)) def test_repr(self): assert repr(T.day()) == '<Timedelta: 1 day>' assert repr(T.day(2)) == '<Timedelta: 2 days>' assert repr(T.year()) == '<Timedelta: 1 year>' assert repr(T.month(2)) == '<Timedelta: 2 months>' assert repr(T.second(40)) == '<Timedelta: 40 seconds>' def test_cannot_upconvert(self): cases = [ (T.day(), 'w'), (T.hour(), 'd'), (T.minute(), 'h'), (T.second(), 'm'), (T.second(), 'd'), (T.millisecond(), 's'), (T.microsecond(), 's'), (T.nanosecond(), 's'), ] for delta, target in cases: self.assertRaises(IbisError, delta.to_unit, target) def test_downconvert_second_parts(self): K = 2 sec = T.second(K) milli = T.millisecond(K) micro = T.microsecond(K) nano = T.nanosecond(K) cases = [ (sec.to_unit('s'), T.second(K)), (sec.to_unit('ms'), T.millisecond(K * 1000)), (sec.to_unit('us'), T.microsecond(K * 1000000)), (sec.to_unit('ns'), T.nanosecond(K * 1000000000)), (milli.to_unit('ms'), T.millisecond(K)), (milli.to_unit('us'), T.microsecond(K * 1000)), (milli.to_unit('ns'), T.nanosecond(K * 1000000)), (micro.to_unit('us'), T.microsecond(K)), (micro.to_unit('ns'), T.nanosecond(K * 1000)), (nano.to_unit('ns'), T.nanosecond(K)) ] self._check_cases(cases) def test_downconvert_hours(self): K = 2 offset = T.hour(K) cases = [ (offset.to_unit('h'), T.hour(K)), (offset.to_unit('m'), T.minute(K * 60)), (offset.to_unit('s'), T.second(K * 3600)), (offset.to_unit('ms'), T.millisecond(K * 3600000)), (offset.to_unit('us'), T.microsecond(K * 3600000000)), (offset.to_unit('ns'), T.nanosecond(K * 3600000000000)) ] self._check_cases(cases) def test_downconvert_day(self): K = 2 week = T.week(K) day = T.day(K) cases = [ (week.to_unit('d'), T.day(K * 7)), (week.to_unit('h'), T.hour(K * 7 * 24)), (day.to_unit('d'), T.day(K)), (day.to_unit('h'), T.hour(K * 24)), (day.to_unit('m'), T.minute(K * 1440)), (day.to_unit('s'), T.second(K * 86400)), (day.to_unit('ms'), T.millisecond(K * 86400000)), (day.to_unit('us'), T.microsecond(K * 86400000000)), (day.to_unit('ns'), T.nanosecond(K * 86400000000000)) ] self._check_cases(cases) def test_combine_with_different_kinds(self): cases = [ (T.day() + T.minute(), T.minute(1441)), (T.second() + T.millisecond(10), T.millisecond(1010)), (T.hour() + T.minute(5) + T.second(10), T.second(3910)) ] self._check_cases(cases) def test_timedelta_generic_api(self): cases = [ (T.timedelta(weeks=2), T.week(2)), (T.timedelta(days=3), T.day(3)), (T.timedelta(hours=4), T.hour(4)), (T.timedelta(minutes=5), T.minute(5)), (T.timedelta(seconds=6), T.second(6)), (T.timedelta(milliseconds=7), T.millisecond(7)), (T.timedelta(microseconds=8), T.microsecond(8)), (T.timedelta(nanoseconds=9), T.nanosecond(9)), ] self._check_cases(cases) def _check_cases(self, cases): for x, y in cases: assert x.equals(y) def test_offset_timestamp_expr(self): c = self.table.i x = T.timedelta(days=1) expr = x + c assert isinstance(expr, ir.TimestampArray) assert isinstance(expr.op(), ops.TimestampDelta) # test radd expr = c + x assert isinstance(expr, ir.TimestampArray) assert isinstance(expr.op(), ops.TimestampDelta)
class TestDecimal(unittest.TestCase): def setUp(self): self.con = MockConnection() self.lineitem = self.con.table('tpch_lineitem') def test_type_metadata(self): col = self.lineitem.l_extendedprice assert isinstance(col, ir.DecimalArray) assert col._precision == 12 assert col._scale == 2 def test_cast_scalar_to_decimal(self): val = api.literal('1.2345') casted = val.cast('decimal(15,5)') assert isinstance(casted, ir.DecimalScalar) assert casted._precision == 15 assert casted._scale == 5 def test_decimal_aggregate_function_behavior(self): # From the Impala documentation: "The result of an aggregate function # such as MAX(), SUM(), or AVG() on DECIMAL values is promoted to a # scale of 38, with the same precision as the underlying column. Thus, # the result can represent the largest possible value at that # particular precision." col = self.lineitem.l_extendedprice functions = ['sum', 'mean', 'max', 'min'] for func_name in functions: result = getattr(col, func_name)() assert isinstance(result, ir.DecimalScalar) assert result._precision == col._precision assert result._scale == 38 def test_where(self): table = self.lineitem q = table.l_quantity expr = api.where(table.l_discount > 0, q * table.l_discount, api.null) assert isinstance(expr, ir.DecimalArray) expr = api.where(table.l_discount > 0, (q * table.l_discount).sum(), api.null) assert isinstance(expr, ir.DecimalArray) expr = api.where(table.l_discount.sum() > 0, (q * table.l_discount).sum(), api.null) assert isinstance(expr, ir.DecimalScalar) def test_fillna(self): expr = self.lineitem.l_extendedprice.fillna(0) assert isinstance(expr, ir.DecimalArray) expr = self.lineitem.l_extendedprice.fillna( self.lineitem.l_quantity) assert isinstance(expr, ir.DecimalArray) def test_precision_scale(self): col = self.lineitem.l_extendedprice p = col.precision() s = col.scale() assert isinstance(p, ir.IntegerValue) assert isinstance(p.op(), ops.DecimalPrecision) assert isinstance(s, ir.IntegerValue) assert isinstance(s.op(), ops.DecimalScale) def test_invalid_precision_scale_combo(self): pass
class TestASTBuilder(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_ast_with_projection_join_filter(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) result = joined[[table3, table2['value']]] ast = build_ast(result) stmt = ast.queries[0] def foo(): table3 = table[filter_pred] joined = table2.inner_join(table3, [join_pred]) result = joined[[table3, table2['value']]] return result assert len(stmt.select_set) == 2 assert len(stmt.where) == 1 assert stmt.where[0] is filter_pred # Check that the join has been rebuilt to only include the root tables tbl = stmt.table_set tbl_node = tbl.op() assert isinstance(tbl_node, ops.InnerJoin) assert tbl_node.left is table2 assert tbl_node.right is table # table expression substitution has been made in the predicate assert tbl_node.predicates[0].equals(table['g'] == table2['key']) def test_ast_with_aggregation_join_filter(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) met1 = (table3['f'] - table2['value']).mean().name('foo') result = joined.aggregate([met1, table3['f'].sum().name('bar')], by=[table3['g'], table2['key']]) ast = build_ast(result) stmt = ast.queries[0] # hoisted metrics ex_metrics = [(table['f'] - table2['value']).mean().name('foo'), table['f'].sum().name('bar')] ex_by = [table['g'], table2['key']] # hoisted join and aggregate expected_table_set = \ table2.inner_join(table, [table['g'] == table2['key']]) assert stmt.table_set.equals(expected_table_set) # Check various exprs for res, ex in zip(stmt.select_set, ex_by + ex_metrics): assert res.equals(ex) for res, ex in zip(stmt.group_by, ex_by): assert stmt.select_set[res].equals(ex) # Check we got the filter assert len(stmt.where) == 1 assert stmt.where[0].equals(filter_pred) def test_sort_by(self): table = self.con.table('star1') what = table.sort_by('f') result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `f`""" assert result == expected what = table.sort_by(('f', 0)) result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `f` DESC""" assert result == expected what = table.sort_by(['c', ('f', 0)]) result = to_sql(what) expected = """SELECT * FROM star1 ORDER BY `c`, `f` DESC""" assert result == expected def test_limit(self): table = self.con.table('star1').limit(10) result = to_sql(table) expected = """SELECT * FROM star1 LIMIT 10""" assert result == expected table = self.con.table('star1').limit(10, offset=5) result = to_sql(table) expected = """SELECT * FROM star1 LIMIT 10 OFFSET 5""" assert result == expected # Put the limit in a couple places in the stack table = self.con.table('star1') table = table[table.f > 0].limit(10) result = to_sql(table) expected = """SELECT * FROM star1 WHERE `f` > 0 LIMIT 10""" assert result == expected table = self.con.table('star1') # Semantically, this should produce a subquery table = table.limit(10) table = table[table.f > 0] result2 = to_sql(table) expected2 = """SELECT * FROM ( SELECT * FROM star1 LIMIT 10 ) t0 WHERE `f` > 0""" assert result2 == expected2 def test_join_with_limited_table(self): t1 = self.con.table('star1') t2 = self.con.table('star2') limited = t1.limit(100) joined = (limited.inner_join(t2, [limited.foo_id == t2.foo_id])[[limited]]) result = to_sql(joined) expected = """SELECT t0.* FROM ( SELECT * FROM star1 LIMIT 100 ) t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_sort_by_on_limit_yield_subquery(self): # x.limit(...).sort_by(...) # is semantically different from # x.sort_by(...).limit(...) # and will often yield different results t = self.con.table('functional_alltypes') expr = (t.group_by('string_col').aggregate( [t.count().name('nrows')]).limit(5).sort_by('string_col')) result = to_sql(expr) expected = """SELECT * FROM ( SELECT `string_col`, count(*) AS `nrows` FROM functional_alltypes GROUP BY 1 LIMIT 5 ) t0 ORDER BY `string_col`""" assert result == expected def test_multiple_limits(self): t = self.con.table('functional_alltypes') expr = t.limit(20).limit(10) stmt = build_ast(expr).queries[0] assert stmt.limit['n'] == 10 def test_top_convenience(self): # x.top(10, by=field) # x.top(10, by=[field1, field2]) pass def test_self_aggregate_in_predicate(self): # Per ibis #43 pass
class TestCreateTable(unittest.TestCase): def setUp(self): self.con = MockConnection() self.t = t = self.con.table("functional_alltypes") self.expr = t[t.bigint_col > 0] def test_create_external_table_as(self): path = "/path/to/table" select = build_ast(self.con.table("test1")).queries[0] statement = ddl.CTAS("another_table", select, external=True, can_exist=False, path=path, database="foo") result = statement.compile() expected = """\ CREATE EXTERNAL TABLE foo.`another_table` STORED AS PARQUET LOCATION '{0}' AS SELECT * FROM test1""".format( path ) assert result == expected def test_create_table_with_location(self): path = "/path/to/table" schema = ibis.schema([("foo", "string"), ("bar", "int8"), ("baz", "int16")]) statement = ddl.CreateTableWithSchema( "another_table", schema, ddl.NoFormat(), can_exist=False, path=path, database="foo" ) result = statement.compile() expected = """\ CREATE TABLE foo.`another_table` (`foo` string, `bar` tinyint, `baz` smallint) LOCATION '{0}'""".format( path ) assert result == expected def test_create_table_like_parquet(self): directory = "/path/to/" path = "/path/to/parquetfile" statement = ddl.CreateTableParquet("new_table", directory, example_file=path, can_exist=True, database="foo") result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE PARQUET '{0}' STORED AS PARQUET LOCATION '{1}'""".format( path, directory ) assert result == expected def test_create_table_parquet_like_other(self): # alternative to "LIKE PARQUET" directory = "/path/to/" example_table = "db.other" statement = ddl.CreateTableParquet( "new_table", directory, example_table=example_table, can_exist=True, database="foo" ) result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE {0} STORED AS PARQUET LOCATION '{1}'""".format( example_table, directory ) assert result == expected def test_create_table_parquet_with_schema(self): directory = "/path/to/" schema = ibis.schema([("foo", "string"), ("bar", "int8"), ("baz", "int16")]) statement = ddl.CreateTableParquet( "new_table", directory, schema=schema, external=True, can_exist=True, database="foo" ) result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`foo` string, `bar` tinyint, `baz` smallint) STORED AS PARQUET LOCATION '{0}'""".format( directory ) assert result == expected def test_create_table_delimited(self): path = "/path/to/files/" schema = ibis.schema([("a", "string"), ("b", "int32"), ("c", "double"), ("d", "decimal(12,2)")]) stmt = ddl.CreateTableDelimited( "new_table", path, schema, delimiter="|", escapechar="\\", lineterminator="\0", database="foo", can_exist=True, ) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`a` string, `b` int, `c` double, `d` decimal(12,2)) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\' LINES TERMINATED BY '\0' LOCATION '{0}'""".format( path ) assert result == expected def test_create_external_table_avro(self): path = "/path/to/files/" avro_schema = { "fields": [ {"name": "a", "type": "string"}, {"name": "b", "type": "int"}, {"name": "c", "type": "double"}, {"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2, "name": "d"}, ], "name": "my_record", "type": "record", } stmt = ddl.CreateTableAvro("new_table", path, avro_schema, database="foo", can_exist=True) result = stmt.compile() expected = ( """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` STORED AS AVRO LOCATION '%s' TBLPROPERTIES ('avro.schema.literal'='{ "fields": [ { "name": "a", "type": "string" }, { "name": "b", "type": "int" }, { "name": "c", "type": "double" }, { "logicalType": "decimal", "name": "d", "precision": 4, "scale": 2, "type": "bytes" } ], "name": "my_record", "type": "record" }')""" % path ) assert result == expected def test_create_table_parquet(self): statement = _create_table("some_table", self.expr, database="bar", can_exist=False) result = statement.compile() expected = """\ CREATE TABLE bar.`some_table` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_no_overwrite(self): statement = _create_table("tname", self.expr, can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_avro_other_formats(self): statement = _create_table("tname", self.t, format="avro", can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS AVRO AS SELECT * FROM functional_alltypes""" assert result == expected self.assertRaises(ValueError, _create_table, "tname", self.t, format="foo") def test_partition_by(self): pass
class TestNonTabularResults(unittest.TestCase): """ """ def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_simple_scalar_aggregates(self): from pandas import DataFrame # Things like table.column.{sum, mean, ...}() table = self.con.table('alltypes') expr = table[table.c > 0].f.sum() ast = build_ast(expr) query = ast.queries[0] sql_query = query.compile() expected = """SELECT sum(`f`) AS `tmp` FROM alltypes WHERE `c` > 0""" assert sql_query == expected # Maybe the result handler should act on the cursor. Not sure. handler = query.result_handler output = DataFrame({'tmp': [5]}) assert handler(output) == 5 def test_table_column_unbox(self): from pandas import DataFrame table = self.table m = table.f.sum().name('total') agged = table[table.c > 0].group_by('g').aggregate([m]) expr = agged.g ast = build_ast(expr) query = ast.queries[0] sql_query = query.compile() expected = """SELECT `g`, sum(`f`) AS `total` FROM alltypes WHERE `c` > 0 GROUP BY 1""" assert sql_query == expected # Maybe the result handler should act on the cursor. Not sure. handler = query.result_handler output = DataFrame({'g': ['foo', 'bar', 'baz']}) assert (handler(output) == output['g']).all() def test_complex_array_expr_projection(self): # May require finding the base table and forming a projection. expr = (self.table.group_by('g').aggregate( [self.table.count().name('count')])) expr2 = expr.g.cast('double') query = to_sql(expr2) expected = """SELECT CAST(`g` AS double) AS `tmp` FROM ( SELECT `g`, count(*) AS `count` FROM alltypes GROUP BY 1 ) t0""" assert query == expected def test_scalar_exprs_no_table_refs(self): expr1 = ibis.now() expected1 = """\ SELECT now() AS `tmp`""" expr2 = ibis.literal(1) + ibis.literal(2) expected2 = """\ SELECT 1 + 2 AS `tmp`""" cases = [(expr1, expected1), (expr2, expected2)] for expr, expected in cases: result = to_sql(expr) assert result == expected def test_expr_list_no_table_refs(self): exlist = ibis.api.expr_list([ ibis.literal(1).name('a'), ibis.now().name('b'), ibis.literal(2).log().name('c') ]) result = to_sql(exlist) expected = """\ SELECT 1 AS `a`, now() AS `b`, ln(2) AS `c`""" assert result == expected def test_isnull_case_expr_rewrite_failure(self): # #172, case expression that was not being properly converted into an # aggregation reduction = self.table.g.isnull().ifelse(1, 0).sum() result = to_sql(reduction) expected = """\ SELECT sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END) AS `tmp` FROM alltypes""" assert result == expected
class TestCreateTable(unittest.TestCase): def setUp(self): self.con = MockConnection() self.t = t = self.con.table('functional_alltypes') self.expr = t[t.bigint_col > 0] def test_create_external_table_as(self): path = '/path/to/table' select = build_ast(self.con.table('test1')).queries[0] statement = ddl.CTAS('another_table', select, external=True, can_exist=False, path=path, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE foo.`another_table` STORED AS PARQUET LOCATION '{0}' AS SELECT * FROM test1""".format(path) assert result == expected def test_create_table_with_location(self): path = '/path/to/table' schema = ibis.schema([('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]) statement = ddl.CreateTableWithSchema('another_table', schema, ddl.NoFormat(), can_exist=False, path=path, database='foo') result = statement.compile() expected = """\ CREATE TABLE foo.`another_table` (`foo` string, `bar` tinyint, `baz` smallint) LOCATION '{0}'""".format(path) assert result == expected def test_create_table_like_parquet(self): directory = '/path/to/' path = '/path/to/parquetfile' statement = ddl.CreateTableParquet('new_table', directory, example_file=path, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE PARQUET '{0}' STORED AS PARQUET LOCATION '{1}'""".format(path, directory) assert result == expected def test_create_table_parquet_like_other(self): # alternative to "LIKE PARQUET" directory = '/path/to/' example_table = 'db.other' statement = ddl.CreateTableParquet('new_table', directory, example_table=example_table, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` LIKE {0} STORED AS PARQUET LOCATION '{1}'""".format(example_table, directory) assert result == expected def test_create_table_parquet_with_schema(self): directory = '/path/to/' schema = ibis.schema([('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]) statement = ddl.CreateTableParquet('new_table', directory, schema=schema, external=True, can_exist=True, database='foo') result = statement.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`foo` string, `bar` tinyint, `baz` smallint) STORED AS PARQUET LOCATION '{0}'""".format(directory) assert result == expected def test_create_table_delimited(self): path = '/path/to/files/' schema = ibis.schema([('a', 'string'), ('b', 'int32'), ('c', 'double'), ('d', 'decimal(12,2)')]) stmt = ddl.CreateTableDelimited('new_table', path, schema, delimiter='|', escapechar='\\', lineterminator='\0', database='foo', can_exist=True) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` (`a` string, `b` int, `c` double, `d` decimal(12,2)) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\' LINES TERMINATED BY '\0' LOCATION '{0}'""".format(path) assert result == expected def test_create_external_table_avro(self): path = '/path/to/files/' avro_schema = { 'fields': [ {'name': 'a', 'type': 'string'}, {'name': 'b', 'type': 'int'}, {'name': 'c', 'type': 'double'}, {"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2, 'name': 'd'} ], 'name': 'my_record', 'type': 'record' } stmt = ddl.CreateTableAvro('new_table', path, avro_schema, database='foo', can_exist=True) result = stmt.compile() expected = """\ CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table` STORED AS AVRO LOCATION '%s' TBLPROPERTIES ('avro.schema.literal'='{ "fields": [ { "name": "a", "type": "string" }, { "name": "b", "type": "int" }, { "name": "c", "type": "double" }, { "logicalType": "decimal", "name": "d", "precision": 4, "scale": 2, "type": "bytes" } ], "name": "my_record", "type": "record" }')""" % path assert result == expected def test_create_table_parquet(self): statement = _create_table('some_table', self.expr, database='bar', can_exist=False) result = statement.compile() expected = """\ CREATE TABLE bar.`some_table` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_no_overwrite(self): statement = _create_table('tname', self.expr, can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS PARQUET AS SELECT * FROM functional_alltypes WHERE `bigint_col` > 0""" assert result == expected def test_avro_other_formats(self): statement = _create_table('tname', self.t, format='avro', can_exist=True) result = statement.compile() expected = """\ CREATE TABLE IF NOT EXISTS `tname` STORED AS AVRO AS SELECT * FROM functional_alltypes""" assert result == expected self.assertRaises(ValueError, _create_table, 'tname', self.t, format='foo') def test_partition_by(self): pass
class TestBuiltins(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('functional_alltypes') self.lineitem = self.con.table('tpch_lineitem') def test_abs(self): colnames = ['tinyint_col', 'smallint_col', 'int_col', 'bigint_col', 'float_col', 'double_col'] fname = 'abs' op = ops.Abs for col in colnames: expr = self.alltypes[col] self._check_unary_op(expr, fname, op, type(expr)) expr = self.lineitem.l_extendedprice self._check_unary_op(expr, fname, op, type(expr)) def test_group_concat(self): col = self.alltypes.string_col expr = col.group_concat() assert isinstance(expr.op(), ops.GroupConcat) arg, sep = expr.op().args assert sep == ',' expr = col.group_concat('|') arg, sep = expr.op().args assert sep == '|' def test_zeroifnull(self): dresult = self.alltypes.double_col.zeroifnull() iresult = self.alltypes.int_col.zeroifnull() assert type(dresult.op()) == ops.ZeroIfNull assert type(dresult) == ir.DoubleArray # Impala upconverts all ints to bigint. Hmm. assert type(iresult) == type(iresult) def test_fillna(self): result = self.alltypes.double_col.fillna(5) assert isinstance(result, ir.DoubleArray) assert isinstance(result.op(), ops.IfNull) result = self.alltypes.bool_col.fillna(True) assert isinstance(result, ir.BooleanArray) # Retains type of caller (for now) result = self.alltypes.int_col.fillna(self.alltypes.bigint_col) assert isinstance(result, ir.Int32Array) def test_ceil_floor(self): cresult = self.alltypes.double_col.ceil() fresult = self.alltypes.double_col.floor() assert isinstance(cresult, ir.Int64Array) assert isinstance(fresult, ir.Int64Array) assert type(cresult.op()) == ops.Ceil assert type(fresult.op()) == ops.Floor cresult = api.literal(1.2345).ceil() fresult = api.literal(1.2345).floor() assert isinstance(cresult, ir.Int64Scalar) assert isinstance(fresult, ir.Int64Scalar) dec_col = self.lineitem.l_extendedprice cresult = dec_col.ceil() fresult = dec_col.floor() assert isinstance(cresult, ir.DecimalArray) assert cresult.meta == dec_col.meta assert isinstance(fresult, ir.DecimalArray) assert fresult.meta == dec_col.meta def test_sign(self): result = self.alltypes.double_col.sign() assert isinstance(result, ir.FloatArray) assert type(result.op()) == ops.Sign result = api.literal(1.2345).sign() assert isinstance(result, ir.FloatScalar) dec_col = self.lineitem.l_extendedprice result = dec_col.sign() assert isinstance(result, ir.FloatArray) def test_round(self): result = self.alltypes.double_col.round() assert isinstance(result, ir.Int64Array) assert result.op().args[1] is None result = self.alltypes.double_col.round(2) assert isinstance(result, ir.DoubleArray) assert result.op().args[1] == 2 # Even integers are double (at least in Impala, check with other DB # implementations) result = self.alltypes.int_col.round(2) assert isinstance(result, ir.DoubleArray) dec = self.lineitem.l_extendedprice result = dec.round() assert isinstance(result, ir.DecimalArray) result = dec.round(2) assert isinstance(result, ir.DecimalArray) result = api.literal(1.2345).round() assert isinstance(result, ir.Int64Scalar) def _check_unary_op(self, expr, fname, ex_op, ex_type): result = getattr(expr, fname)() assert type(result.op()) == ex_op assert type(result) == ex_type
class TestStringOps(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_lower_upper(self): lresult = self.table.g.lower() uresult = self.table.g.upper() assert isinstance(lresult, ir.StringArray) assert isinstance(uresult, ir.StringArray) assert isinstance(lresult.op(), ops.Lowercase) assert isinstance(uresult.op(), ops.Uppercase) lit = literal('FoO') lresult = lit.lower() uresult = lit.upper() assert isinstance(lresult, ir.StringScalar) assert isinstance(uresult, ir.StringScalar) def test_substr(self): lit = literal('FoO') result = self.table.g.substr(2, 4) lit_result = lit.substr(0, 2) assert isinstance(result, ir.StringArray) assert isinstance(lit_result, ir.StringScalar) op = result.op() assert isinstance(op, ops.Substring) start, length = op.args[1:] assert start.equals(literal(2)) assert length.equals(literal(4)) def test_left_right(self): result = self.table.g.left(5) expected = self.table.g.substr(0, 5) assert result.equals(expected) result = self.table.g.right(5) op = result.op() assert isinstance(op, ops.StrRight) assert op.args[1].equals(literal(5)) def test_length(self): lit = literal('FoO') result = self.table.g.length() lit_result = lit.length() assert isinstance(result, ir.Int32Array) assert isinstance(lit_result, ir.Int32Scalar) assert isinstance(result.op(), ops.StringLength) def test_join(self): dash = literal('-') expr = dash.join([self.table.f.cast('string'), self.table.g]) assert isinstance(expr, ir.StringArray) expr = dash.join([literal('ab'), literal('cd')]) assert isinstance(expr, ir.StringScalar) def test_contains(self): expr = self.table.g.contains('foo') expected = self.table.g.find('foo') >= 0 assert_equal(expr, expected) self.assertRaises(Exception, lambda: 'foo' in self.table.g) def test_getitem_slice(self): cases = [ (self.table.g[:3], self.table.g.substr(0, 3)), (self.table.g[2:6], self.table.g.substr(2, 4)), ] for case, expected in cases: assert_equal(case, expected)
class TestTimestamp(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('alltypes') self.col = self.alltypes.i def test_field_select(self): assert isinstance(self.col, ir.TimestampArray) def test_string_cast_to_timestamp(self): casted = self.alltypes.g.cast('timestamp') assert isinstance(casted, ir.TimestampArray) string = api.literal('2000-01-01') casted = string.cast('timestamp') assert isinstance(casted, ir.TimestampScalar) def test_extract_fields(self): # type-size may be database specific cases = [ ('year', ops.ExtractYear, ir.Int32Array), ('month', ops.ExtractMonth, ir.Int32Array), ('day', ops.ExtractDay, ir.Int32Array), ('hour', ops.ExtractHour, ir.Int32Array), ('minute', ops.ExtractMinute, ir.Int32Array), ('second', ops.ExtractSecond, ir.Int32Array) ] for attr, ex_op, ex_type in cases: result = getattr(self.col, attr)() assert isinstance(result, ex_type) assert isinstance(result.op(), ex_op) def test_extract_no_propagate_name(self): # see #146 table = self.con.table('functional_alltypes') expr = table.timestamp_col.hour() self.assertRaises(com.ExpressionError, expr.get_name) def test_now(self): result = api.now() assert isinstance(result, ir.TimestampScalar) assert isinstance(result.op(), ops.TimestampNow) def test_timestamp_literals(self): ts_str = '2015-01-01 00:00:00' val = pd.Timestamp(ts_str) expr = ibis.literal(val) assert isinstance(expr, ir.TimestampScalar) expr = ibis.timestamp(ts_str) assert isinstance(expr, ir.TimestampScalar) self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71') def test_integer_to_timestamp(self): # #246 pass def test_comparison_timestamp(self): expr = self.col > (self.col.min() + ibis.day(3)) assert isinstance(expr, ir.BooleanArray) def test_comparisons_string(self): val = '2015-01-01 00:00:00' expr = self.col > val op = expr.op() assert isinstance(op.right, ir.TimestampScalar) expr2 = val < self.col op = expr2.op() assert isinstance(op, ops.Greater) assert isinstance(op.right, ir.TimestampScalar) def test_comparisons_pandas_timestamp(self): val = pd.Timestamp('2015-01-01 00:00:00') expr = self.col > val op = expr.op() assert isinstance(op.right, ir.TimestampScalar)
class TestCaseExprs(unittest.TestCase, ExprSQLTest, ExprTestCases): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_isnull_1_0(self): expr = self.table.g.isnull().ifelse(1, 0) result = self._translate(expr) expected = 'CASE WHEN `g` IS NULL THEN 1 ELSE 0 END' assert result == expected # inside some other function result = self._translate(expr.sum()) expected = 'sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END)' assert result == expected def test_simple_case(self): expr = self._case_simple_case() result = self._translate(expr) expected = """CASE `g` WHEN 'foo' THEN 'bar' WHEN 'baz' THEN 'qux' ELSE 'default' END""" assert result == expected def test_search_case(self): expr = self._case_search_case() result = self._translate(expr) expected = """CASE WHEN `f` > 0 THEN `d` * 2 WHEN `c` < 0 THEN `a` * 2 ELSE NULL END""" assert result == expected def test_where_use_if(self): expr = ibis.where(self.table.f > 0, self.table.e, self.table.a) assert isinstance(expr, ir.FloatValue) result = self._translate(expr) expected = "if(`f` > 0, `e`, `a`)" assert result == expected def test_nullif_ifnull(self): table = self.con.table('tpch_lineitem') f = table.l_quantity cases = [ (f.nullif(f == 0), 'nullif(`l_quantity`, `l_quantity` = 0)'), (f.fillna(0), 'isnull(`l_quantity`, CAST(0 AS decimal(12,2)))'), ] self._check_expr_cases(cases) def test_decimal_fillna_cast_arg(self): table = self.con.table('tpch_lineitem') f = table.l_extendedprice cases = [ (f.fillna(0), 'isnull(`l_extendedprice`, CAST(0 AS decimal(12,2)))'), (f.fillna(0.0), 'isnull(`l_extendedprice`, 0.0)'), ] self._check_expr_cases(cases)
class TestExprFormatting(unittest.TestCase): # Uncertain about how much we want to commit to unit tests around the # particulars of the output at the moment. def setUp(self): self.schema = [ ('a', 'int8'), ('b', 'int16'), ('c', 'int32'), ('d', 'int64'), ('e', 'float'), ('f', 'double'), ('g', 'string'), ('h', 'boolean') ] self.schema_dict = dict(self.schema) self.table = ibis.table(self.schema) self.con = MockConnection() def test_format_table_column(self): # GH #507 result = repr(self.table.f) assert 'Column[array(double)]' in result def test_format_projection(self): # This should produce a ref to the projection proj = self.table[['c', 'a', 'f']] repr(proj['a']) def test_table_type_output(self): foo = ibis.table( [ ('job', 'string'), ('dept_id', 'string'), ('year', 'int32'), ('y', 'double') ], 'foo') expr = foo.dept_id == foo.view().dept_id result = repr(expr) assert 'SelfReference[table]' in result assert 'UnboundTable[table]' in result def test_memoize_aggregate_correctly(self): table = self.table agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis') agg_exprs = [table['a'].sum().name('sum(a)'), table['b'].mean().name('mean(b)'), agg_expr] result = table.aggregate(agg_exprs, by=['g']) formatter = ExprFormatter(result) formatted = formatter.get_result() alias = formatter.memo.get_alias(table.op()) assert formatted.count(alias) == 7 def test_aggregate_arg_names(self): # Not sure how to test this *well* t = self.table by_exprs = [t.g.name('key1'), t.f.round().name('key2')] agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')] expr = self.table.group_by(by_exprs).aggregate(agg_exprs) result = repr(expr) assert 'metrics' in result assert 'by' in result def test_format_multiple_join_with_projection(self): # Star schema with fact table table = ibis.table([ ('c', 'int32'), ('f', 'double'), ('foo_id', 'string'), ('bar_id', 'string'), ]) table2 = ibis.table([ ('foo_id', 'string'), ('value1', 'double') ]) table3 = ibis.table([ ('bar_id', 'string'), ('value2', 'double') ]) filtered = table[table['f'] > 0] pred1 = table['foo_id'] == table2['foo_id'] pred2 = filtered['bar_id'] == table3['bar_id'] j1 = filtered.left_join(table2, [pred1]) j2 = j1.inner_join(table3, [pred2]) # Project out the desired fields view = j2[[table, table2['value1'], table3['value2']]] # it works! repr(view) def test_memoize_database_table(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) met1 = (table3['f'] - table2['value']).mean().name('foo') result = joined.aggregate([met1, table3['f'].sum().name('bar')], by=[table3['g'], table2['key']]) formatted = repr(result) assert formatted.count('test1') == 1 assert formatted.count('test2') == 1 def test_memoize_filtered_table(self): airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) result = repr(delay_filter) assert result.count('Filter') == 1 def test_memoize_insert_sort_key(self): table = self.con.table('airlines') t = table['arrdelay', 'dest'] expr = (t.group_by('dest') .mutate(dest_avg=t.arrdelay.mean(), dev=t.arrdelay - t.arrdelay.mean())) worst = expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10) result = repr(worst) assert result.count('airlines') == 1 def test_named_value_expr_show_name(self): expr = self.table.f * 2 expr2 = expr.name('baz') # it works! repr(expr) result2 = repr(expr2) # not really committing to a particular output yet assert 'baz' in result2 def test_memoize_filtered_tables_in_join(self): # related: GH #667 purchases = ibis.table([('region', 'string'), ('kind', 'string'), ('user', 'int64'), ('amount', 'double')], 'purchases') metric = purchases.amount.sum().name('total') agged = (purchases.group_by(['region', 'kind']) .aggregate(metric)) left = agged[agged.kind == 'foo'] right = agged[agged.kind == 'bar'] cond = left.region == right.region joined = left.join(right, cond) result = repr(joined) assert result.count('Filter') == 2
class TestValueExprs(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table("alltypes") self.int_cols = ["a", "b", "c", "d"] self.bool_cols = ["h"] self.float_cols = ["e", "f"] def _check_literals(self, cases): for value, expected in cases: lit_expr = L(value) result = self._translate(lit_expr) assert result == expected def test_string_literals(self): cases = [("simple", "'simple'"), ("I can't", "'I can\\'t'"), ('An "escape"', "'An \"escape\"'")] for value, expected in cases: lit_expr = L(value) result = self._translate(lit_expr) assert result == expected def test_decimal_builtins(self): t = self.con.table("tpch_lineitem") col = t.l_extendedprice cases = [(col.precision(), "precision(`l_extendedprice`)"), (col.scale(), "scale(`l_extendedprice`)")] self._check_expr_cases(cases) def test_number_boolean_literals(self): cases = [(5, "5"), (1.5, "1.5"), (True, "TRUE"), (False, "FALSE")] self._check_literals(cases) def test_column_ref_table_aliases(self): context = ImpalaContext() table1 = ibis.table([("key1", "string"), ("value1", "double")]) table2 = ibis.table([("key2", "string"), ("value and2", "double")]) context.set_ref(table1, "t0") context.set_ref(table2, "t1") expr = table1["value1"] - table2["value and2"] result = self._translate(expr, context=context) expected = "t0.`value1` - t1.`value and2`" assert result == expected def test_column_ref_quoting(self): schema = [("has a space", "double")] table = ibis.table(schema) self._translate(table["has a space"], "`has a space`") def test_identifier_quoting(self): schema = [("date", "double"), ("table", "string")] table = ibis.table(schema) self._translate(table["date"], "`date`") self._translate(table["table"], "`table`") def test_named_expressions(self): a, b, g = self.table.get_columns(["a", "b", "g"]) cases = [ (g.cast("double").name("g_dub"), "CAST(`g` AS double) AS `g_dub`"), (g.name("has a space"), "`g` AS `has a space`"), (((a - b) * a).name("expr"), "(`a` - `b`) * `a` AS `expr`"), ] return self._check_expr_cases(cases, named=True) def test_binary_infix_operators(self): # For each function, verify that the generated code is what we expect a, b, h = self.table.get_columns(["a", "b", "h"]) bool_col = a > 0 cases = [ (a + b, "`a` + `b`"), (a - b, "`a` - `b`"), (a * b, "`a` * `b`"), (a / b, "`a` / `b`"), (a ** b, "pow(`a`, `b`)"), (a < b, "`a` < `b`"), (a <= b, "`a` <= `b`"), (a > b, "`a` > `b`"), (a >= b, "`a` >= `b`"), (a == b, "`a` = `b`"), (a != b, "`a` != `b`"), (h & bool_col, "`h` AND (`a` > 0)"), (h | bool_col, "`h` OR (`a` > 0)"), # xor is brute force (h ^ bool_col, "(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))"), ] self._check_expr_cases(cases) def test_binary_infix_parenthesization(self): a, b, c = self.table.get_columns(["a", "b", "c"]) cases = [ ((a + b) + c, "(`a` + `b`) + `c`"), (a.log() + c, "ln(`a`) + `c`"), (b + (-(a + c)), "`b` + (-(`a` + `c`))"), ] self._check_expr_cases(cases) def test_between(self): cases = [(self.table.f.between(0, 1), "`f` BETWEEN 0 AND 1")] self._check_expr_cases(cases) def test_isnull_notnull(self): cases = [ (self.table["g"].isnull(), "`g` IS NULL"), (self.table["a"].notnull(), "`a` IS NOT NULL"), ((self.table["a"] + self.table["b"]).isnull(), "`a` + `b` IS NULL"), ] self._check_expr_cases(cases) def test_casts(self): a, d, g = self.table.get_columns(["a", "d", "g"]) cases = [ (a.cast("int16"), "CAST(`a` AS smallint)"), (a.cast("int32"), "CAST(`a` AS int)"), (a.cast("int64"), "CAST(`a` AS bigint)"), (a.cast("float"), "CAST(`a` AS float)"), (a.cast("double"), "CAST(`a` AS double)"), (a.cast("string"), "CAST(`a` AS string)"), (d.cast("int8"), "CAST(`d` AS tinyint)"), (g.cast("double"), "CAST(`g` AS double)"), (g.cast("timestamp"), "CAST(`g` AS timestamp)"), ] self._check_expr_cases(cases) def test_misc_conditionals(self): a = self.table.a cases = [(a.nullif(0), "nullif(`a`, 0)")] self._check_expr_cases(cases) def test_decimal_casts(self): cases = [ (L("9.9999999").cast("decimal(38,5)"), "CAST('9.9999999' AS decimal(38,5))"), (self.table.f.cast("decimal(12,2)"), "CAST(`f` AS decimal(12,2))"), ] self._check_expr_cases(cases) def test_negate(self): cases = [(-self.table["a"], "-`a`"), (-self.table["f"], "-`f`"), (-self.table["h"], "NOT `h`")] self._check_expr_cases(cases) def test_timestamp_extract_field(self): fields = ["year", "month", "day", "hour", "minute", "second", "millisecond"] cases = [(getattr(self.table.i, field)(), "extract(`i`, '{0}')".format(field)) for field in fields] self._check_expr_cases(cases) # integration with SQL translation expr = self.table[ self.table.i.year().name("year"), self.table.i.month().name("month"), self.table.i.day().name("day") ] result = to_sql(expr) expected = """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`, extract(`i`, 'day') AS `day` FROM alltypes""" assert result == expected def test_timestamp_now(self): cases = [(ibis.now(), "now()")] self._check_expr_cases(cases) def test_timestamp_deltas(self): units = ["year", "month", "week", "day", "hour", "minute", "second", "millisecond", "microsecond"] t = self.table.i f = "`i`" cases = [] for unit in units: K = 5 offset = getattr(ibis, unit)(K) template = "{0}s_add({1}, {2})" cases.append((t + offset, template.format(unit, f, K))) cases.append((t - offset, template.format(unit, f, -K))) self._check_expr_cases(cases) def test_timestamp_literals(self): from pandas import Timestamp tv1 = "2015-01-01 12:34:56" ex1 = "'2015-01-01 12:34:56'" cases = [(L(Timestamp(tv1)), ex1), (L(Timestamp(tv1).to_pydatetime()), ex1), (ibis.timestamp(tv1), ex1)] self._check_expr_cases(cases) def test_timestamp_from_integer(self): col = self.table.c cases = [ (col.to_timestamp(), 'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") ' "AS timestamp)"), ( col.to_timestamp("ms"), "CAST(from_unixtime(CAST(`c` / 1000 AS int), " '"yyyy-MM-dd HH:mm:ss") ' "AS timestamp)", ), ( col.to_timestamp("us"), "CAST(from_unixtime(CAST(`c` / 1000000 AS int), " '"yyyy-MM-dd HH:mm:ss") ' "AS timestamp)", ), ] self._check_expr_cases(cases) def test_correlated_predicate_subquery(self): t0 = self.table t1 = t0.view() expr = t0.g == t1.g ctx = ImpalaContext() ctx.make_alias(t0) # Grab alias from parent context subctx = ctx.subcontext() subctx.make_alias(t1) subctx.make_alias(t0) result = self._translate(expr, context=subctx) expected = "t0.`g` = t1.`g`" assert result == expected def test_any_all(self): t = self.table bool_expr = t.f == 0 cases = [ (bool_expr.any(), "sum(`f` = 0) > 0"), (-bool_expr.any(), "sum(`f` = 0) = 0"), (bool_expr.all(), "sum(`f` = 0) = count(*)"), (-bool_expr.all(), "sum(`f` = 0) < count(*)"), ] self._check_expr_cases(cases)
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') def test_numeric_unary_builtins(self): # No argument functions functions = ['abs', 'ceil', 'floor', 'exp', 'sqrt', 'sign', ('log', 'ln'), ('approx_median', 'appx_median'), ('approx_nunique', 'ndv'), 'ln', 'log2', 'log10', 'nullifzero', 'zeroifnull'] cases = [] for what in functions: if isinstance(what, tuple): ibis_name, sql_name = what else: ibis_name = sql_name = what for cname in ['double_col', 'int_col']: expr = getattr(self.table[cname], ibis_name)() cases.append((expr, '{0}({1})'.format( sql_name, '`{0}`'.format(cname)))) self._check_expr_cases(cases) def test_log_other_bases(self): cases = [ (self.table.double_col.log(5), 'log(`double_col`, 5)') ] self._check_expr_cases(cases) def test_round(self): cases = [ (self.table.double_col.round(), 'round(`double_col`)'), (self.table.double_col.round(0), 'round(`double_col`, 0)'), (self.table.double_col.round(2, ), 'round(`double_col`, 2)'), (self.table.double_col.round(self.table.tinyint_col), 'round(`double_col`, `tinyint_col`)') ] self._check_expr_cases(cases) def test_hash(self): expr = self.table.int_col.hash() assert isinstance(expr, ir.Int64Array) assert isinstance(self.table.int_col.sum().hash(), ir.Int64Scalar) cases = [ (self.table.int_col.hash(), 'fnv_hash(`int_col`)') ] self._check_expr_cases(cases) def test_reduction_where(self): cond = self.table.bigint_col < 70 c = self.table.double_col tmp = ('{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` ' 'ELSE NULL END)') cases = [ (c.sum(where=cond), tmp.format('sum')), (c.count(where=cond), tmp.format('count')), (c.mean(where=cond), tmp.format('avg')), (c.max(where=cond), tmp.format('max')), (c.min(where=cond), tmp.format('min')), (c.std(where=cond), tmp.format('stddev')), (c.std(where=cond, how='pop'), tmp.format('stddev_pop')), (c.var(where=cond), tmp.format('variance')), (c.var(where=cond, how='pop'), tmp.format('variance_pop')), ] self._check_expr_cases(cases) def test_reduction_invalid_where(self): condbad_literal = L('T') c = self.table.double_col for reduction in [c.sum, c.count, c.mean, c.max, c.min]: with self.assertRaises(TypeError): reduction(where=condbad_literal)
class TestExprFormatting(unittest.TestCase): # Uncertain about how much we want to commit to unit tests around the # particulars of the output at the moment. def setUp(self): self.schema = [('a', 'int8'), ('b', 'int16'), ('c', 'int32'), ('d', 'int64'), ('e', 'float'), ('f', 'double'), ('g', 'string'), ('h', 'boolean')] self.schema_dict = dict(self.schema) self.table = ibis.table(self.schema) self.con = MockConnection() def test_format_table_column(self): # GH #507 result = repr(self.table.f) assert 'Column[array(double)]' in result def test_format_projection(self): # This should produce a ref to the projection proj = self.table[['c', 'a', 'f']] repr(proj['a']) def test_table_type_output(self): foo = ibis.table([('job', 'string'), ('dept_id', 'string'), ('year', 'int32'), ('y', 'double')], 'foo') expr = foo.dept_id == foo.view().dept_id result = repr(expr) assert 'SelfReference[table]' in result assert 'UnboundTable[table]' in result def test_memoize_aggregate_correctly(self): table = self.table agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis') agg_exprs = [ table['a'].sum().name('sum(a)'), table['b'].mean().name('mean(b)'), agg_expr ] result = table.aggregate(agg_exprs, by=['g']) formatter = ExprFormatter(result) formatted = formatter.get_result() alias = formatter.memo.get_alias(table) assert formatted.count(alias) == 7 def test_aggregate_arg_names(self): # Not sure how to test this *well* t = self.table by_exprs = [t.g.name('key1'), t.f.round().name('key2')] agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')] expr = self.table.group_by(by_exprs).aggregate(agg_exprs) result = repr(expr) assert 'metrics' in result assert 'by' in result def test_format_multiple_join_with_projection(self): # Star schema with fact table table = ibis.table([ ('c', 'int32'), ('f', 'double'), ('foo_id', 'string'), ('bar_id', 'string'), ], 'one') table2 = ibis.table([('foo_id', 'string'), ('value1', 'double')], 'two') table3 = ibis.table([('bar_id', 'string'), ('value2', 'double')], 'three') filtered = table[table['f'] > 0] pred1 = filtered['foo_id'] == table2['foo_id'] pred2 = filtered['bar_id'] == table3['bar_id'] j1 = filtered.left_join(table2, [pred1]) j2 = j1.inner_join(table3, [pred2]) # Project out the desired fields view = j2[[filtered, table2['value1'], table3['value2']]] # it works! repr(view) def test_memoize_database_table(self): table = self.con.table('test1') table2 = self.con.table('test2') filter_pred = table['f'] > 0 table3 = table[filter_pred] join_pred = table3['g'] == table2['key'] joined = table2.inner_join(table3, [join_pred]) met1 = (table3['f'] - table2['value']).mean().name('foo') result = joined.aggregate([met1, table3['f'].sum().name('bar')], by=[table3['g'], table2['key']]) formatted = repr(result) assert formatted.count('test1') == 1 assert formatted.count('test2') == 1 def test_memoize_filtered_table(self): airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) result = repr(delay_filter) assert result.count('Selection') == 1 def test_memoize_insert_sort_key(self): table = self.con.table('airlines') t = table['arrdelay', 'dest'] expr = (t.group_by('dest').mutate(dest_avg=t.arrdelay.mean(), dev=t.arrdelay - t.arrdelay.mean())) worst = (expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10)) result = repr(worst) assert result.count('airlines') == 1 def test_named_value_expr_show_name(self): expr = self.table.f * 2 expr2 = expr.name('baz') # it works! repr(expr) result2 = repr(expr2) # not really committing to a particular output yet assert 'baz' in result2 def test_memoize_filtered_tables_in_join(self): # related: GH #667 purchases = ibis.table([('region', 'string'), ('kind', 'string'), ('user', 'int64'), ('amount', 'double')], 'purchases') metric = purchases.amount.sum().name('total') agged = (purchases.group_by(['region', 'kind']).aggregate(metric)) left = agged[agged.kind == 'foo'] right = agged[agged.kind == 'bar'] cond = left.region == right.region joined = (left.join(right, cond)[left, right.total.name('right_total')]) result = repr(joined) # Join, and one for each aggregation assert result.count('predicates') == 3
class TestValueExprs(unittest.TestCase, ExprSQLTest): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') self.int_cols = ['a', 'b', 'c', 'd'] self.bool_cols = ['h'] self.float_cols = ['e', 'f'] def _check_literals(self, cases): for value, expected in cases: lit_expr = L(value) result = self._translate(lit_expr) assert result == expected def test_string_literals(self): cases = [ ('simple', "'simple'"), ('I can\'t', "'I can\\'t'"), ('An "escape"', "'An \"escape\"'") ] for value, expected in cases: lit_expr = L(value) result = self._translate(lit_expr) assert result == expected def test_decimal_builtins(self): t = self.con.table('tpch_lineitem') col = t.l_extendedprice cases = [ (col.precision(), 'precision(`l_extendedprice`)'), (col.scale(), 'scale(`l_extendedprice`)'), ] self._check_expr_cases(cases) def test_number_boolean_literals(self): cases = [ (5, '5'), (1.5, '1.5'), (True, 'TRUE'), (False, 'FALSE') ] self._check_literals(cases) def test_column_ref_table_aliases(self): context = ImpalaContext() table1 = ibis.table([ ('key1', 'string'), ('value1', 'double') ]) table2 = ibis.table([ ('key2', 'string'), ('value and2', 'double') ]) context.set_ref(table1, 't0') context.set_ref(table2, 't1') expr = table1['value1'] - table2['value and2'] result = self._translate(expr, context=context) expected = 't0.`value1` - t1.`value and2`' assert result == expected def test_column_ref_quoting(self): schema = [('has a space', 'double')] table = ibis.table(schema) self._translate(table['has a space'], '`has a space`') def test_identifier_quoting(self): schema = [('date', 'double'), ('table', 'string')] table = ibis.table(schema) self._translate(table['date'], '`date`') self._translate(table['table'], '`table`') def test_named_expressions(self): a, b, g = self.table.get_columns(['a', 'b', 'g']) cases = [ (g.cast('double').name('g_dub'), 'CAST(`g` AS double) AS `g_dub`'), (g.name('has a space'), '`g` AS `has a space`'), (((a - b) * a).name('expr'), '(`a` - `b`) * `a` AS `expr`') ] return self._check_expr_cases(cases, named=True) def test_binary_infix_operators(self): # For each function, verify that the generated code is what we expect a, b, h = self.table.get_columns(['a', 'b', 'h']) bool_col = a > 0 cases = [ (a + b, '`a` + `b`'), (a - b, '`a` - `b`'), (a * b, '`a` * `b`'), (a / b, '`a` / `b`'), (a ** b, 'pow(`a`, `b`)'), (a < b, '`a` < `b`'), (a <= b, '`a` <= `b`'), (a > b, '`a` > `b`'), (a >= b, '`a` >= `b`'), (a == b, '`a` = `b`'), (a != b, '`a` != `b`'), (h & bool_col, '`h` AND (`a` > 0)'), (h | bool_col, '`h` OR (`a` > 0)'), # xor is brute force (h ^ bool_col, '(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))') ] self._check_expr_cases(cases) def test_binary_infix_parenthesization(self): a, b, c = self.table.get_columns(['a', 'b', 'c']) cases = [ ((a + b) + c, '(`a` + `b`) + `c`'), (a.log() + c, 'ln(`a`) + `c`'), (b + (-(a + c)), '`b` + (-(`a` + `c`))') ] self._check_expr_cases(cases) def test_between(self): cases = [ (self.table.f.between(0, 1), '`f` BETWEEN 0 AND 1') ] self._check_expr_cases(cases) def test_isnull_notnull(self): cases = [ (self.table['g'].isnull(), '`g` IS NULL'), (self.table['a'].notnull(), '`a` IS NOT NULL'), ((self.table['a'] + self.table['b']).isnull(), '`a` + `b` IS NULL') ] self._check_expr_cases(cases) def test_casts(self): a, d, g = self.table.get_columns(['a', 'd', 'g']) cases = [ (a.cast('int16'), 'CAST(`a` AS smallint)'), (a.cast('int32'), 'CAST(`a` AS int)'), (a.cast('int64'), 'CAST(`a` AS bigint)'), (a.cast('float'), 'CAST(`a` AS float)'), (a.cast('double'), 'CAST(`a` AS double)'), (a.cast('string'), 'CAST(`a` AS string)'), (d.cast('int8'), 'CAST(`d` AS tinyint)'), (g.cast('double'), 'CAST(`g` AS double)'), (g.cast('timestamp'), 'CAST(`g` AS timestamp)') ] self._check_expr_cases(cases) def test_misc_conditionals(self): a = self.table.a cases = [ (a.nullif(0), 'nullif(`a`, 0)') ] self._check_expr_cases(cases) def test_decimal_casts(self): cases = [ (L('9.9999999').cast('decimal(38,5)'), "CAST('9.9999999' AS decimal(38,5))"), (self.table.f.cast('decimal(12,2)'), "CAST(`f` AS decimal(12,2))") ] self._check_expr_cases(cases) def test_negate(self): cases = [ (-self.table['a'], '-`a`'), (-self.table['f'], '-`f`'), (-self.table['h'], 'NOT `h`') ] self._check_expr_cases(cases) def test_timestamp_extract_field(self): fields = ['year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond'] cases = [(getattr(self.table.i, field)(), "extract(`i`, '{0}')".format(field)) for field in fields] self._check_expr_cases(cases) # integration with SQL translation expr = self.table[self.table.i.year().name('year'), self.table.i.month().name('month'), self.table.i.day().name('day')] result = to_sql(expr) expected = \ """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`, extract(`i`, 'day') AS `day` FROM alltypes""" assert result == expected def test_timestamp_now(self): cases = [ (ibis.now(), 'now()') ] self._check_expr_cases(cases) def test_timestamp_deltas(self): units = ['year', 'month', 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'] t = self.table.i f = '`i`' cases = [] for unit in units: K = 5 offset = getattr(ibis, unit)(K) template = '{0}s_add({1}, {2})' cases.append((t + offset, template.format(unit, f, K))) cases.append((t - offset, template.format(unit, f, -K))) self._check_expr_cases(cases) def test_timestamp_literals(self): from pandas import Timestamp tv1 = '2015-01-01 12:34:56' ex1 = ("'2015-01-01 12:34:56'") cases = [ (L(Timestamp(tv1)), ex1), (L(Timestamp(tv1).to_pydatetime()), ex1), (ibis.timestamp(tv1), ex1) ] self._check_expr_cases(cases) def test_timestamp_from_integer(self): col = self.table.c cases = [ (col.to_timestamp(), 'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") ' 'AS timestamp)'), (col.to_timestamp('ms'), 'CAST(from_unixtime(CAST(`c` / 1000 AS int), ' '"yyyy-MM-dd HH:mm:ss") ' 'AS timestamp)'), (col.to_timestamp('us'), 'CAST(from_unixtime(CAST(`c` / 1000000 AS int), ' '"yyyy-MM-dd HH:mm:ss") ' 'AS timestamp)'), ] self._check_expr_cases(cases) def test_correlated_predicate_subquery(self): t0 = self.table t1 = t0.view() expr = t0.g == t1.g ctx = ImpalaContext() ctx.make_alias(t0) # Grab alias from parent context subctx = ctx.subcontext() subctx.make_alias(t1) subctx.make_alias(t0) result = self._translate(expr, context=subctx) expected = "t0.`g` = t1.`g`" assert result == expected def test_any_all(self): t = self.table bool_expr = t.f == 0 cases = [ (bool_expr.any(), 'sum(`f` = 0) > 0'), (-bool_expr.any(), 'sum(`f` = 0) = 0'), (bool_expr.all(), 'sum(`f` = 0) = count(*)'), (-bool_expr.all(), 'sum(`f` = 0) < count(*)'), ] self._check_expr_cases(cases)
class TestFixedOffsets(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('alltypes') def test_upconvert(self): cases = [ (T.day(14), 'w', T.week(2)), (T.hour(72), 'd', T.day(3)), (T.minute(240), 'h', T.hour(4)), (T.second(360), 'm', T.minute(6)), (T.second(3 * 86400), 'd', T.day(3)), (T.millisecond(5000), 's', T.second(5)), (T.microsecond(5000000), 's', T.second(5)), (T.nanosecond(5000000000), 's', T.second(5)), ] for offset, unit, expected in cases: result = offset.to_unit(unit) assert result.equals(expected) def test_multiply(self): offset = T.day(2) assert (offset * 2).equals(T.day(4)) assert (offset * (-2)).equals(T.day(-4)) assert (3 * offset).equals(T.day(6)) assert ((-3) * offset).equals(T.day(-6)) def test_repr(self): assert repr(T.day()) == '<Timedelta: 1 day>' assert repr(T.day(2)) == '<Timedelta: 2 days>' assert repr(T.year()) == '<Timedelta: 1 year>' assert repr(T.month(2)) == '<Timedelta: 2 months>' assert repr(T.second(40)) == '<Timedelta: 40 seconds>' def test_cannot_upconvert(self): cases = [ (T.day(), 'w'), (T.hour(), 'd'), (T.minute(), 'h'), (T.second(), 'm'), (T.second(), 'd'), (T.millisecond(), 's'), (T.microsecond(), 's'), (T.nanosecond(), 's'), ] for delta, target in cases: self.assertRaises(IbisError, delta.to_unit, target) def test_downconvert_second_parts(self): K = 2 sec = T.second(K) milli = T.millisecond(K) micro = T.microsecond(K) nano = T.nanosecond(K) cases = [(sec.to_unit('s'), T.second(K)), (sec.to_unit('ms'), T.millisecond(K * 1000)), (sec.to_unit('us'), T.microsecond(K * 1000000)), (sec.to_unit('ns'), T.nanosecond(K * 1000000000)), (milli.to_unit('ms'), T.millisecond(K)), (milli.to_unit('us'), T.microsecond(K * 1000)), (milli.to_unit('ns'), T.nanosecond(K * 1000000)), (micro.to_unit('us'), T.microsecond(K)), (micro.to_unit('ns'), T.nanosecond(K * 1000)), (nano.to_unit('ns'), T.nanosecond(K))] self._check_cases(cases) def test_downconvert_hours(self): K = 2 offset = T.hour(K) cases = [(offset.to_unit('h'), T.hour(K)), (offset.to_unit('m'), T.minute(K * 60)), (offset.to_unit('s'), T.second(K * 3600)), (offset.to_unit('ms'), T.millisecond(K * 3600000)), (offset.to_unit('us'), T.microsecond(K * 3600000000)), (offset.to_unit('ns'), T.nanosecond(K * 3600000000000))] self._check_cases(cases) def test_downconvert_day(self): K = 2 week = T.week(K) day = T.day(K) cases = [(week.to_unit('d'), T.day(K * 7)), (week.to_unit('h'), T.hour(K * 7 * 24)), (day.to_unit('d'), T.day(K)), (day.to_unit('h'), T.hour(K * 24)), (day.to_unit('m'), T.minute(K * 1440)), (day.to_unit('s'), T.second(K * 86400)), (day.to_unit('ms'), T.millisecond(K * 86400000)), (day.to_unit('us'), T.microsecond(K * 86400000000)), (day.to_unit('ns'), T.nanosecond(K * 86400000000000))] self._check_cases(cases) def test_combine_with_different_kinds(self): cases = [(T.day() + T.minute(), T.minute(1441)), (T.second() + T.millisecond(10), T.millisecond(1010)), (T.hour() + T.minute(5) + T.second(10), T.second(3910))] self._check_cases(cases) def test_timedelta_generic_api(self): cases = [ (T.timedelta(weeks=2), T.week(2)), (T.timedelta(days=3), T.day(3)), (T.timedelta(hours=4), T.hour(4)), (T.timedelta(minutes=5), T.minute(5)), (T.timedelta(seconds=6), T.second(6)), (T.timedelta(milliseconds=7), T.millisecond(7)), (T.timedelta(microseconds=8), T.microsecond(8)), (T.timedelta(nanoseconds=9), T.nanosecond(9)), ] self._check_cases(cases) def _check_cases(self, cases): for x, y in cases: assert x.equals(y) def test_offset_timestamp_expr(self): c = self.table.i x = T.timedelta(days=1) expr = x + c assert isinstance(expr, ir.TimestampColumn) assert isinstance(expr.op(), ops.TimestampDelta) # test radd expr = c + x assert isinstance(expr, ir.TimestampColumn) assert isinstance(expr.op(), ops.TimestampDelta)
class UDFTest(unittest.TestCase): def setUp(self): self.con = MockConnection() self.table = self.con.table('functional_alltypes') self.i8 = self.table.tinyint_col self.i16 = self.table.smallint_col self.i32 = self.table.int_col self.i64 = self.table.bigint_col self.d = self.table.double_col self.f = self.table.float_col self.s = self.table.string_col self.b = self.table.bool_col self.t = self.table.timestamp_col self.dec = self.con.table('tpch_customer').c_acctbal self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d, self.f, self.dec, self.s, self.b, self.t] def test_sql_generation(self): op = udf.scalar_function(['string'], 'string', name='Tester') udf.add_impala_operation(op, 'identity', 'udf_testing') def _identity_test(value): return op(value).to_expr() result = _identity_test('hello world') assert result == "SELECT udf_testing.identity('hello world')" def test_sql_generation_from_infoclass(self): udf_info = udf.UDFCreator('test.so', ['string'], 'string', 'info_test') op = udf_info.to_operation() udf.add_impala_operation(op, 'info_test', 'udf_testing') assert op in _operation_registry def _infoclass_test(value): return op(value).to_expr() result = _infoclass_test('hello world') assert result == "SELECT udf_testing.info_test('hello world')" def test_boolean_wrapping(self): func = self._udf_registration_single_input('boolean', 'boolean', 'test') expr = func(True) assert type(expr) == ir.BooleanScalar expr = func(self.b) assert type(expr) == ir.BooleanArray def test_tinyint_wrapping(self): func = self._udf_registration_single_input('int8', 'int8', 'test') expr = func(1) assert type(expr) == ir.Int8Scalar expr = func(self.i8) assert type(expr) == ir.Int8Array def test_smallint_wrapping(self): func = self._udf_registration_single_input('int16', 'int16', 'test') expr = func(1) assert type(expr) == ir.Int16Scalar expr = func(self.i16) assert type(expr) == ir.Int16Array def test_int_wrapping(self): func = self._udf_registration_single_input('int32', 'int32', 'test') expr = func(1) assert type(expr) == ir.Int32Scalar expr = func(self.i32) assert type(expr) == ir.Int32Array def test_bigint_wrapping(self): func = self._udf_registration_single_input('int64', 'int64', 'test') expr = func(1) assert type(expr) == ir.Int64Scalar expr = func(self.i64) assert type(expr) == ir.Int64Array def test_float_wrapping(self): func = self._udf_registration_single_input('float', 'float', 'test') expr = func(1.0) assert type(expr) == ir.FloatScalar expr = func(self.f) assert type(expr) == ir.FloatArray def test_double_wrapping(self): func = self._udf_registration_single_input('double', 'double', 'test') expr = func(1.0) assert type(expr) == ir.DoubleScalar expr = func(self.d) assert type(expr) == ir.DoubleArray def test_decimal_wrapping(self): func = self._udf_registration_single_input('decimal(9,0)', 'decimal(9,0)', 'test') expr = func(1.0) assert type(expr) == ir.DecimalScalar expr = func(self.dec) assert type(expr) == ir.DecimalArray def test_string_wrapping(self): func = self._udf_registration_single_input('string', 'string', 'test') expr = func('1') assert type(expr) == ir.StringScalar expr = func(self.s) assert type(expr) == ir.StringArray def test_timestamp_wrapping(self): func = self._udf_registration_single_input('timestamp', 'timestamp', 'test') expr = func(ibis.timestamp('1961-04-10')) assert type(expr) == ir.TimestampScalar expr = func(self.t) assert type(expr) == ir.TimestampArray def test_invalid_typecasting_tinyint(self): self._invalid_typecasts('int8', self.all_cols[1:]) def test_invalid_typecasting_smallint(self): self._invalid_typecasts('int16', self.all_cols[2:]) def test_invalid_typecasting_int(self): self._invalid_typecasts('int32', self.all_cols[3:]) def test_invalid_typecasting_bigint(self): self._invalid_typecasts('int64', self.all_cols[4:]) def test_invalid_typecasting_boolean(self): self._invalid_typecasts('boolean', self.all_cols[:8] + self.all_cols[9:]) def test_invalid_typecasting_float(self): self._invalid_typecasts('float', self.all_cols[:4] + self.all_cols[6:]) def test_invalid_typecasting_double(self): self._invalid_typecasts('double', self.all_cols[:4] + self.all_cols[6:]) def test_invalid_typecasting_string(self): self._invalid_typecasts('string', self.all_cols[:7] + self.all_cols[8:]) def test_invalid_typecasting_timestamp(self): self._invalid_typecasts('timestamp', self.all_cols[:-1]) def test_invalid_typecasting_decimal(self): self._invalid_typecasts('decimal', self.all_cols[:4] + self.all_cols[7:]) def test_mult_args(self): op = self._udf_registration(['int32', 'double', 'string', 'boolean', 'timestamp'], 'int64', 'mult_types') def _func(integer, double, string, boolean, timestamp): return op(integer, double, string, boolean, timestamp).to_expr() expr = _func(self.i32, self.d, self.s, self.b, self.t) assert issubclass(type(expr), ir.ArrayExpr) expr = _func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10')) assert issubclass(type(expr), ir.ScalarExpr) def _udf_registration_single_input(self, inputs, output, name): op = self._udf_registration([inputs], output, name) def _test_func(value): return op(value).to_expr() return _test_func def _udf_registration(self, inputs, output, name): op = udf.scalar_function(inputs, output, name=name) assert issubclass(op, ValueOp) udf.add_impala_operation(op, name, 'ibis_testing') return op def _invalid_typecasts(self, inputs, invalid_casts): func = self._udf_registration_single_input(inputs, 'int32', 'typecast') for in_type in invalid_casts: self.assertRaises(IbisTypeError, func, in_type)
class TestTimestamp(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('alltypes') self.col = self.alltypes.i def test_field_select(self): assert isinstance(self.col, ir.TimestampArray) def test_string_cast_to_timestamp(self): casted = self.alltypes.g.cast('timestamp') assert isinstance(casted, ir.TimestampArray) string = api.literal('2000-01-01') casted = string.cast('timestamp') assert isinstance(casted, ir.TimestampScalar) def test_extract_fields(self): # type-size may be database specific cases = [ ('year', ops.ExtractYear, ir.Int32Array), ('month', ops.ExtractMonth, ir.Int32Array), ('day', ops.ExtractDay, ir.Int32Array), ('hour', ops.ExtractHour, ir.Int32Array), ('minute', ops.ExtractMinute, ir.Int32Array), ('second', ops.ExtractSecond, ir.Int32Array), ('millisecond', ops.ExtractMillisecond, ir.Int32Array), ] for attr, ex_op, ex_type in cases: result = getattr(self.col, attr)() assert result.get_name() == attr assert isinstance(result, ex_type) assert isinstance(result.op(), ex_op) def test_now(self): result = api.now() assert isinstance(result, ir.TimestampScalar) assert isinstance(result.op(), ops.TimestampNow) def test_timestamp_literals(self): ts_str = '2015-01-01 00:00:00' val = pd.Timestamp(ts_str) expr = ibis.literal(val) assert isinstance(expr, ir.TimestampScalar) expr = ibis.timestamp(ts_str) assert isinstance(expr, ir.TimestampScalar) self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71') def test_integer_to_timestamp(self): # #246 pass def test_comparison_timestamp(self): expr = self.col > (self.col.min() + ibis.day(3)) assert isinstance(expr, ir.BooleanArray) def test_comparisons_string(self): val = '2015-01-01 00:00:00' expr = self.col > val op = expr.op() assert isinstance(op.right, ir.TimestampScalar) expr2 = val < self.col op = expr2.op() assert isinstance(op, ops.Greater) assert isinstance(op.right, ir.TimestampScalar) def test_comparisons_pandas_timestamp(self): val = pd.Timestamp('2015-01-01 00:00:00') expr = self.col > val op = expr.op() assert isinstance(op.right, ir.TimestampScalar)
class TestSelectSQL(unittest.TestCase): def setUp(self): self.con = MockConnection() def test_nameless_table(self): # Ensure that user gets some kind of sensible error nameless = api.table([('key', 'string')]) self.assertRaises(com.RelationError, to_sql, nameless) with_name = api.table([('key', 'string')], name='baz') result = to_sql(with_name) assert result == 'SELECT *\nFROM baz' def test_physical_table_reference_translate(self): # If an expression's table leaves all reference database tables, verify # we translate correctly table = self.con.table('alltypes') query = _get_query(table) sql_string = query.compile() expected = "SELECT *\nFROM alltypes" assert sql_string == expected def test_simple_join_formatting(self): t1 = self.con.table('star1') t2 = self.con.table('star2') pred = t1['foo_id'] == t2['foo_id'] pred2 = t1['bar_id'] == t2['foo_id'] cases = [ (t1.inner_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), (t1.left_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 LEFT OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), (t1.outer_join(t2, [pred])[[t1]], """SELECT t0.* FROM star1 t0 FULL OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`"""), # multiple predicates (t1.inner_join(t2, [pred, pred2])[[t1]], """SELECT t0.* FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` AND t0.`bar_id` = t1.`foo_id`"""), ] for expr, expected_sql in cases: result_sql = to_sql(expr) assert result_sql == expected_sql def test_multiple_join_cases(self): t1 = self.con.table('star1') t2 = self.con.table('star2') t3 = self.con.table('star3') predA = t1['foo_id'] == t2['foo_id'] predB = t1['bar_id'] == t3['bar_id'] what = (t1.left_join(t2, [predA]) .inner_join(t3, [predB]) .projection([t1, t2['value1'], t3['value2']])) result_sql = to_sql(what) expected_sql = """SELECT t0.*, t1.`value1`, t2.`value2` FROM star1 t0 LEFT OUTER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` INNER JOIN star3 t2 ON t0.`bar_id` = t2.`bar_id`""" assert result_sql == expected_sql def test_join_between_joins(self): t1 = api.table([ ('key1', 'string'), ('key2', 'string'), ('value1', 'double'), ], 'first') t2 = api.table([ ('key1', 'string'), ('value2', 'double'), ], 'second') t3 = api.table([ ('key2', 'string'), ('key3', 'string'), ('value3', 'double'), ], 'third') t4 = api.table([ ('key3', 'string'), ('value4', 'double') ], 'fourth') left = t1.inner_join(t2, [('key1', 'key1')])[t1, t2.value2] right = t3.inner_join(t4, [('key3', 'key3')])[t3, t4.value4] joined = left.inner_join(right, [('key2', 'key2')]) # At one point, the expression simplification was resulting in bad refs # here (right.value3 referencing the table inside the right join) exprs = [left, right.value3, right.value4] projected = joined.projection(exprs) result = to_sql(projected) expected = """SELECT t0.*, t1.`value3`, t1.`value4` FROM ( SELECT t2.*, t3.`value2` FROM `first` t2 INNER JOIN second t3 ON t2.`key1` = t3.`key1` ) t0 INNER JOIN ( SELECT t2.*, t3.`value4` FROM third t2 INNER JOIN fourth t3 ON t2.`key3` = t3.`key3` ) t1 ON t0.`key2` = t1.`key2`""" assert result == expected def test_join_just_materialized(self): t1 = self.con.table('tpch_nation') t2 = self.con.table('tpch_region') t3 = self.con.table('tpch_customer') # GH #491 joined = (t1.inner_join(t2, t1.n_regionkey == t2.r_regionkey) .inner_join(t3, t1.n_nationkey == t3.c_nationkey)) result = to_sql(joined) expected = """SELECT * FROM tpch_nation t0 INNER JOIN tpch_region t1 ON t0.`n_regionkey` = t1.`r_regionkey` INNER JOIN tpch_customer t2 ON t0.`n_nationkey` = t2.`c_nationkey`""" assert result == expected result = to_sql(joined.materialize()) assert result == expected def test_join_no_predicates_for_impala(self): # Impala requires that joins without predicates be written explicitly # as CROSS JOIN, since result sets can accidentally get too large if a # query is executed before predicates are written t1 = self.con.table('star1') t2 = self.con.table('star2') joined2 = t1.cross_join(t2)[[t1]] expected = """SELECT t0.* FROM star1 t0 CROSS JOIN star2 t1""" result2 = to_sql(joined2) assert result2 == expected for jtype in ['inner_join', 'left_join', 'outer_join']: joined = getattr(t1, jtype)(t2)[[t1]] result = to_sql(joined) assert result == expected def test_semi_anti_joins(self): t1 = self.con.table('star1') t2 = self.con.table('star2') joined = t1.semi_join(t2, [t1.foo_id == t2.foo_id])[[t1]] result = to_sql(joined) expected = """SELECT t0.* FROM star1 t0 LEFT SEMI JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected joined = t1.anti_join(t2, [t1.foo_id == t2.foo_id])[[t1]] result = to_sql(joined) expected = """SELECT t0.* FROM star1 t0 LEFT ANTI JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_self_reference_simple(self): t1 = self.con.table('star1') result_sql = to_sql(t1.view()) expected_sql = "SELECT *\nFROM star1" assert result_sql == expected_sql def test_join_self_reference(self): t1 = self.con.table('star1') t2 = t1.view() result = t1.inner_join(t2, [t1.foo_id == t2.bar_id])[[t1]] result_sql = to_sql(result) expected_sql = """SELECT t0.* FROM star1 t0 INNER JOIN star1 t1 ON t0.`foo_id` = t1.`bar_id`""" assert result_sql == expected_sql def test_join_projection_subquery_broken_alias(self): # From an observed bug, derived from tpch tables geo = (nation.inner_join(region, [('n_regionkey', 'r_regionkey')]) [nation.n_nationkey, nation.n_name.name('nation'), region.r_name.name('region')]) expr = (geo.inner_join(customer, [('n_nationkey', 'c_nationkey')]) [customer, geo]) result = to_sql(expr) expected = """SELECT t1.*, t0.* FROM ( SELECT t2.`n_nationkey`, t2.`n_name` AS `nation`, t3.`r_name` AS `region` FROM nation t2 INNER JOIN region t3 ON t2.`n_regionkey` = t3.`r_regionkey` ) t0 INNER JOIN customer t1 ON t0.`n_nationkey` = t1.`c_nationkey`""" assert result == expected def test_where_simple_comparisons(self): t1 = self.con.table('star1') what = t1.filter([t1.f > 0, t1.c < t1.f * 2]) result = to_sql(what) expected = """SELECT * FROM star1 WHERE `f` > 0 AND `c` < (`f` * 2)""" assert result == expected def test_where_in_array_literal(self): # e.g. # where string_col in (v1, v2, v3) raise unittest.SkipTest def test_where_with_join(self): t1 = self.con.table('star1') t2 = self.con.table('star2') # This also tests some cases of predicate pushdown what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]) .projection([t1, t2.value1, t2.value3]) .filter([t1.f > 0, t2.value3 < 1000])) what2 = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]) .filter([t1.f > 0, t2.value3 < 1000]) .projection([t1, t2.value1, t2.value3])) expected_sql = """SELECT t0.*, t1.`value1`, t1.`value3` FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` WHERE t0.`f` > 0 AND t1.`value3` < 1000""" result_sql = to_sql(what) assert result_sql == expected_sql result2_sql = to_sql(what2) assert result2_sql == expected_sql def test_where_no_pushdown_possible(self): t1 = self.con.table('star1') t2 = self.con.table('star2') joined = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]) [t1, (t1.f - t2.value1).name('diff')]) filtered = joined[joined.diff > 1] # TODO: I'm not sure if this is exactly what we want expected_sql = """SELECT * FROM ( SELECT t0.*, t0.`f` - t1.`value1` AS `diff` FROM star1 t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id` WHERE t0.`f` > 0 AND t1.`value3` < 1000 ) WHERE `diff` > 1""" raise unittest.SkipTest result_sql = to_sql(filtered) assert result_sql == expected_sql def test_where_with_between(self): t = self.con.table('alltypes') what = t.filter([t.a > 0, t.f.between(0, 1)]) result = to_sql(what) expected = """SELECT * FROM alltypes WHERE `a` > 0 AND `f` BETWEEN 0 AND 1""" assert result == expected def test_where_analyze_scalar_op(self): # root cause of #310 table = self.con.table('functional_alltypes') expr = (table.filter([table.timestamp_col < (ibis.timestamp('2010-01-01') + ibis.month(3)), table.timestamp_col < (ibis.now() + ibis.day(10))]) .count()) result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND `timestamp_col` < days_add(now(), 10)""" assert result == expected def test_simple_aggregate_query(self): t1 = self.con.table('star1') cases = [ (t1.aggregate([t1['f'].sum().name('total')], [t1['foo_id']]), """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1"""), (t1.aggregate([t1['f'].sum().name('total')], ['foo_id', 'bar_id']), """SELECT `foo_id`, `bar_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1, 2""") ] for expr, expected_sql in cases: result_sql = to_sql(expr) assert result_sql == expected_sql def test_aggregate_having(self): # Filtering post-aggregation predicate t1 = self.con.table('star1') total = t1.f.sum().name('total') metrics = [total] expr = t1.aggregate(metrics, by=['foo_id'], having=[total > 10]) result = to_sql(expr) expected = """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 HAVING sum(`f`) > 10""" assert result == expected expr = t1.aggregate(metrics, by=['foo_id'], having=[t1.count() > 100]) result = to_sql(expr) expected = """SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 HAVING count(*) > 100""" assert result == expected def test_aggregate_table_count_metric(self): expr = self.con.table('star1').count() result = to_sql(expr) expected = """SELECT count(*) AS `tmp` FROM star1""" assert result == expected # count on more complicated table region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') join_expr = region.r_regionkey == nation.n_regionkey joined = region.inner_join(nation, join_expr) table_ref = joined[nation, region.r_name.name('region')] expr = table_ref.count() result = to_sql(expr) expected = """SELECT count(*) AS `tmp` FROM ( SELECT t2.*, t1.`r_name` AS `region` FROM tpch_region t1 INNER JOIN tpch_nation t2 ON t1.`r_regionkey` = t2.`n_regionkey` ) t0""" assert result == expected def test_expr_template_field_name_binding(self): # Given an expression with no concrete links to actual database tables, # indicate a mapping between the distinct unbound table leaves of the # expression and some database tables with compatible schemas but # potentially different column names pass def test_no_aliases_needed(self): table = api.table([ ('key1', 'string'), ('key2', 'string'), ('value', 'double') ]) expr = table.aggregate([table['value'].sum().name('total')], by=['key1', 'key2']) query = _get_query(expr) context = query.context assert not context.need_aliases() def test_table_names_overlap_default_aliases(self): # see discussion in #104; this actually is not needed for query # correctness, and only makes the generated SQL nicer raise unittest.SkipTest t0 = api.table([ ('key', 'string'), ('v1', 'double') ], 't1') t1 = api.table([ ('key', 'string'), ('v2', 'double') ], 't0') expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2] result = to_sql(expr) expected = """\ SELECT t2.`key`, t2.`v1`, t3.`v2` FROM t0 t2 INNER JOIN t1 t3 ON t2.`key` = t3.`key`""" assert result == expected def test_context_aliases_multiple_join(self): t1 = self.con.table('star1') t2 = self.con.table('star2') t3 = self.con.table('star3') expr = (t1.left_join(t2, [t1['foo_id'] == t2['foo_id']]) .inner_join(t3, [t1['bar_id'] == t3['bar_id']]) [[t1, t2['value1'], t3['value2']]]) query = _get_query(expr) context = query.context assert context.get_alias(t1) == 't0' assert context.get_alias(t2) == 't1' assert context.get_alias(t3) == 't2' def test_fuse_projections(self): table = api.table([ ('foo', 'int32'), ('bar', 'int64'), ('value', 'double') ], name='tbl') # Cases where we project in both cases using the base table reference f1 = (table['foo'] + table['bar']).name('baz') pred = table['value'] > 0 table2 = table[table, f1] table2_filtered = table2[pred] f2 = (table2['foo'] * 2).name('qux') f3 = (table['foo'] * 2).name('qux') table3 = table2.projection([table2, f2]) # fusion works even if there's a filter table3_filtered = table2_filtered.projection([table2, f2]) expected = table[table, f1, f3] expected2 = table[pred][table, f1, f3] assert table3.equals(expected) assert table3_filtered.equals(expected2) ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux` FROM tbl""" ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux` FROM tbl WHERE `value` > 0""" table3_sql = to_sql(table3) table3_filt_sql = to_sql(table3_filtered) assert table3_sql == ex_sql assert table3_filt_sql == ex_sql2 # Use the intermediate table refs table3 = table2.projection([table2, f2]) # fusion works even if there's a filter table3_filtered = table2_filtered.projection([table2, f2]) expected = table[table, f1, f3] expected2 = table[pred][table, f1, f3] assert table3.equals(expected) assert table3_filtered.equals(expected2) def test_bug_project_multiple_times(self): # 108 customer = self.con.table('tpch_customer') nation = self.con.table('tpch_nation') region = self.con.table('tpch_region') joined = ( customer.inner_join(nation, [customer.c_nationkey == nation.n_nationkey]) .inner_join(region, [nation.n_regionkey == region.r_regionkey]) ) proj1 = [customer, nation.n_name, region.r_name] step1 = joined[proj1] topk_by = step1.c_acctbal.cast('double').sum() pred = step1.n_name.topk(10, by=topk_by) proj_exprs = [step1.c_name, step1.r_name, step1.n_name] step2 = step1[pred] expr = step2.projection(proj_exprs) # it works! result = to_sql(expr) expected = """\ SELECT `c_name`, `r_name`, `n_name` FROM ( SELECT t1.*, t2.`n_name`, t3.`r_name` FROM tpch_customer t1 INNER JOIN tpch_nation t2 ON t1.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_region t3 ON t2.`n_regionkey` = t3.`r_regionkey` LEFT SEMI JOIN ( SELECT t2.`n_name`, sum(CAST(t1.`c_acctbal` AS double)) AS `sum` FROM tpch_customer t1 INNER JOIN tpch_nation t2 ON t1.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_region t3 ON t2.`n_regionkey` = t3.`r_regionkey` GROUP BY 1 ORDER BY `sum` DESC LIMIT 10 ) t4 ON t2.`n_name` = t4.`n_name` ) t0""" assert result == expected def test_aggregate_projection_subquery(self): t = self.con.table('alltypes') proj = t[t.f > 0][t, (t.a + t.b).name('foo')] def agg(x): return x.aggregate([x.foo.sum().name('foo total')], by=['g']) # predicate gets pushed down filtered = proj[proj.g == 'bar'] result = to_sql(filtered) expected = """SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 AND `g` = 'bar'""" assert result == expected agged = agg(filtered) result = to_sql(agged) expected = """SELECT `g`, sum(`foo`) AS `foo total` FROM ( SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 AND `g` = 'bar' ) t0 GROUP BY 1""" assert result == expected # Pushdown is not possible (in Impala, Postgres, others) agged2 = agg(proj[proj.foo < 10]) result = to_sql(agged2) expected = """SELECT t0.`g`, sum(t0.`foo`) AS `foo total` FROM ( SELECT *, `a` + `b` AS `foo` FROM alltypes WHERE `f` > 0 ) t0 WHERE t0.`foo` < 10 GROUP BY 1""" assert result == expected def test_subquery_aliased(self): t1 = self.con.table('star1') t2 = self.con.table('star2') agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id']) what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id]) [agged, t2.value1]) result = to_sql(what) expected = """SELECT t0.*, t1.`value1` FROM ( SELECT `foo_id`, sum(`f`) AS `total` FROM star1 GROUP BY 1 ) t0 INNER JOIN star2 t1 ON t0.`foo_id` = t1.`foo_id`""" assert result == expected def test_double_nested_subquery_no_aliases(self): # We don't require any table aliasing anywhere t = api.table([ ('key1', 'string'), ('key2', 'string'), ('key3', 'string'), ('value', 'double') ], 'foo_table') agg1 = t.aggregate([t.value.sum().name('total')], by=['key1', 'key2', 'key3']) agg2 = agg1.aggregate([agg1.total.sum().name('total')], by=['key1', 'key2']) agg3 = agg2.aggregate([agg2.total.sum().name('total')], by=['key1']) result = to_sql(agg3) expected = """SELECT `key1`, sum(`total`) AS `total` FROM ( SELECT `key1`, `key2`, sum(`total`) AS `total` FROM ( SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total` FROM foo_table GROUP BY 1, 2, 3 ) t1 GROUP BY 1, 2 ) t0 GROUP BY 1""" assert result == expected def test_aggregate_projection_alias_bug(self): # Observed in use t1 = self.con.table('star1') t2 = self.con.table('star2') what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]) [[t1, t2.value1]]) what = what.aggregate([what.value1.sum().name('total')], by=[what.foo_id]) # TODO: Not fusing the aggregation with the projection yet result = to_sql(what) expected = """SELECT `foo_id`, sum(`value1`) AS `total` FROM ( SELECT t1.*, t2.`value1` FROM star1 t1 INNER JOIN star2 t2 ON t1.`foo_id` = t2.`foo_id` ) t0 GROUP BY 1""" assert result == expected def test_aggregate_fuse_with_projection(self): # see above test case pass def test_subquery_used_for_self_join(self): # There could be cases that should look in SQL like # WITH t0 as (some subquery) # select ... # from t0 t1 # join t0 t2 # on t1.kind = t2.subkind # ... # However, the Ibis code will simply have an expression (projection or # aggregation, say) built on top of the subquery expression, so we need # to extract the subquery unit (we see that it appears multiple times # in the tree). t = self.con.table('alltypes') agged = t.aggregate([t.f.sum().name('total')], by=['g', 'a', 'b']) view = agged.view() metrics = [(agged.total - view.total).max().name('metric')] reagged = (agged.inner_join(view, [agged.a == view.b]) .aggregate(metrics, by=[agged.g])) result = to_sql(reagged) expected = """WITH t0 AS ( SELECT `g`, `a`, `b`, sum(`f`) AS `total` FROM alltypes GROUP BY 1, 2, 3 ) SELECT t0.`g`, max(t0.`total` - t1.`total`) AS `metric` FROM t0 INNER JOIN t0 t1 ON t0.`a` = t1.`b` GROUP BY 1""" assert result == expected def test_subquery_factor_correlated_subquery(self): # #173, #183 and other issues region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') customer = self.con.table('tpch_customer') orders = self.con.table('tpch_orders') fields_of_interest = [customer, region.r_name.name('region'), orders.o_totalprice.name('amount'), orders.o_orderdate .cast('timestamp').name('odate')] tpch = (region.join(nation, region.r_regionkey == nation.n_regionkey) .join(customer, customer.c_nationkey == nation.n_nationkey) .join(orders, orders.o_custkey == customer.c_custkey) [fields_of_interest]) # Self-reference + correlated subquery complicates things t2 = tpch.view() conditional_avg = t2[t2.region == tpch.region].amount.mean() amount_filter = tpch.amount > conditional_avg expr = tpch[amount_filter].limit(10) result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT t5.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`, CAST(t3.`o_orderdate` AS timestamp) AS `odate` FROM tpch_region t1 INNER JOIN tpch_nation t2 ON t1.`r_regionkey` = t2.`n_regionkey` INNER JOIN tpch_customer t5 ON t5.`c_nationkey` = t2.`n_nationkey` INNER JOIN tpch_orders t3 ON t3.`o_custkey` = t5.`c_custkey` ) SELECT t0.* FROM t0 WHERE t0.`amount` > ( SELECT avg(t4.`amount`) AS `tmp` FROM t0 t4 WHERE t4.`region` = t0.`region` ) LIMIT 10""" assert result == expected def test_self_join_subquery_distinct_equal(self): region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') j1 = (region.join(nation, region.r_regionkey == nation.n_regionkey) [region, nation]) j2 = (region.join(nation, region.r_regionkey == nation.n_regionkey) [region, nation].view()) expr = (j1.join(j2, j1.r_regionkey == j2.r_regionkey) [j1.r_name, j2.n_name]) result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT t2.*, t3.* FROM tpch_region t2 INNER JOIN tpch_nation t3 ON t2.`r_regionkey` = t3.`n_regionkey` ) SELECT t0.`r_name`, t1.`n_name` FROM t0 INNER JOIN t0 t1 ON t0.`r_regionkey` = t1.`r_regionkey`""" assert result == expected def test_limit_with_self_join(self): t = self.con.table('functional_alltypes') t2 = t.view() expr = t.join(t2, t.tinyint_col < t2.timestamp_col.minute()).count() # it works result = to_sql(expr) expected = """\ SELECT count(*) AS `tmp` FROM functional_alltypes t0 INNER JOIN functional_alltypes t1 ON t0.`tinyint_col` < extract(t1.`timestamp_col`, 'minute')""" assert result == expected def test_cte_factor_distinct_but_equal(self): t = self.con.table('alltypes') tt = self.con.table('alltypes') expr1 = t.group_by('g').aggregate(t.f.sum().name('metric')) expr2 = tt.group_by('g').aggregate(tt.f.sum().name('metric')).view() expr = expr1.join(expr2, expr1.g == expr2.g)[[expr1]] result = to_sql(expr) expected = """\ WITH t0 AS ( SELECT `g`, sum(`f`) AS `metric` FROM alltypes GROUP BY 1 ) SELECT t0.* FROM t0 INNER JOIN t0 t1 ON t0.`g` = t1.`g`""" assert result == expected def test_tpch_self_join_failure(self): # duplicating the integration test here region = self.con.table('tpch_region') nation = self.con.table('tpch_nation') customer = self.con.table('tpch_customer') orders = self.con.table('tpch_orders') fields_of_interest = [ region.r_name.name('region'), nation.n_name.name('nation'), orders.o_totalprice.name('amount'), orders.o_orderdate.cast('timestamp').name('odate')] joined_all = ( region.join(nation, region.r_regionkey == nation.n_regionkey) .join(customer, customer.c_nationkey == nation.n_nationkey) .join(orders, orders.o_custkey == customer.c_custkey) [fields_of_interest]) year = joined_all.odate.year().name('year') total = joined_all.amount.sum().cast('double').name('total') annual_amounts = (joined_all .group_by(['region', year]) .aggregate(total)) current = annual_amounts prior = annual_amounts.view() yoy_change = (current.total - prior.total).name('yoy_change') yoy = (current.join(prior, current.year == (prior.year - 1)) [current.region, current.year, yoy_change]) to_sql(yoy) def test_extract_subquery_nested_lower(self): # We may have a join between two tables requiring subqueries, and # buried inside these there may be a common subquery. Let's test that # we find it and pull it out to the top level to avoid repeating # ourselves. pass def test_subquery_in_filter_predicate(self): # E.g. comparing against some scalar aggregate value. See Ibis #43 t1 = self.con.table('star1') pred = t1.f > t1.f.mean() expr = t1[pred] # This brought out another expression rewriting bug, since the filtered # table isn't found elsewhere in the expression. pred2 = t1.f > t1[t1.foo_id == 'foo'].f.mean() expr2 = t1[pred2] result = to_sql(expr) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT avg(`f`) AS `tmp` FROM star1 )""" assert result == expected result = to_sql(expr2) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT avg(`f`) AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected def test_filter_subquery_derived_reduction(self): t1 = self.con.table('star1') # Reduction can be nested inside some scalar expression pred3 = t1.f > t1[t1.foo_id == 'foo'].f.mean().log() pred4 = t1.f > (t1[t1.foo_id == 'foo'].f.mean().log() + 1) expr3 = t1[pred3] result = to_sql(expr3) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT ln(avg(`f`)) AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected expr4 = t1[pred4] result = to_sql(expr4) expected = """SELECT * FROM star1 WHERE `f` > ( SELECT ln(avg(`f`)) + 1 AS `tmp` FROM star1 WHERE `foo_id` = 'foo' )""" assert result == expected def test_topk_operation_to_semi_join(self): # TODO: top K with filter in place table = api.table([ ('foo', 'string'), ('bar', 'string'), ('city', 'string'), ('v1', 'double'), ('v2', 'double'), ], 'tbl') what = table.city.topk(10, by=table.v2.mean()) filtered = table[what] query = to_sql(filtered) expected = """SELECT t0.* FROM tbl t0 LEFT SEMI JOIN ( SELECT `city`, avg(`v2`) AS `mean` FROM tbl GROUP BY 1 ORDER BY `mean` DESC LIMIT 10 ) t1 ON t0.`city` = t1.`city`""" assert query == expected # Test the default metric (count) what = table.city.topk(10) filtered2 = table[what] query = to_sql(filtered2) expected = """SELECT t0.* FROM tbl t0 LEFT SEMI JOIN ( SELECT `city`, count(`city`) AS `count` FROM tbl GROUP BY 1 ORDER BY `count` DESC LIMIT 10 ) t1 ON t0.`city` = t1.`city`""" assert query == expected def test_topk_predicate_pushdown_bug(self): # Observed on TPCH data cplusgeo = ( customer.inner_join(nation, [customer.c_nationkey == nation.n_nationkey]) .inner_join(region, [nation.n_regionkey == region.r_regionkey]) [customer, nation.n_name, region.r_name]) pred = cplusgeo.n_name.topk(10, by=cplusgeo.c_acctbal.sum()) expr = cplusgeo.filter([pred]) result = to_sql(expr) expected = """\ SELECT t0.*, t1.`n_name`, t2.`r_name` FROM customer t0 INNER JOIN nation t1 ON t0.`c_nationkey` = t1.`n_nationkey` INNER JOIN region t2 ON t1.`n_regionkey` = t2.`r_regionkey` LEFT SEMI JOIN ( SELECT t1.`n_name`, sum(t0.`c_acctbal`) AS `sum` FROM customer t0 INNER JOIN nation t1 ON t0.`c_nationkey` = t1.`n_nationkey` INNER JOIN region t2 ON t1.`n_regionkey` = t2.`r_regionkey` GROUP BY 1 ORDER BY `sum` DESC LIMIT 10 ) t3 ON t1.`n_name` = t3.`n_name`""" assert result == expected def test_topk_analysis_bug(self): # GH #398 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) expr = t[delay_filter].group_by('origin').size() result = to_sql(expr) expected = """\ SELECT t0.`origin`, count(*) AS `count` FROM airlines t0 LEFT SEMI JOIN ( SELECT `dest`, avg(`arrdelay`) AS `mean` FROM airlines WHERE `dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1 ORDER BY `mean` DESC LIMIT 10 ) t1 ON t0.`dest` = t1.`dest` WHERE t0.`dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1""" assert result == expected def test_topk_to_aggregate(self): t = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') top = t.dest.topk(10, by=t.arrdelay.mean()) result = to_sql(top) expected = to_sql(top.to_aggregation()) assert result == expected def test_bottomk(self): pass def test_topk_antijoin(self): # Get the "other" category somehow pass def test_case_in_projection(self): t = self.con.table('alltypes') expr = (t.g.case() .when('foo', 'bar') .when('baz', 'qux') .else_('default').end()) expr2 = (api.case() .when(t.g == 'foo', 'bar') .when(t.g == 'baz', t.g) .end()) proj = t[expr.name('col1'), expr2.name('col2'), t] result = to_sql(proj) expected = """SELECT CASE `g` WHEN 'foo' THEN 'bar' WHEN 'baz' THEN 'qux' ELSE 'default' END AS `col1`, CASE WHEN `g` = 'foo' THEN 'bar' WHEN `g` = 'baz' THEN `g` ELSE NULL END AS `col2`, * FROM alltypes""" assert result == expected def test_identifier_quoting(self): data = api.table([ ('date', 'int32'), ('explain', 'string') ], 'table') expr = data[data.date.name('else'), data.explain.name('join')] result = to_sql(expr) expected = """SELECT `date` AS `else`, `explain` AS `join` FROM `table`""" assert result == expected
class TestBuiltins(unittest.TestCase): def setUp(self): self.con = MockConnection() self.alltypes = self.con.table('functional_alltypes') self.lineitem = self.con.table('tpch_lineitem') def test_abs(self): colnames = [ 'tinyint_col', 'smallint_col', 'int_col', 'bigint_col', 'float_col', 'double_col' ] fname = 'abs' op = ops.Abs for col in colnames: expr = self.alltypes[col] self._check_unary_op(expr, fname, op, type(expr)) expr = self.lineitem.l_extendedprice self._check_unary_op(expr, fname, op, type(expr)) def test_group_concat(self): col = self.alltypes.string_col expr = col.group_concat() assert isinstance(expr.op(), ops.GroupConcat) arg, sep = expr.op().args assert sep == ',' expr = col.group_concat('|') arg, sep = expr.op().args assert sep == '|' def test_zeroifnull(self): dresult = self.alltypes.double_col.zeroifnull() iresult = self.alltypes.int_col.zeroifnull() assert type(dresult.op()) == ops.ZeroIfNull assert type(dresult) == ir.DoubleArray # Impala upconverts all ints to bigint. Hmm. assert type(iresult) == type(iresult) def test_fillna(self): result = self.alltypes.double_col.fillna(5) assert isinstance(result, ir.DoubleArray) assert isinstance(result.op(), ops.IfNull) result = self.alltypes.bool_col.fillna(True) assert isinstance(result, ir.BooleanArray) # Retains type of caller (for now) result = self.alltypes.int_col.fillna(self.alltypes.bigint_col) assert isinstance(result, ir.Int32Array) def test_ceil_floor(self): cresult = self.alltypes.double_col.ceil() fresult = self.alltypes.double_col.floor() assert isinstance(cresult, ir.Int64Array) assert isinstance(fresult, ir.Int64Array) assert type(cresult.op()) == ops.Ceil assert type(fresult.op()) == ops.Floor cresult = ibis.literal(1.2345).ceil() fresult = ibis.literal(1.2345).floor() assert isinstance(cresult, ir.Int64Scalar) assert isinstance(fresult, ir.Int64Scalar) dec_col = self.lineitem.l_extendedprice cresult = dec_col.ceil() fresult = dec_col.floor() assert isinstance(cresult, ir.DecimalArray) assert cresult.meta == dec_col.meta assert isinstance(fresult, ir.DecimalArray) assert fresult.meta == dec_col.meta def test_sign(self): result = self.alltypes.double_col.sign() assert isinstance(result, ir.FloatArray) assert type(result.op()) == ops.Sign result = ibis.literal(1.2345).sign() assert isinstance(result, ir.FloatScalar) dec_col = self.lineitem.l_extendedprice result = dec_col.sign() assert isinstance(result, ir.FloatArray) def test_round(self): result = self.alltypes.double_col.round() assert isinstance(result, ir.Int64Array) assert result.op().args[1] is None result = self.alltypes.double_col.round(2) assert isinstance(result, ir.DoubleArray) assert result.op().args[1] == 2 # Even integers are double (at least in Impala, check with other DB # implementations) result = self.alltypes.int_col.round(2) assert isinstance(result, ir.DoubleArray) dec = self.lineitem.l_extendedprice result = dec.round() assert isinstance(result, ir.DecimalArray) result = dec.round(2) assert isinstance(result, ir.DecimalArray) result = ibis.literal(1.2345).round() assert isinstance(result, ir.Int64Scalar) def _check_unary_op(self, expr, fname, ex_op, ex_type): result = getattr(expr, fname)() assert type(result.op()) == ex_op assert type(result) == ex_type