@unittest.skip('see discussion in #104: not needed for query correctness, '
               'only makes the generated SQL nicer')
def test_table_names_overlap_default_aliases(self):
    # The tables are deliberately named 't1' and 't0', i.e. like the aliases
    # that also appear in generated SQL elsewhere; the expected query aliases
    # them as t2/t3 instead.
    #
    # Skipped through the decorator (with a reason) rather than a bare
    # ``raise`` at the top of the body, so the body below is no longer dead
    # code sitting behind an unconditional raise.
    t0 = api.table([('key', 'string'), ('v1', 'double')], 't1')
    t1 = api.table([('key', 'string'), ('v2', 'double')], 't0')

    expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2]

    result = to_sql(expr)
    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """\
SELECT t2.`key`, t2.`v1`, t3.`v2`
FROM t0 t2
  INNER JOIN t1 t3
    ON t2.`key` = t3.`key`"""

    assert result == expected
def test_projection_with_join_pushdown_rewrite_refs(self):
    # Observed this expression IR issue in a TopK-rewrite context.
    #
    # Filtering the projection on one of its own columns must yield a
    # Projection over a Filter whose predicate refers to the underlying
    # joined table's column.
    foo = api.table([
        ('a_key1', 'string'),
        ('a_key2', 'string'),
        ('a_value', 'double'),
    ], 'foo')
    bar = api.table([
        ('b_key1', 'string'),
        ('b_name', 'string'),
        ('b_value', 'double'),
    ], 'bar')
    baz = api.table([
        ('c_key2', 'string'),
        ('c_name', 'string'),
    ], 'baz')

    joined = foo.inner_join(bar, [('a_key1', 'b_key1')])
    joined = joined.inner_join(baz, [(foo.a_key2, baz.c_key2)])
    proj = joined[foo,
                  bar.b_name.name('b'),
                  baz.c_name.name('c'),
                  bar.b_value]

    # (predicate on the projection, predicate expected after push-down)
    cases = [
        (proj.a_value > 0, foo.a_value > 0),
        (proj.b_value > 0, bar.b_value > 0),
    ]

    for higher_pred, lower_pred in cases:
        projection_op = proj.filter([higher_pred]).op()
        assert isinstance(projection_op, ops.Projection)

        filter_op = projection_op.table.op()
        assert isinstance(filter_op, ops.Filter)

        assert_equal(filter_op.predicates[0], lower_pred)
def test_nameless_table(self):
    # Ensure that user gets some kind of sensible error
    nameless = api.table([('key', 'string')])
    with self.assertRaises(com.RelationError):
        to_sql(nameless)

    # The same schema with a name compiles to a plain SELECT.
    with_name = api.table([('key', 'string')], name='baz')
    compiled = to_sql(with_name)
    assert compiled == 'SELECT *\nFROM baz'
def test_projection_invalid_root(self):
    # Two distinct tables share one schema; projecting the second table's
    # columns out of the first must raise RelationError.
    schema1 = {'foo': 'double', 'bar': 'int32'}

    left = api.table(schema1, name='foo')
    right = api.table(schema1, name='bar')

    foreign_columns = [right[column] for column in ('foo', 'bar')]
    with self.assertRaises(RelationError):
        left.projection(foreign_columns)
def test_projection_invalid_root(table):
    # Two distinct tables share one schema; projecting the second table's
    # columns out of the first must raise RelationError.
    schema1 = {'foo': 'double', 'bar': 'int32'}

    left = api.table(schema1, name='foo')
    right = api.table(schema1, name='bar')

    foreign_columns = [right[column] for column in ('foo', 'bar')]
    with pytest.raises(RelationError):
        left.projection(foreign_columns)
def setUp(self):
    # Build the four tables the tests read from ``self``. Note that ``t1``
    # and ``t2`` reuse the table names 'foo' and 'bar' with other schemas.
    foo_schema = [
        ('job', 'string'),
        ('dept_id', 'string'),
        ('year', 'int32'),
        ('y', 'double'),
    ]
    bar_schema = [('x', 'double'), ('job', 'string')]
    t1_schema = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ]
    t2_schema = [('key1', 'string'), ('key2', 'string')]

    self.foo = api.table(foo_schema, 'foo')
    self.bar = api.table(bar_schema, 'bar')
    self.t1 = api.table(t1_schema, 'foo')
    self.t2 = api.table(t2_schema, 'bar')
def test_projection_invalid_root(self):
    # Two distinct tables share one schema; projecting the second table's
    # columns out of the first must raise RelationError.
    schema1 = {'foo': 'double', 'bar': 'int32'}

    left = api.table(schema1, name='foo')
    right = api.table(schema1, name='bar')

    foreign_columns = [right[column] for column in ('foo', 'bar')]
    with self.assertRaises(RelationError):
        left.projection(foreign_columns)
def test_join_between_joins(self):
    # Join two projections that are themselves built on joins, then project
    # columns from both sides of the outer join.
    first = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ], 'first')
    second = api.table([
        ('key1', 'string'),
        ('value2', 'double'),
    ], 'second')
    third = api.table([
        ('key2', 'string'),
        ('key3', 'string'),
        ('value3', 'double'),
    ], 'third')
    fourth = api.table([
        ('key3', 'string'),
        ('value4', 'double'),
    ], 'fourth')

    left_join = first.inner_join(second, [('key1', 'key1')])
    left = left_join[first, second.value2]

    right_join = third.inner_join(fourth, [('key3', 'key3')])
    right = right_join[third, fourth.value4]

    joined = left.inner_join(right, [('key2', 'key2')])

    # At one point, the expression simplification was resulting in bad refs
    # here (right.value3 referencing the table inside the right join)
    projected = joined.projection([left, right.value3, right.value4])

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT t0.*, t1.`value3`, t1.`value4`
FROM (
  SELECT t2.*, t3.`value2`
  FROM `first` t2
    INNER JOIN second t3
      ON t2.`key1` = t3.`key1`
) t0
  INNER JOIN (
    SELECT t2.*, t3.`value4`
    FROM third t2
      INNER JOIN fourth t3
        ON t2.`key3` = t3.`key3`
  ) t1
    ON t0.`key2` = t1.`key2`"""

    assert to_sql(projected) == expected
def test_double_nested_subquery_no_aliases(self):
    # We don't require any table aliasing anywhere
    t = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('key3', 'string'),
        ('value', 'double'),
    ], 'foo_table')

    def sum_total(source, keys):
        # Re-aggregate the ``total`` column of ``source`` grouped by ``keys``.
        return source.aggregate([source.total.sum().name('total')], by=keys)

    agg1 = t.aggregate([t.value.sum().name('total')],
                       by=['key1', 'key2', 'key3'])
    agg2 = sum_total(agg1, ['key1', 'key2'])
    agg3 = sum_total(agg2, ['key1'])

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT `key1`, sum(`total`) AS `total`
FROM (
  SELECT `key1`, `key2`, sum(`total`) AS `total`
  FROM (
    SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total`
    FROM foo_table
    GROUP BY 1, 2, 3
  ) t1
  GROUP BY 1, 2
) t0
GROUP BY 1"""

    assert to_sql(agg3) == expected
def test_join_between_joins(self):
    # Join two projections that are themselves built on joins, then project
    # columns from both sides of the outer join.
    first = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ], 'first')
    second = api.table([
        ('key1', 'string'),
        ('value2', 'double'),
    ], 'second')
    third = api.table([
        ('key2', 'string'),
        ('key3', 'string'),
        ('value3', 'double'),
    ], 'third')
    fourth = api.table([
        ('key3', 'string'),
        ('value4', 'double'),
    ], 'fourth')

    left_join = first.inner_join(second, [('key1', 'key1')])
    left = left_join[first, second.value2]

    right_join = third.inner_join(fourth, [('key3', 'key3')])
    right = right_join[third, fourth.value4]

    joined = left.inner_join(right, [('key2', 'key2')])

    # At one point, the expression simplification was resulting in bad refs
    # here (right.value3 referencing the table inside the right join)
    projected = joined.projection([left, right.value3, right.value4])

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT t0.*, t1.`value3`, t1.`value4`
FROM (
  SELECT t2.*, t3.`value2`
  FROM `first` t2
    INNER JOIN second t3
      ON t2.`key1` = t3.`key1`
) t0
  INNER JOIN (
    SELECT t2.*, t3.`value4`
    FROM third t2
      INNER JOIN fourth t3
        ON t2.`key3` = t3.`key3`
  ) t1
    ON t0.`key2` = t1.`key2`"""

    assert to_sql(projected) == expected
def test_replace_column(table):
    # ``set_column('b', expr)`` must equal the projection that keeps ``a``
    # and ``c`` and puts ``expr`` (named 'b') in the middle.
    base = api.table([
        ('a', 'int32'),
        ('b', 'double'),
        ('c', 'string'),
    ])

    as_int = base.b.cast('int32')
    replaced = base.set_column('b', as_int)

    expected = base[base.a, as_int.name('b'), base.c]
    assert_equal(replaced, expected)
def setUp(self):
    # One column per type name below, on a table called 'testing'.
    column_types = [
        ('v1', 'decimal(12, 2)'),
        ('v2', 'decimal(10, 4)'),
        ('v3', 'int32'),
        ('v4', 'int64'),
        ('v5', 'float'),
        ('v6', 'double'),
        ('v7', 'string'),
        ('v8', 'boolean'),
    ]
    self.table = api.table(column_types, 'testing')

    # The API functions the tests iterate over.
    self.functions = [api.coalesce, api.greatest, api.least]
def test_identifier_quoting(self):
    # Table, column and alias names here are all SQL keywords-looking
    # identifiers; the expected SQL wraps each of them in backticks.
    data = api.table([
        ('date', 'int32'),
        ('explain', 'string'),
    ], 'table')

    renamed_date = data.date.name('else')
    renamed_explain = data.explain.name('join')
    expr = data[renamed_date, renamed_explain]

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT `date` AS `else`, `explain` AS `join`
FROM `table`"""

    assert to_sql(expr) == expected
@unittest.skip('see discussion in #104: not needed for query correctness, '
               'only makes the generated SQL nicer')
def test_table_names_overlap_default_aliases(self):
    # The tables are deliberately named 't1' and 't0', i.e. like the aliases
    # that also appear in generated SQL elsewhere; the expected query aliases
    # them as t2/t3 instead.
    #
    # Skipped through the decorator (with a reason) rather than a bare
    # ``raise`` at the top of the body, so the body below is no longer dead
    # code sitting behind an unconditional raise.
    t0 = api.table([('key', 'string'), ('v1', 'double')], 't1')
    t1 = api.table([('key', 'string'), ('v2', 'double')], 't0')

    expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2]

    result = to_sql(expr)
    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """\
SELECT t2.`key`, t2.`v1`, t3.`v2`
FROM t0 t2
  INNER JOIN t1 t3
    ON t2.`key` = t3.`key`"""

    assert result == expected
def test_no_aliases_needed(self):
    # A single-table aggregation: the query context must report that no
    # aliases are needed.
    table = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('value', 'double'),
    ])

    total = table['value'].sum().name('total')
    expr = table.aggregate([total], by=['key1', 'key2'])

    context = _get_query(expr).context
    assert not context.need_aliases()
def test_column_relabel(table):
    # GH #551. Keeping the test case very high level to not presume that
    # the relabel is necessarily implemented using a projection
    dtypes = ['int32', 'string', 'double']

    original = api.table(zip(['foo', 'bar', 'baz'], dtypes))
    relabeled = original.relabel({'foo': 'one', 'baz': 'three'})

    # 'bar' is not in the mapping, so it keeps its name.
    expected_schema = api.schema(zip(['one', 'bar', 'three'], dtypes))
    assert_equal(relabeled.schema(), expected_schema)
def test_column_relabel(self):
    # GH #551. Keeping the test case very high level to not presume that
    # the relabel is necessarily implemented using a projection
    dtypes = ['int32', 'string', 'double']

    original = api.table(zip(['foo', 'bar', 'baz'], dtypes))
    relabeled = original.relabel({'foo': 'one', 'baz': 'three'})

    # 'bar' is not in the mapping, so it keeps its name.
    expected_schema = api.schema(zip(['one', 'bar', 'three'], dtypes))
    assert_equal(relabeled.schema(), expected_schema)
def test_identifier_quoting(self):
    # Table, column and alias names here are all SQL keywords-looking
    # identifiers; the expected SQL wraps each of them in backticks.
    data = api.table([
        ('date', 'int32'),
        ('explain', 'string'),
    ], 'table')

    renamed_date = data.date.name('else')
    renamed_explain = data.explain.name('join')
    expr = data[renamed_date, renamed_explain]

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT `date` AS `else`, `explain` AS `join`
FROM `table`"""

    assert to_sql(expr) == expected
def test_fuse_projections(self):
    # A projection stacked on another projection (optionally with a filter
    # in between) must equal a single projection over the base table.
    base = api.table([
        ('foo', 'int32'),
        ('bar', 'int64'),
        ('value', 'double'),
    ], name='tbl')

    # Cases where we project in both cases using the base table reference
    baz = (base['foo'] + base['bar']).name('baz')
    positive = base['value'] > 0

    with_baz = base[base, baz]
    with_baz_filtered = with_baz[positive]

    qux_from_projection = (with_baz['foo'] * 2).name('qux')
    qux_from_base = (base['foo'] * 2).name('qux')

    fused = with_baz.projection([with_baz, qux_from_projection])

    # fusion works even if there's a filter
    fused_filtered = with_baz_filtered.projection(
        [with_baz, qux_from_projection])

    expected = base[base, baz, qux_from_base]
    expected_filtered = base[positive][base, baz, qux_from_base]

    assert fused.equals(expected)
    assert fused_filtered.equals(expected_filtered)

    # NOTE(review): whitespace inside these literals was reconstructed from
    # a collapsed view of the file -- confirm it matches the compiler output.
    ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl"""

    ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl
WHERE `value` > 0"""

    fused_sql = to_sql(fused)
    fused_filtered_sql = to_sql(fused_filtered)

    assert fused_sql == ex_sql
    assert fused_filtered_sql == ex_sql2

    # Use the intermediate table refs
    # NOTE(review): this section builds exactly the same expressions as the
    # first one above -- confirm whether a different reference was intended.
    fused = with_baz.projection([with_baz, qux_from_projection])

    # fusion works even if there's a filter
    fused_filtered = with_baz_filtered.projection(
        [with_baz, qux_from_projection])

    expected = base[base, baz, qux_from_base]
    expected_filtered = base[positive][base, baz, qux_from_base]

    assert fused.equals(expected)
    assert fused_filtered.equals(expected_filtered)
def test_no_aliases_needed(self):
    # A single-table aggregation: the query context must report that no
    # aliases are needed.
    table = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('value', 'double'),
    ])

    total = table['value'].sum().name('total')
    expr = table.aggregate([total], by=['key1', 'key2'])

    context = _get_query(expr).context
    assert not context.need_aliases()
def setUp(self):
    # One column per type name below, on a table called 'testing'.
    column_types = [
        ('v1', 'decimal(12, 2)'),
        ('v2', 'decimal(10, 4)'),
        ('v3', 'int32'),
        ('v4', 'int64'),
        ('v5', 'float'),
        ('v6', 'double'),
        ('v7', 'string'),
        ('v8', 'boolean'),
    ]
    self.table = api.table(column_types, 'testing')

    # The API functions the tests iterate over.
    self.functions = [api.coalesce, api.greatest, api.least]
def test_fuse_projections(self):
    # A projection stacked on another projection (optionally with a filter
    # in between) must equal a single projection over the base table.
    base = api.table([
        ('foo', 'int32'),
        ('bar', 'int64'),
        ('value', 'double'),
    ], name='tbl')

    # Cases where we project in both cases using the base table reference
    baz = (base['foo'] + base['bar']).name('baz')
    positive = base['value'] > 0

    with_baz = base[base, baz]
    with_baz_filtered = with_baz[positive]

    qux_from_projection = (with_baz['foo'] * 2).name('qux')
    qux_from_base = (base['foo'] * 2).name('qux')

    fused = with_baz.projection([with_baz, qux_from_projection])

    # fusion works even if there's a filter
    fused_filtered = with_baz_filtered.projection(
        [with_baz, qux_from_projection])

    expected = base[base, baz, qux_from_base]
    expected_filtered = base[positive][base, baz, qux_from_base]

    assert fused.equals(expected)
    assert fused_filtered.equals(expected_filtered)

    # NOTE(review): whitespace inside these literals was reconstructed from
    # a collapsed view of the file -- confirm it matches the compiler output.
    ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl"""

    ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl
WHERE `value` > 0"""

    fused_sql = to_sql(fused)
    fused_filtered_sql = to_sql(fused_filtered)

    assert fused_sql == ex_sql
    assert fused_filtered_sql == ex_sql2

    # Use the intermediate table refs
    # NOTE(review): this section builds exactly the same expressions as the
    # first one above -- confirm whether a different reference was intended.
    fused = with_baz.projection([with_baz, qux_from_projection])

    # fusion works even if there's a filter
    fused_filtered = with_baz_filtered.projection(
        [with_baz, qux_from_projection])

    expected = base[base, baz, qux_from_base]
    expected_filtered = base[positive][base, baz, qux_from_base]

    assert fused.equals(expected)
    assert fused_filtered.equals(expected_filtered)
def test_projection_with_join_pushdown_rewrite_refs(self):
    # Observed this expression IR issue in a TopK-rewrite context.
    #
    # Filtering the projection on one of its own columns must yield a
    # Projection over a Filter whose predicate refers to the underlying
    # joined table's column.
    foo = api.table([
        ('a_key1', 'string'),
        ('a_key2', 'string'),
        ('a_value', 'double'),
    ], 'foo')
    bar = api.table([
        ('b_key1', 'string'),
        ('b_name', 'string'),
        ('b_value', 'double'),
    ], 'bar')
    baz = api.table([
        ('c_key2', 'string'),
        ('c_name', 'string'),
    ], 'baz')

    joined = foo.inner_join(bar, [('a_key1', 'b_key1')])
    joined = joined.inner_join(baz, [(foo.a_key2, baz.c_key2)])
    proj = joined[foo,
                  bar.b_name.name('b'),
                  baz.c_name.name('c'),
                  bar.b_value]

    # (predicate on the projection, predicate expected after push-down)
    cases = [
        (proj.a_value > 0, foo.a_value > 0),
        (proj.b_value > 0, bar.b_value > 0),
    ]

    for higher_pred, lower_pred in cases:
        projection_op = proj.filter([higher_pred]).op()
        assert isinstance(projection_op, ops.Projection)

        filter_op = projection_op.table.op()
        assert isinstance(filter_op, ops.Filter)

        assert_equal(filter_op.predicates[0], lower_pred)
def setUp(self):
    # Build the four tables the tests read from ``self``. Note that ``t1``
    # and ``t2`` reuse the table names 'foo' and 'bar' with other schemas.
    foo_schema = [
        ('job', 'string'),
        ('dept_id', 'string'),
        ('year', 'int32'),
        ('y', 'double'),
    ]
    bar_schema = [('x', 'double'), ('job', 'string')]
    t1_schema = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ]
    t2_schema = [('key1', 'string'), ('key2', 'string')]

    self.foo = api.table(foo_schema, 'foo')
    self.bar = api.table(bar_schema, 'bar')
    self.t1 = api.table(t1_schema, 'foo')
    self.t2 = api.table(t2_schema, 'bar')
def test_topk_operation_to_semi_join(self):
    # Using a top-K expression as a table filter must compile to a
    # LEFT SEMI JOIN against the ranked, limited aggregation.
    # TODO: top K with filter in place
    table = api.table([
        ('foo', 'string'),
        ('bar', 'string'),
        ('city', 'string'),
        ('v1', 'double'),
        ('v2', 'double'),
    ], 'tbl')

    # Explicit metric: mean of v2.
    top_by_mean = table.city.topk(10, by=table.v2.mean())
    filtered = table[top_by_mean]

    # NOTE(review): whitespace inside these literals was reconstructed from
    # a collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, avg(`v2`) AS `mean`
    FROM tbl
    GROUP BY 1
    ORDER BY `mean` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
    assert to_sql(filtered) == expected

    # Test the default metric (count)
    top_by_count = table.city.topk(10)
    filtered2 = table[top_by_count]

    expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, count(`city`) AS `count`
    FROM tbl
    GROUP BY 1
    ORDER BY `count` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
    assert to_sql(filtered2) == expected
def test_double_nested_subquery_no_aliases(self):
    # We don't require any table aliasing anywhere
    t = api.table([
        ('key1', 'string'),
        ('key2', 'string'),
        ('key3', 'string'),
        ('value', 'double'),
    ], 'foo_table')

    def sum_total(source, keys):
        # Re-aggregate the ``total`` column of ``source`` grouped by ``keys``.
        return source.aggregate([source.total.sum().name('total')], by=keys)

    agg1 = t.aggregate([t.value.sum().name('total')],
                       by=['key1', 'key2', 'key3'])
    agg2 = sum_total(agg1, ['key1', 'key2'])
    agg3 = sum_total(agg2, ['key1'])

    # NOTE(review): whitespace inside this literal was reconstructed from a
    # collapsed view of the file -- confirm it matches the compiler output.
    expected = """SELECT `key1`, sum(`total`) AS `total`
FROM (
  SELECT `key1`, `key2`, sum(`total`) AS `total`
  FROM (
    SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total`
    FROM foo_table
    GROUP BY 1, 2, 3
  ) t1
  GROUP BY 1, 2
) t0
GROUP BY 1"""

    assert to_sql(agg3) == expected
def test_invalid_predicate(table, schema):
    # a lookalike: same schema as ``table`` but a separate table object, so
    # a predicate built on it must be rejected by ``table.filter``.
    lookalike = api.table(schema, name='bar')
    foreign_predicate = lookalike.a > 5

    with pytest.raises(RelationError):
        table.filter(foreign_predicate)
# NOTE(review): the statements down to the first ``def`` are the tail of a
# test whose ``def`` line is outside this view; they are kept unchanged and
# need to sit at that method's body indentation.
reduction = self.table.g.isnull().ifelse(1, 0).sum()

result = to_sql(reduction)
# NOTE(review): whitespace inside this literal was reconstructed from a
# collapsed view of the file -- confirm it matches the compiler output.
expected = """\
SELECT sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END) AS `tmp`
FROM alltypes"""
assert result == expected


def _get_query(expr):
    """Build the AST for ``expr`` and return its first query."""
    return build_ast(expr).queries[0]


# Module-level tables shared by the tests in this file.
nation = api.table([
    ('n_regionkey', 'int32'),
    ('n_nationkey', 'int32'),
    ('n_name', 'string'),
], 'nation')

region = api.table([
    ('r_regionkey', 'int32'),
    ('r_name', 'string'),
], 'region')

customer = api.table([
    ('c_nationkey', 'int32'),
    ('c_name', 'string'),
    ('c_acctbal', 'double'),
], 'customer')
# NOTE(review): the statements down to the first ``def`` are the tail of a
# test whose ``def`` line is outside this view; they are kept unchanged and
# need to sit at that method's body indentation.
# aggregation
reduction = self.table.g.isnull().ifelse(1, 0).sum()

result = to_sql(reduction)
# NOTE(review): whitespace inside this literal was reconstructed from a
# collapsed view of the file -- confirm it matches the compiler output.
expected = """\
SELECT sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END) AS `tmp`
FROM alltypes"""
assert result == expected


def _get_query(expr):
    """Build the AST for ``expr`` and return its first query."""
    return build_ast(expr).queries[0]


# Module-level tables shared by the tests in this file.
nation = api.table([
    ('n_regionkey', 'int32'),
    ('n_nationkey', 'int32'),
    ('n_name', 'string'),
], 'nation')

region = api.table([
    ('r_regionkey', 'int32'),
    ('r_name', 'string'),
], 'region')

customer = api.table([
    ('c_nationkey', 'int32'),
    ('c_name', 'string'),
    ('c_acctbal', 'double'),
], 'customer')


class TestSelectSQL(unittest.TestCase):

    def setUp(self):
        # Make a MockConnection available to the tests as ``self.con``.
        self.con = MockConnection()

    def test_nameless_table(self):
        # Ensure that user gets some kind of sensible error
        nameless = api.table([('key', 'string')])
        with self.assertRaises(com.RelationError):
            to_sql(nameless)
def test_empty_schema():
    # A table created from an empty column list has a zero-length schema.
    empty = api.table([], 'foo')
    schema = empty.schema()
    assert len(schema) == 0
def test_invalid_predicate(table, schema):
    # a lookalike: same schema as ``table`` but a separate table object, so
    # indexing ``table`` with a predicate built on it must be rejected.
    lookalike = api.table(schema, name='bar')

    with pytest.raises(RelationError):
        # The predicate is built inside the block, as before.
        table[lookalike.a > 5]
def test_empty_schema():
    # The schema of a table created from an empty column list is falsy.
    empty = api.table([], 'foo')
    schema = empty.schema()
    assert not schema
def test_invalid_predicate(self):
    # a lookalike: same schema as ``self.table`` but a separate table
    # object, so indexing with a predicate built on it must be rejected.
    lookalike = api.table(self.schema, name='bar')

    # Built outside the ``with`` block: only the indexing is expected to
    # raise, exactly as when the predicate was passed to assertRaises.
    foreign_predicate = lookalike['a'] > 5

    with self.assertRaises(RelationError):
        self.table[foreign_predicate]
def test_empty_schema(self):
    # A table created from an empty column list has a zero-length schema.
    empty = api.table([], 'foo')
    schema = empty.schema()
    assert len(schema) == 0