Example #1
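Every snippet on this page is a test class excerpted from the ibis test suite, shown without its module-level imports. As a minimal sketch, a typical preamble looks like the following; the exact import paths (MockConnection and the compiler helpers in particular) moved between ibis releases, so treat each one as an assumption rather than a fixed API:

import unittest

import ibis
import ibis.expr.api as api  # in the UDF example, "api" is the Impala UDF API instead
import ibis.expr.operations as ops
import ibis.expr.types as ir
from ibis import config, literal

# Assumed paths; these helpers moved around between ibis releases.
from ibis.expr.datatypes import validate_type      # UDF wrapping example
from ibis.impala import ddl                        # InsertSelect example
from ibis.impala import kudu_support as ksupport   # CTASKudu example
from ibis.impala.compiler import build_ast, to_sql
from ibis.tests.mocks import MockConnection

L = literal  # short alias used in several snippets

# Mixins and helpers such as ExprSQLTest, assert_equal, _get_query and
# _get_select come from the surrounding test modules and are not shown here.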
class TestInteractiveUse(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

    def test_interactive_execute_on_repr(self):
        table = self.con.table('functional_alltypes')
        expr = table.bigint_col.sum()
        with config.option_context('interactive', True):
            repr(expr)

        assert len(self.con.executed_queries) > 0

    def test_default_limit(self):
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            repr(table)

        expected = """\
SELECT *
FROM functional_alltypes
LIMIT {0}""".format(config.options.sql.default_limit)

        assert self.con.executed_queries[0] == expected

    def test_disable_query_limit(self):
        table = self.con.table('functional_alltypes')

        with config.option_context('interactive', True):
            with config.option_context('sql.default_limit', None):
                repr(table)

        expected = """\
SELECT *
FROM functional_alltypes"""

        assert self.con.executed_queries[0] == expected

    def test_interactive_non_compilable_repr_not_fail(self):
        # #170
        table = self.con.table('functional_alltypes')

        expr = table.string_col.topk(3)

        # it works!
        with config.option_context('interactive', True):
            repr(expr)

    def test_histogram_repr_no_query_execute(self):
        t = self.con.table('functional_alltypes')
        tier = t.double_col.histogram(10).name('bucket')
        expr = t.group_by(tier).size()
        with config.option_context('interactive', True):
            expr._repr()
        assert self.con.executed_queries == []
Example #2
class TestDistinct(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

    def test_simple_table_distinct(self):
        t = self.con.table('functional_alltypes')

        expr = t[t.string_col, t.int_col].distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`, `int_col`
FROM functional_alltypes"""
        assert result == expected

    def test_array_distinct(self):
        t = self.con.table('functional_alltypes')
        expr = t.string_col.distinct()

        result = to_sql(expr)
        expected = """SELECT DISTINCT `string_col`
FROM functional_alltypes"""
        assert result == expected

    def test_count_distinct(self):
        t = self.con.table('functional_alltypes')

        metric = t.int_col.nunique().name('nunique')
        expr = t[t.bigint_col > 0].group_by('string_col').aggregate([metric])

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `nunique`
FROM functional_alltypes
WHERE `bigint_col` > 0
GROUP BY 1"""
        assert result == expected

    def test_multiple_count_distinct(self):
        # Impala and some other databases will not execute multiple
        # count-distincts in a single aggregation query. This error reporting
        # will be left to the database itself, for now.
        t = self.con.table('functional_alltypes')
        metrics = [
            t.int_col.nunique().name('int_card'),
            t.smallint_col.nunique().name('smallint_card')
        ]

        expr = t.group_by('string_col').aggregate(metrics)

        result = to_sql(expr)
        expected = """SELECT `string_col`, COUNT(DISTINCT `int_col`) AS `int_card`,
       COUNT(DISTINCT `smallint_col`) AS `smallint_card`
FROM functional_alltypes
GROUP BY 1"""
        assert result == expected
Example #3
class TestAnalytics(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        t = self.alltypes

        tier = t.double_col.bucket([0, 50, 100]).name('tier')
        expr = t[tier, t]

        assert isinstance(expr.tier, ir.CategoryArray)

    def test_bucket(self):
        d = self.alltypes.double_col
        bins = [0, 10, 50, 100]

        expr = d.bucket(bins)
        assert isinstance(expr, ir.CategoryArray)
        assert expr.op().nbuckets == 3

        expr = d.bucket(bins, include_over=True)
        assert expr.op().nbuckets == 4

        expr = d.bucket(bins, include_over=True, include_under=True)
        assert expr.op().nbuckets == 5

    def test_bucket_error_cases(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.bucket, [])
        self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo')

        # it works!
        d.bucket([10], include_under=True, include_over=True)

        self.assertRaises(ValueError, d.bucket, [10])
        self.assertRaises(ValueError, d.bucket, [10], include_under=True)
        self.assertRaises(ValueError, d.bucket, [10], include_over=True)

    def test_histogram(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5)
        self.assertRaises(ValueError, d.histogram)
        self.assertRaises(ValueError, d.histogram, 10, closed='foo')

    def test_topk_analysis_bug(self):
        # GH #398
        airlines = ibis.table([('dest', 'string'), ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
        filtered = t.filter([delay_filter])

        # predicate is unmodified by analysis
        post_pred = filtered.op().predicates[1]
        assert delay_filter.equals(post_pred)
Example #4
    def test_ctas_ddl(self):
        con = MockConnection()

        select = build_ast(con.table('test1')).queries[0]
        statement = ksupport.CTASKudu(
            'another_table',
            'kudu_name',
            ['dom.d.com:7051'],
            select,
            ['string_col'],
            external=True,
            can_exist=False,
            database='foo',
        )
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
TBLPROPERTIES (
  'kudu.key_columns'='string_col',
  'kudu.master_addresses'='dom.d.com:7051',
  'kudu.table_name'='kudu_name',
  'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler'
) AS
SELECT *
FROM test1"""
        assert result == expected
Example #5
class TestCoalesceGreaterLeast(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_coalesce(self):
        t = self.table
        cases = [
            (ibis.coalesce(t.string_col, 'foo'),
             "coalesce(`string_col`, 'foo')"),
            (ibis.coalesce(t.int_col, t.bigint_col),
             'coalesce(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_greatest(self):
        t = self.table
        cases = [
            (ibis.greatest(t.string_col, 'foo'),
             "greatest(`string_col`, 'foo')"),
            (ibis.greatest(t.int_col, t.bigint_col),
             'greatest(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)

    def test_least(self):
        t = self.table
        cases = [
            (ibis.least(t.string_col, 'foo'),
             "least(`string_col`, 'foo')"),
            (ibis.least(t.int_col, t.bigint_col),
             'least(`int_col`, `bigint_col`)'),
        ]
        self._check_expr_cases(cases)
Example #6
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_analytic_exprs(self):
        t = self.table

        w = ibis.window(order_by=t.float_col)

        cases = [
            (ibis.row_number().over(w),
             '(row_number() OVER (ORDER BY `float_col`) - 1)'),
            (t.string_col.lag(), 'lag(`string_col`)'),
            (t.string_col.lag(2), 'lag(`string_col`, 2)'),
            (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'),
            (t.string_col.lead(), 'lead(`string_col`)'),
            (t.string_col.lead(2), 'lead(`string_col`, 2)'),
            (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'),
            (t.double_col.first(), 'first_value(`double_col`)'),
            (t.double_col.last(), 'last_value(`double_col`)'),
            # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
        ]
        self._check_expr_cases(cases)
Example #7
class TestInNotIn(unittest.TestCase, ExprSQLTest):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("alltypes")

    def test_field_in_literals(self):
        cases = [
            (self.table.g.isin(["foo", "bar", "baz"]), "`g` IN ('foo', 'bar', 'baz')"),
            (self.table.g.notin(["foo", "bar", "baz"]), "`g` NOT IN ('foo', 'bar', 'baz')"),
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]), "2 IN (`a`, `b`, `c`)"),
            (L(2).notin([self.table.a, self.table.b, self.table.c]), "2 NOT IN (`a`, `b`, `c`)"),
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN ('foo', 'bar')"""
        assert result == expected
Example #8
class TestUnions(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

        table = self.con.table('functional_alltypes')

        self.t1 = (table[table.int_col > 0][
            table.string_col.name('key'),
            table.float_col.cast('double').name('value')])
        self.t2 = (table[table.int_col <= 0][table.string_col.name('key'),
                                             table.double_col.name('value')])

        self.union1 = self.t1.union(self.t2)

    def test_union(self):
        result = to_sql(self.union1)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION ALL
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_distinct(self):
        union = self.t1.union(self.t2, distinct=True)
        result = to_sql(union)
        expected = """\
SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
FROM functional_alltypes
WHERE `int_col` > 0
UNION
SELECT `string_col` AS `key`, `double_col` AS `value`
FROM functional_alltypes
WHERE `int_col` <= 0"""
        assert result == expected

    def test_union_project_column(self):
        # select a column, get a subquery
        expr = self.union1[[self.union1.key]]
        result = to_sql(expr)
        expected = """SELECT `key`
FROM (
  SELECT `string_col` AS `key`, CAST(`float_col` AS double) AS `value`
  FROM functional_alltypes
  WHERE `int_col` > 0
  UNION ALL
  SELECT `string_col` AS `key`, `double_col` AS `value`
  FROM functional_alltypes
  WHERE `int_col` <= 0
) t0"""
        assert result == expected

    def test_union_extract_with_block(self):
        pass

    def test_union_in_subquery(self):
        pass
Example #9
class TestAnalyticFunctions(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_analytic_exprs(self):
        t = self.table

        w = ibis.window(order_by=t.float_col)

        cases = [
            (ibis.row_number().over(w),
             'row_number() OVER (ORDER BY `float_col`) - 1'),
            (t.string_col.lag(), 'lag(`string_col`)'),
            (t.string_col.lag(2), 'lag(`string_col`, 2)'),
            (t.string_col.lag(default=0), 'lag(`string_col`, 1, 0)'),
            (t.string_col.lead(), 'lead(`string_col`)'),
            (t.string_col.lead(2), 'lead(`string_col`, 2)'),
            (t.string_col.lead(default=0), 'lead(`string_col`, 1, 0)'),
            (t.double_col.first(), 'first_value(`double_col`)'),
            (t.double_col.last(), 'last_value(`double_col`)'),
            # (t.double_col.nth(4), 'first_value(lag(double_col, 4 - 1))')
        ]
        self._check_expr_cases(cases)
Example #10
class TestInsert(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_select_overwrite(self):
        pass
Example #11
class TestInNotIn(unittest.TestCase, ExprSQLTest):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        cases = [(self.table.g.isin(["foo", "bar",
                                     "baz"]), "g IN ('foo', 'bar', 'baz')"),
                 (self.table.g.notin(["foo", "bar", "baz"]),
                  "g NOT IN ('foo', 'bar', 'baz')")]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (ibis.literal(2).isin([self.table.a, self.table.b,
                                   self.table.c]), '2 IN (a, b, c)'),
            (ibis.literal(2).notin([self.table.a, self.table.b,
                                    self.table.c]), '2 NOT IN (a, b, c)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        filtered = self.table[self.table.g.isin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g IN ('foo', 'bar')"""
        assert result == expected

        filtered = self.table[self.table.g.notin(["foo", "bar"])]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE g NOT IN ('foo', 'bar')"""
        assert result == expected
Example #12
class TestDistinct(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_distinct_basic(self):
        expr = self.table.distinct()
        assert isinstance(expr.op(), ops.Distinct)
        assert isinstance(expr, ir.TableExpr)
        assert expr.op().table is self.table

        expr = self.table.string_col.distinct()
        assert isinstance(expr.op(), ops.DistinctArray)
        assert isinstance(expr, ir.StringArray)

    # TODO
    # def test_distinct_array_interactions(self):
    #     # array cardinalities / shapes are likely to be different.
    #     a = self.table.int_col.distinct()
    #     b = self.table.bigint_col
    #     self.assertRaises(ir.RelationError, a.__add__, b)

    def test_distinct_count(self):
        result = self.table.string_col.distinct().count()
        expected = self.table.string_col.nunique().name('count')
        assert_equal(result, expected)
        assert isinstance(result.op(), ops.CountDistinct)

    def test_distinct_unnamed_array_expr(self):
        table = ibis.table([('year', 'int32'),
                            ('month', 'int32'),
                            ('day', 'int32')], 'foo')

        # it works!
        expr = (ibis.literal('-')
                .join([table.year.cast('string'),
                       table.month.cast('string'),
                       table.day.cast('string')])
                .distinct())
        repr(expr)

    def test_distinct_count_numeric_types(self):
        table = self.table
        metric = (table.bigint_col.distinct().count()
                  .name('unique_bigints'))

        table.group_by('string_col').aggregate(metric)

    def test_nunique(self):
        expr = self.table.string_col.nunique()
        assert isinstance(expr.op(), ops.CountDistinct)

    def test_project_with_distinct(self):
        pass
Example #13
    def test_memoize_database_table(self):
        con = MockConnection()
        table = con.table('test1')
        table2 = con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        formatted = repr(result)
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1
Example #14
class TestInNotIn(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_field_in_literals(self):
        values = ['foo', 'bar', 'baz']
        values_formatted = tuple(set(values))
        cases = [
            (self.table.g.isin(values),
             "`g` IN {}".format(values_formatted)),
            (self.table.g.notin(values),
             "`g` NOT IN {}".format(values_formatted))
        ]
        self._check_expr_cases(cases)

    def test_literal_in_list(self):
        cases = [
            (L(2).isin([self.table.a, self.table.b, self.table.c]),
             '2 IN (`a`, `b`, `c`)'),
            (L(2).notin([self.table.a, self.table.b, self.table.c]),
             '2 NOT IN (`a`, `b`, `c`)')
        ]
        self._check_expr_cases(cases)

    def test_isin_notin_in_select(self):
        values = ['foo', 'bar']
        values_formatted = tuple(set(values))

        filtered = self.table[self.table.g.isin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` IN {}"""
        assert result == expected.format(values_formatted)

        filtered = self.table[self.table.g.notin(values)]
        result = to_sql(filtered)
        expected = """SELECT *
FROM alltypes
WHERE `g` NOT IN {}"""
        assert result == expected.format(values_formatted)
Example #15
class TestStringOps(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_lower_upper(self):
        lresult = self.table.g.lower()
        uresult = self.table.g.upper()

        assert isinstance(lresult, ir.StringArray)
        assert isinstance(uresult, ir.StringArray)

        assert isinstance(lresult.op(), ops.Lowercase)
        assert isinstance(uresult.op(), ops.Uppercase)

        lit = literal('FoO')

        lresult = lit.lower()
        uresult = lit.upper()
        assert isinstance(lresult, ir.StringScalar)
        assert isinstance(uresult, ir.StringScalar)

    def test_substr(self):
        lit = literal('FoO')

        result = self.table.g.substr(2, 4)
        lit_result = lit.substr(0, 2)

        assert isinstance(result, ir.StringArray)
        assert isinstance(lit_result, ir.StringScalar)

        op = result.op()
        assert isinstance(op, ops.Substring)

        start, length = op.args[1:]

        assert start.equals(literal(2))
        assert length.equals(literal(4))

    def test_left_right(self):
        result = self.table.g.left(5)
        expected = self.table.g.substr(0, 5)
        assert result.equals(expected)

        result = self.table.g.right(5)
        op = result.op()
        assert isinstance(op, ops.StrRight)
        assert op.args[1].equals(literal(5))

    def test_length(self):
        lit = literal('FoO')
        result = self.table.g.length()
        lit_result = lit.length()

        assert isinstance(result, ir.Int32Array)
        assert isinstance(lit_result, ir.Int32Scalar)
        assert isinstance(result.op(), ops.StringLength)

    def test_join(self):
        dash = literal('-')

        expr = dash.join([self.table.f.cast('string'), self.table.g])
        assert isinstance(expr, ir.StringArray)

        expr = dash.join([literal('ab'), literal('cd')])
        assert isinstance(expr, ir.StringScalar)

    def test_contains(self):
        expr = self.table.g.contains('foo')
        expected = self.table.g.like('%foo%')
        assert_equal(expr, expected)

        self.assertRaises(Exception, lambda: 'foo' in self.table.g)

    def test_getitem_slice(self):
        cases = [
            (self.table.g[:3], self.table.g.substr(0, 3)),
            (self.table.g[2:6], self.table.g.substr(2, 4)),
        ]

        for case, expected in cases:
            assert_equal(case, expected)
Example #16
class TestWrapping(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.identity('hello world') AS `tmp`")

    def test_sql_generation_from_infoclass(self):
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert (ibis.impala.compile(result) ==
                "SELECT udf_testing.info_test('hello world') AS `tmp`")

    def test_udf_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == type(ibis_type.scalar_type()(expr.op()))  # noqa: E501, E721
            expr = func(av)
            assert type(expr) == type(ibis_type.array_type()(expr.op()))  # noqa: E501, E721

    def test_uda_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            expected_type1 = type(ibis_type.scalar_type()(expr1.op()))
            expected_type2 = type(ibis_type.scalar_type()(expr2.op()))
            assert isinstance(expr1, expected_type1)
            assert isinstance(expr2, expected_type2)

    def test_decimal(self):
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalColumn

    def test_udf_invalid_typecasting(self):
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),

            # allowing double here for now
            ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),

            ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ColumnExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
Example #17
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("functional_alltypes")

    def test_numeric_unary_builtins(self):
        # No argument functions
        functions = [
            "abs",
            "ceil",
            "floor",
            "exp",
            "sqrt",
            "sign",
            ("log", "ln"),
            ("approx_median", "appx_median"),
            ("approx_nunique", "ndv"),
            "ln",
            "log2",
            "log10",
            "nullifzero",
            "zeroifnull",
        ]

        cases = []
        for what in functions:
            if isinstance(what, tuple):
                ibis_name, sql_name = what
            else:
                ibis_name = sql_name = what

            for cname in ["double_col", "int_col"]:
                expr = getattr(self.table[cname], ibis_name)()
                cases.append((expr, "{0}(`{1}`)".format(sql_name, cname)))

        self._check_expr_cases(cases)

    def test_log_other_bases(self):
        cases = [(self.table.double_col.log(5), "log(`double_col`, 5)")]
        self._check_expr_cases(cases)

    def test_round(self):
        cases = [
            (self.table.double_col.round(), "round(`double_col`)"),
            (self.table.double_col.round(0), "round(`double_col`, 0)"),
            (self.table.double_col.round(2), "round(`double_col`, 2)"),
            (self.table.double_col.round(self.table.tinyint_col), "round(`double_col`, `tinyint_col`)"),
        ]
        self._check_expr_cases(cases)

    def test_hash(self):
        expr = self.table.int_col.hash()
        assert isinstance(expr, ir.Int64Array)
        assert isinstance(self.table.int_col.sum().hash(), ir.Int64Scalar)

        cases = [(self.table.int_col.hash(), "fnv_hash(`int_col`)")]
        self._check_expr_cases(cases)

    def test_reduction_where(self):
        cond = self.table.bigint_col < 70
        c = self.table.double_col
        tmp = "{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` " "ELSE NULL END)"
        cases = [
            (c.sum(where=cond), tmp.format("sum")),
            (c.count(where=cond), tmp.format("count")),
            (c.mean(where=cond), tmp.format("avg")),
            (c.max(where=cond), tmp.format("max")),
            (c.min(where=cond), tmp.format("min")),
            (c.std(where=cond), tmp.format("stddev")),
            (c.std(where=cond, how="pop"), tmp.format("stddev_pop")),
            (c.var(where=cond), tmp.format("variance")),
            (c.var(where=cond, how="pop"), tmp.format("variance_pop")),
        ]
        self._check_expr_cases(cases)

    def test_reduction_invalid_where(self):
        condbad_literal = L("T")
        c = self.table.double_col
        for reduction in [c.sum, c.count, c.mean, c.max, c.min]:
            with self.assertRaises(TypeError):
                reduction(where=condbad_literal)
Example #18
class TestBucketHistogram(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_bucket_to_case(self):
        buckets = [0, 10, 25, 50]

        expr1 = self.table.f.bucket(buckets)
        expected1 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr2 = self.table.f.bucket(buckets, close_extreme=False)
        expected2 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` < 50) THEN 2
  ELSE NULL
END"""

        expr3 = self.table.f.bucket(buckets, closed='right')
        expected3 = """\
CASE
  WHEN (`f` >= 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr4 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False)
        expected4 = """\
CASE
  WHEN (`f` > 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr5 = self.table.f.bucket(buckets, include_under=True)
        expected5 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr6 = self.table.f.bucket(buckets,
                                    include_under=True,
                                    include_over=True)
        expected6 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  WHEN `f` > 50 THEN 4
  ELSE NULL
END"""

        expr7 = self.table.f.bucket(buckets,
                                    close_extreme=False,
                                    include_under=True,
                                    include_over=True)
        expected7 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` < 50) THEN 3
  WHEN `f` >= 50 THEN 4
  ELSE NULL
END"""

        expr8 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False,
                                    include_under=True)
        expected8 = """\
CASE
  WHEN `f` <= 0 THEN 0
  WHEN (`f` > 0) AND (`f` <= 10) THEN 1
  WHEN (`f` > 10) AND (`f` <= 25) THEN 2
  WHEN (`f` > 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr9 = self.table.f.bucket([10], closed='right',
                                    include_over=True,
                                    include_under=True)
        expected9 = """\
CASE
  WHEN `f` <= 10 THEN 0
  WHEN `f` > 10 THEN 1
  ELSE NULL
END"""

        expr10 = self.table.f.bucket([10], include_over=True,
                                     include_under=True)
        expected10 = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        cases = [
            (expr1, expected1),
            (expr2, expected2),
            (expr3, expected3),
            (expr4, expected4),
            (expr5, expected5),
            (expr6, expected6),
            (expr7, expected7),
            (expr8, expected8),
            (expr9, expected9),
            (expr10, expected10),
        ]
        self._check_expr_cases(cases)

    def test_cast_category_to_int_noop(self):
        # Because the bucket result is an integer, no explicit cast is
        # necessary
        expr = (self.table.f.bucket([10], include_over=True,
                                    include_under=True)
                .cast('int32'))

        expected = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        expr2 = (self.table.f.bucket([10], include_over=True,
                                     include_under=True)
                 .cast('double'))

        expected2 = """\
CAST(CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END AS double)"""

        self._check_expr_cases([(expr, expected),
                                (expr2, expected2)])

    def test_bucket_assign_labels(self):
        buckets = [0, 10, 25, 50]
        bucket = self.table.f.bucket(buckets, include_under=True)

        size = self.table.group_by(bucket.name('tier')).size()
        labelled = size.tier.label(['Under 0', '0 to 10',
                                    '10 to 25', '25 to 50'],
                                   nulls='error').name('tier2')
        expr = size[labelled, size['count']]

        expected = """\
SELECT
  CASE `tier`
    WHEN 0 THEN 'Under 0'
    WHEN 1 THEN '0 to 10'
    WHEN 2 THEN '10 to 25'
    WHEN 3 THEN '25 to 50'
    ELSE 'error'
  END AS `tier2`, `count`
FROM (
  SELECT
    CASE
      WHEN `f` < 0 THEN 0
      WHEN (`f` >= 0) AND (`f` < 10) THEN 1
      WHEN (`f` >= 10) AND (`f` < 25) THEN 2
      WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
      ELSE NULL
    END AS `tier`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""

        result = to_sql(expr)

        assert result == expected

        self.assertRaises(ValueError, size.tier.label, ['a', 'b', 'c'])
        self.assertRaises(ValueError, size.tier.label,
                          ['a', 'b', 'c', 'd', 'e'])
Example #19
class TestSelectSQL(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

    def test_nameless_table(self):
        # Ensure that user gets some kind of sensible error
        nameless = api.table([('key', 'string')])
        self.assertRaises(com.RelationError, to_sql, nameless)

        with_name = api.table([('key', 'string')], name='baz')
        result = to_sql(with_name)
        assert result == 'SELECT *\nFROM baz'

    def test_physical_table_reference_translate(self):
        # If all of an expression's table leaves reference physical database
        # tables, verify that we translate correctly
        table = self.con.table('alltypes')

        query = _get_query(table)
        sql_string = query.compile()
        expected = "SELECT *\nFROM alltypes"
        assert sql_string == expected

    def test_simple_join_formatting(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        pred = t1['foo_id'] == t2['foo_id']
        pred2 = t1['bar_id'] == t2['foo_id']
        cases = [
            (t1.inner_join(t2, [pred])[[t1]], """SELECT t0.*
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            (t1.left_join(t2, [pred])[[t1]], """SELECT t0.*
FROM star1 t0
  LEFT OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            (t1.outer_join(t2, [pred])[[t1]], """SELECT t0.*
FROM star1 t0
  FULL OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            # multiple predicates
            (t1.inner_join(t2, [pred, pred2])[[t1]], """SELECT t0.*
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id` AND
       t0.`bar_id` = t1.`foo_id`"""),
        ]

        for expr, expected_sql in cases:
            result_sql = to_sql(expr)
            assert result_sql == expected_sql

    def test_multiple_join_cases(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')
        t3 = self.con.table('star3')

        predA = t1['foo_id'] == t2['foo_id']
        predB = t1['bar_id'] == t3['bar_id']

        what = (t1.left_join(t2, [predA]).inner_join(t3, [predB]).projection(
            [t1, t2['value1'], t3['value2']]))
        result_sql = to_sql(what)
        expected_sql = """SELECT t0.*, t1.`value1`, t2.`value2`
FROM star1 t0
  LEFT OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`
  INNER JOIN star3 t2
    ON t0.`bar_id` = t2.`bar_id`"""
        assert result_sql == expected_sql

    def test_join_between_joins(self):
        t1 = api.table([
            ('key1', 'string'),
            ('key2', 'string'),
            ('value1', 'double'),
        ], 'first')

        t2 = api.table([
            ('key1', 'string'),
            ('value2', 'double'),
        ], 'second')

        t3 = api.table([
            ('key2', 'string'),
            ('key3', 'string'),
            ('value3', 'double'),
        ], 'third')

        t4 = api.table([('key3', 'string'), ('value4', 'double')], 'fourth')

        left = t1.inner_join(t2, [('key1', 'key1')])[t1, t2.value2]
        right = t3.inner_join(t4, [('key3', 'key3')])[t3, t4.value4]

        joined = left.inner_join(right, [('key2', 'key2')])

        # At one point, the expression simplification was resulting in bad refs
        # here (right.value3 referencing the table inside the right join)
        exprs = [left, right.value3, right.value4]
        projected = joined.projection(exprs)

        result = to_sql(projected)
        expected = """SELECT t0.*, t1.`value3`, t1.`value4`
FROM (
  SELECT t2.*, t3.`value2`
  FROM `first` t2
    INNER JOIN second t3
      ON t2.`key1` = t3.`key1`
) t0
  INNER JOIN (
    SELECT t2.*, t3.`value4`
    FROM third t2
      INNER JOIN fourth t3
        ON t2.`key3` = t3.`key3`
  ) t1
    ON t0.`key2` = t1.`key2`"""
        assert result == expected

    def test_join_just_materialized(self):
        t1 = self.con.table('tpch_nation')
        t2 = self.con.table('tpch_region')
        t3 = self.con.table('tpch_customer')

        # GH #491
        joined = (t1.inner_join(t2,
                                t1.n_regionkey == t2.r_regionkey).inner_join(
                                    t3, t1.n_nationkey == t3.c_nationkey))
        result = to_sql(joined)
        expected = """SELECT *
FROM tpch_nation t0
  INNER JOIN tpch_region t1
    ON t0.`n_regionkey` = t1.`r_regionkey`
  INNER JOIN tpch_customer t2
    ON t0.`n_nationkey` = t2.`c_nationkey`"""
        assert result == expected

        result = to_sql(joined.materialize())
        assert result == expected

    def test_join_no_predicates_for_impala(self):
        # Impala requires that joins without predicates be written explicitly
        # as CROSS JOIN, since result sets can accidentally get too large if a
        # query is executed before predicates are written
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined2 = t1.cross_join(t2)[[t1]]

        expected = """SELECT t0.*
FROM star1 t0
  CROSS JOIN star2 t1"""
        result2 = to_sql(joined2)
        assert result2 == expected

        for jtype in ['inner_join', 'left_join', 'outer_join']:
            joined = getattr(t1, jtype)(t2)[[t1]]

            result = to_sql(joined)
            assert result == expected

    def test_semi_anti_joins(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined = t1.semi_join(t2, [t1.foo_id == t2.foo_id])[[t1]]

        result = to_sql(joined)
        expected = """SELECT t0.*
FROM star1 t0
  LEFT SEMI JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

        joined = t1.anti_join(t2, [t1.foo_id == t2.foo_id])[[t1]]
        result = to_sql(joined)
        expected = """SELECT t0.*
FROM star1 t0
  LEFT ANTI JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

    def test_self_reference_simple(self):
        t1 = self.con.table('star1')

        result_sql = to_sql(t1.view())
        expected_sql = "SELECT *\nFROM star1"
        assert result_sql == expected_sql

    def test_join_self_reference(self):
        t1 = self.con.table('star1')
        t2 = t1.view()

        result = t1.inner_join(t2, [t1.foo_id == t2.bar_id])[[t1]]

        result_sql = to_sql(result)
        expected_sql = """SELECT t0.*
FROM star1 t0
  INNER JOIN star1 t1
    ON t0.`foo_id` = t1.`bar_id`"""
        assert result_sql == expected_sql

    def test_join_projection_subquery_broken_alias(self):
        # From an observed bug, derived from tpch tables
        geo = (nation.inner_join(
            region,
            [('n_regionkey', 'r_regionkey')])[nation.n_nationkey,
                                              nation.n_name.name('nation'),
                                              region.r_name.name('region')])

        expr = (geo.inner_join(customer,
                               [('n_nationkey', 'c_nationkey')])[customer,
                                                                 geo])

        result = to_sql(expr)
        expected = """SELECT t1.*, t0.*
FROM (
  SELECT t2.`n_nationkey`, t2.`n_name` AS `nation`, t3.`r_name` AS `region`
  FROM nation t2
    INNER JOIN region t3
      ON t2.`n_regionkey` = t3.`r_regionkey`
) t0
  INNER JOIN customer t1
    ON t0.`n_nationkey` = t1.`c_nationkey`"""
        assert result == expected

    def test_where_simple_comparisons(self):
        t1 = self.con.table('star1')

        what = t1.filter([t1.f > 0, t1.c < t1.f * 2])

        result = to_sql(what)
        expected = """SELECT *
FROM star1
WHERE `f` > 0 AND
      `c` < (`f` * 2)"""
        assert result == expected

    def test_where_in_array_literal(self):
        # e.g.
        # where string_col in (v1, v2, v3)
        raise unittest.SkipTest

    def test_where_with_join(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        # This also tests some cases of predicate pushdown
        what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]).projection(
            [t1, t2.value1, t2.value3]).filter([t1.f > 0, t2.value3 < 1000]))

        what2 = (t1.inner_join(t2, [t1.foo_id == t2.foo_id]).filter(
            [t1.f > 0,
             t2.value3 < 1000]).projection([t1, t2.value1, t2.value3]))

        expected_sql = """SELECT t0.*, t1.`value1`, t1.`value3`
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`
WHERE t0.`f` > 0 AND
      t1.`value3` < 1000"""

        result_sql = to_sql(what)
        assert result_sql == expected_sql

        result2_sql = to_sql(what2)
        assert result2_sql == expected_sql

    def test_where_no_pushdown_possible(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined = (t1.inner_join(
            t2, [t1.foo_id == t2.foo_id])[t1, (t1.f - t2.value1).name('diff')])

        filtered = joined[joined.diff > 1]

        # TODO: I'm not sure if this is exactly what we want
        expected_sql = """SELECT *
FROM (
  SELECT t0.*, t0.`f` - t1.`value1` AS `diff`
  FROM star1 t0
    INNER JOIN star2 t1
      ON t0.`foo_id` = t1.`foo_id`
  WHERE t0.`f` > 0 AND
        t1.`value3` < 1000
)
WHERE `diff` > 1"""

        raise unittest.SkipTest

        result_sql = to_sql(filtered)
        assert result_sql == expected_sql

    def test_where_with_between(self):
        t = self.con.table('alltypes')

        what = t.filter([t.a > 0, t.f.between(0, 1)])
        result = to_sql(what)
        expected = """SELECT *
FROM alltypes
WHERE `a` > 0 AND
      `f` BETWEEN 0 AND 1"""
        assert result == expected

    def test_where_analyze_scalar_op(self):
        # root cause of #310

        table = self.con.table('functional_alltypes')

        expr = (table.filter([
            table.timestamp_col <
            (ibis.timestamp('2010-01-01') + ibis.month(3)),
            table.timestamp_col < (ibis.now() + ibis.day(10))
        ]).count())

        result = to_sql(expr)
        expected = """\
SELECT count(*) AS `tmp`
FROM functional_alltypes
WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND
      `timestamp_col` < days_add(now(), 10)"""
        assert result == expected

    def test_simple_aggregate_query(self):
        t1 = self.con.table('star1')

        cases = [(t1.aggregate([t1['f'].sum().name('total')], [t1['foo_id']]),
                  """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1"""),
                 (t1.aggregate([t1['f'].sum().name('total')],
                               ['foo_id', 'bar_id']),
                  """SELECT `foo_id`, `bar_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1, 2""")]
        for expr, expected_sql in cases:
            result_sql = to_sql(expr)
            assert result_sql == expected_sql

    def test_aggregate_having(self):
        # Filtering post-aggregation predicate
        t1 = self.con.table('star1')

        total = t1.f.sum().name('total')
        metrics = [total]

        expr = t1.aggregate(metrics, by=['foo_id'], having=[total > 10])
        result = to_sql(expr)
        expected = """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1
HAVING sum(`f`) > 10"""
        assert result == expected

        expr = t1.aggregate(metrics, by=['foo_id'], having=[t1.count() > 100])
        result = to_sql(expr)
        expected = """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1
HAVING count(*) > 100"""
        assert result == expected

    def test_aggregate_table_count_metric(self):
        expr = self.con.table('star1').count()

        result = to_sql(expr)
        expected = """SELECT count(*) AS `tmp`
FROM star1"""
        assert result == expected

        # count on more complicated table
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        join_expr = region.r_regionkey == nation.n_regionkey
        joined = region.inner_join(nation, join_expr)
        table_ref = joined[nation, region.r_name.name('region')]

        expr = table_ref.count()
        result = to_sql(expr)
        expected = """SELECT count(*) AS `tmp`
FROM (
  SELECT t2.*, t1.`r_name` AS `region`
  FROM tpch_region t1
    INNER JOIN tpch_nation t2
      ON t1.`r_regionkey` = t2.`n_regionkey`
) t0"""
        assert result == expected

    def test_expr_template_field_name_binding(self):
        # Given an expression with no concrete links to actual database tables,
        # indicate a mapping between the distinct unbound table leaves of the
        # expression and some database tables with compatible schemas but
        # potentially different column names
        pass

    def test_no_aliases_needed(self):
        table = api.table([('key1', 'string'), ('key2', 'string'),
                           ('value', 'double')])

        expr = table.aggregate([table['value'].sum().name('total')],
                               by=['key1', 'key2'])

        query = _get_query(expr)
        context = query.context
        assert not context.need_aliases()

    def test_table_names_overlap_default_aliases(self):
        # see discussion in #104; this actually is not needed for query
        # correctness, and only makes the generated SQL nicer
        raise unittest.SkipTest

        t0 = api.table([('key', 'string'), ('v1', 'double')], 't1')

        t1 = api.table([('key', 'string'), ('v2', 'double')], 't0')

        expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2]

        result = to_sql(expr)
        expected = """\
SELECT t2.`key`, t2.`v1`, t3.`v2`
FROM t0 t2
  INNER JOIN t1 t3
    ON t2.`key` = t3.`key`"""

        assert result == expected

    def test_context_aliases_multiple_join(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')
        t3 = self.con.table('star3')

        expr = (t1.left_join(t2, [t1['foo_id'] == t2['foo_id']]).inner_join(
            t3,
            [t1['bar_id'] == t3['bar_id']])[[t1, t2['value1'], t3['value2']]])

        query = _get_query(expr)
        context = query.context

        assert context.get_alias(t1) == 't0'
        assert context.get_alias(t2) == 't1'
        assert context.get_alias(t3) == 't2'

    def test_fuse_projections(self):
        table = api.table([('foo', 'int32'), ('bar', 'int64'),
                           ('value', 'double')],
                          name='tbl')

        # Cases where we project in both cases using the base table reference
        f1 = (table['foo'] + table['bar']).name('baz')
        pred = table['value'] > 0

        table2 = table[table, f1]
        table2_filtered = table2[pred]

        f2 = (table2['foo'] * 2).name('qux')
        f3 = (table['foo'] * 2).name('qux')

        table3 = table2.projection([table2, f2])

        # fusion works even if there's a filter
        table3_filtered = table2_filtered.projection([table2, f2])

        expected = table[table, f1, f3]
        expected2 = table[pred][table, f1, f3]

        assert table3.equals(expected)
        assert table3_filtered.equals(expected2)

        ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl"""

        ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl
WHERE `value` > 0"""

        table3_sql = to_sql(table3)
        table3_filt_sql = to_sql(table3_filtered)

        assert table3_sql == ex_sql
        assert table3_filt_sql == ex_sql2

        # Use the intermediate table refs
        table3 = table2.projection([table2, f2])

        # fusion works even if there's a filter
        table3_filtered = table2_filtered.projection([table2, f2])

        expected = table[table, f1, f3]
        expected2 = table[pred][table, f1, f3]

        assert table3.equals(expected)
        assert table3_filtered.equals(expected2)

    def test_bug_project_multiple_times(self):
        # 108
        customer = self.con.table('tpch_customer')
        nation = self.con.table('tpch_nation')
        region = self.con.table('tpch_region')

        joined = (customer
                  .inner_join(nation,
                              [customer.c_nationkey == nation.n_nationkey])
                  .inner_join(region,
                              [nation.n_regionkey == region.r_regionkey]))
        proj1 = [customer, nation.n_name, region.r_name]
        step1 = joined[proj1]

        topk_by = step1.c_acctbal.cast('double').sum()
        pred = step1.n_name.topk(10, by=topk_by)

        proj_exprs = [step1.c_name, step1.r_name, step1.n_name]
        step2 = step1[pred]
        expr = step2.projection(proj_exprs)

        # it works!
        result = to_sql(expr)
        expected = """\
SELECT `c_name`, `r_name`, `n_name`
FROM (
  SELECT t1.*, t2.`n_name`, t3.`r_name`
  FROM tpch_customer t1
    INNER JOIN tpch_nation t2
      ON t1.`c_nationkey` = t2.`n_nationkey`
    INNER JOIN tpch_region t3
      ON t2.`n_regionkey` = t3.`r_regionkey`
    LEFT SEMI JOIN (
      SELECT t2.`n_name`, sum(CAST(t1.`c_acctbal` AS double)) AS `sum`
      FROM tpch_customer t1
        INNER JOIN tpch_nation t2
          ON t1.`c_nationkey` = t2.`n_nationkey`
        INNER JOIN tpch_region t3
          ON t2.`n_regionkey` = t3.`r_regionkey`
      GROUP BY 1
      ORDER BY `sum` DESC
      LIMIT 10
    ) t4
      ON t2.`n_name` = t4.`n_name`
) t0"""
        assert result == expected

    def test_aggregate_projection_subquery(self):
        t = self.con.table('alltypes')

        proj = t[t.f > 0][t, (t.a + t.b).name('foo')]

        def agg(x):
            return x.aggregate([x.foo.sum().name('foo total')], by=['g'])

        # predicate gets pushed down
        filtered = proj[proj.g == 'bar']

        result = to_sql(filtered)
        expected = """SELECT *, `a` + `b` AS `foo`
FROM alltypes
WHERE `f` > 0 AND
      `g` = 'bar'"""
        assert result == expected

        agged = agg(filtered)
        result = to_sql(agged)
        expected = """SELECT `g`, sum(`foo`) AS `foo total`
FROM (
  SELECT *, `a` + `b` AS `foo`
  FROM alltypes
  WHERE `f` > 0 AND
        `g` = 'bar'
) t0
GROUP BY 1"""
        assert result == expected

        # Pushdown is not possible (in Impala, Postgres, others)
        agged2 = agg(proj[proj.foo < 10])

        result = to_sql(agged2)
        expected = """SELECT t0.`g`, sum(t0.`foo`) AS `foo total`
FROM (
  SELECT *, `a` + `b` AS `foo`
  FROM alltypes
  WHERE `f` > 0
) t0
WHERE t0.`foo` < 10
GROUP BY 1"""
        assert result == expected

    def test_subquery_aliased(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id'])
        what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id])
                [agged, t2.value1])

        result = to_sql(what)
        expected = """SELECT t0.*, t1.`value1`
FROM (
  SELECT `foo_id`, sum(`f`) AS `total`
  FROM star1
  GROUP BY 1
) t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

    def test_double_nested_subquery_no_aliases(self):
        # We don't require any table aliasing anywhere
        t = api.table([('key1', 'string'), ('key2', 'string'),
                       ('key3', 'string'), ('value', 'double')], 'foo_table')

        agg1 = t.aggregate([t.value.sum().name('total')],
                           by=['key1', 'key2', 'key3'])
        agg2 = agg1.aggregate([agg1.total.sum().name('total')],
                              by=['key1', 'key2'])
        agg3 = agg2.aggregate([agg2.total.sum().name('total')], by=['key1'])

        result = to_sql(agg3)
        expected = """SELECT `key1`, sum(`total`) AS `total`
FROM (
  SELECT `key1`, `key2`, sum(`total`) AS `total`
  FROM (
    SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total`
    FROM foo_table
    GROUP BY 1, 2, 3
  ) t1
  GROUP BY 1, 2
) t0
GROUP BY 1"""
        assert result == expected

    def test_aggregate_projection_alias_bug(self):
        # Observed in use
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])[[t1, t2.value1]])

        what = what.aggregate([what.value1.sum().name('total')],
                              by=[what.foo_id])

        # TODO: Not fusing the aggregation with the projection yet
        result = to_sql(what)
        expected = """SELECT `foo_id`, sum(`value1`) AS `total`
FROM (
  SELECT t1.*, t2.`value1`
  FROM star1 t1
    INNER JOIN star2 t2
      ON t1.`foo_id` = t2.`foo_id`
) t0
GROUP BY 1"""
        assert result == expected

    def test_aggregate_fuse_with_projection(self):
        # see above test case
        pass

    def test_subquery_used_for_self_join(self):
        # There could be cases that should look in SQL like
        # WITH t0 as (some subquery)
        # select ...
        # from t0 t1
        #   join t0 t2
        #     on t1.kind = t2.subkind
        # ...
        # However, the Ibis code will simply have an expression (projection or
        # aggregation, say) built on top of the subquery expression, so we need
        # to extract the subquery unit (we see that it appears multiple times
        # in the tree).
        t = self.con.table('alltypes')

        agged = t.aggregate([t.f.sum().name('total')], by=['g', 'a', 'b'])
        view = agged.view()
        metrics = [(agged.total - view.total).max().name('metric')]
        reagged = (agged.inner_join(view, [agged.a == view.b])
                   .aggregate(metrics, by=[agged.g]))

        result = to_sql(reagged)
        expected = """WITH t0 AS (
  SELECT `g`, `a`, `b`, sum(`f`) AS `total`
  FROM alltypes
  GROUP BY 1, 2, 3
)
SELECT t0.`g`, max(t0.`total` - t1.`total`) AS `metric`
FROM t0
  INNER JOIN t0 t1
    ON t0.`a` = t1.`b`
GROUP BY 1"""
        assert result == expected

    def test_subquery_factor_correlated_subquery(self):
        # #173, #183 and other issues
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        customer = self.con.table('tpch_customer')
        orders = self.con.table('tpch_orders')

        fields_of_interest = [
            customer,
            region.r_name.name('region'),
            orders.o_totalprice.name('amount'),
            orders.o_orderdate.cast('timestamp').name('odate')
        ]

        tpch = (region
                .join(nation, region.r_regionkey == nation.n_regionkey)
                .join(customer, customer.c_nationkey == nation.n_nationkey)
                .join(orders, orders.o_custkey == customer.c_custkey)
                [fields_of_interest])

        # Self-reference + correlated subquery complicates things
        t2 = tpch.view()
        conditional_avg = t2[t2.region == tpch.region].amount.mean()
        amount_filter = tpch.amount > conditional_avg

        expr = tpch[amount_filter].limit(10)

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT t5.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`,
         CAST(t3.`o_orderdate` AS timestamp) AS `odate`
  FROM tpch_region t1
    INNER JOIN tpch_nation t2
      ON t1.`r_regionkey` = t2.`n_regionkey`
    INNER JOIN tpch_customer t5
      ON t5.`c_nationkey` = t2.`n_nationkey`
    INNER JOIN tpch_orders t3
      ON t3.`o_custkey` = t5.`c_custkey`
)
SELECT t0.*
FROM t0
WHERE t0.`amount` > (
  SELECT avg(t4.`amount`) AS `tmp`
  FROM t0 t4
  WHERE t4.`region` = t0.`region`
)
LIMIT 10"""
        assert result == expected

    def test_self_join_subquery_distinct_equal(self):
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')

        j1 = (region.join(nation, region.r_regionkey == nation.n_regionkey)
              [region, nation])

        j2 = (region.join(nation, region.r_regionkey == nation.n_regionkey)
              [region, nation]
              .view())

        expr = (j1.join(j2, j1.r_regionkey == j2.r_regionkey)
                [j1.r_name, j2.n_name])

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT t2.*, t3.*
  FROM tpch_region t2
    INNER JOIN tpch_nation t3
      ON t2.`r_regionkey` = t3.`n_regionkey`
)
SELECT t0.`r_name`, t1.`n_name`
FROM t0
  INNER JOIN t0 t1
    ON t0.`r_regionkey` = t1.`r_regionkey`"""

        assert result == expected

    def test_limit_with_self_join(self):
        t = self.con.table('functional_alltypes')
        t2 = t.view()

        expr = t.join(t2, t.tinyint_col < t2.timestamp_col.minute()).count()

        # it works
        result = to_sql(expr)
        expected = """\
SELECT count(*) AS `tmp`
FROM functional_alltypes t0
  INNER JOIN functional_alltypes t1
    ON t0.`tinyint_col` < extract(t1.`timestamp_col`, 'minute')"""
        assert result == expected

    def test_cte_factor_distinct_but_equal(self):
        t = self.con.table('alltypes')
        tt = self.con.table('alltypes')

        expr1 = t.group_by('g').aggregate(t.f.sum().name('metric'))
        expr2 = tt.group_by('g').aggregate(tt.f.sum().name('metric')).view()

        expr = expr1.join(expr2, expr1.g == expr2.g)[[expr1]]

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT `g`, sum(`f`) AS `metric`
  FROM alltypes
  GROUP BY 1
)
SELECT t0.*
FROM t0
  INNER JOIN t0 t1
    ON t0.`g` = t1.`g`"""

        assert result == expected

    def test_tpch_self_join_failure(self):
        # duplicating the integration test here

        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        customer = self.con.table('tpch_customer')
        orders = self.con.table('tpch_orders')

        fields_of_interest = [
            region.r_name.name('region'),
            nation.n_name.name('nation'),
            orders.o_totalprice.name('amount'),
            orders.o_orderdate.cast('timestamp').name('odate')
        ]

        joined_all = (region
                      .join(nation,
                            region.r_regionkey == nation.n_regionkey)
                      .join(customer,
                            customer.c_nationkey == nation.n_nationkey)
                      .join(orders,
                            orders.o_custkey == customer.c_custkey)
                      [fields_of_interest])

        year = joined_all.odate.year().name('year')
        total = joined_all.amount.sum().cast('double').name('total')
        annual_amounts = (joined_all
                          .group_by(['region', year])
                          .aggregate(total))

        current = annual_amounts
        prior = annual_amounts.view()

        yoy_change = (current.total - prior.total).name('yoy_change')
        yoy = (current.join(prior, current.year == (prior.year - 1))
               [current.region, current.year, yoy_change])
        to_sql(yoy)

    def test_extract_subquery_nested_lower(self):
        # We may have a join between two tables requiring subqueries, and
        # buried inside these there may be a common subquery. Let's test that
        # we find it and pull it out to the top level to avoid repeating
        # ourselves.
        pass
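        # A hedged, simplified stand-in for the scenario (no assertion on the
        # exact SQL): both join inputs derive from the same aggregate, which
        # should be factored into a single shared WITH block.
        t = self.con.table('alltypes')
        agg = t.group_by('g').aggregate(t.f.sum().name('total'))
        left = agg[agg.total > 10]
        right = agg.view()
        expr = left.join(right, left.g == right.g)[[left]]
        to_sql(expr)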

    def test_subquery_in_filter_predicate(self):
        # E.g. comparing against some scalar aggregate value. See Ibis #43
        t1 = self.con.table('star1')

        pred = t1.f > t1.f.mean()
        expr = t1[pred]

        # This brought out another expression rewriting bug, since the filtered
        # table isn't found elsewhere in the expression.
        pred2 = t1.f > t1[t1.foo_id == 'foo'].f.mean()
        expr2 = t1[pred2]

        result = to_sql(expr)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT avg(`f`) AS `tmp`
  FROM star1
)"""
        assert result == expected

        result = to_sql(expr2)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT avg(`f`) AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

    def test_filter_subquery_derived_reduction(self):
        t1 = self.con.table('star1')

        # Reduction can be nested inside some scalar expression
        pred3 = t1.f > t1[t1.foo_id == 'foo'].f.mean().log()
        pred4 = t1.f > (t1[t1.foo_id == 'foo'].f.mean().log() + 1)

        expr3 = t1[pred3]
        result = to_sql(expr3)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT ln(avg(`f`)) AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

        expr4 = t1[pred4]

        result = to_sql(expr4)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT ln(avg(`f`)) + 1 AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

    def test_topk_operation_to_semi_join(self):
        # TODO: top K with filter in place

        table = api.table([
            ('foo', 'string'),
            ('bar', 'string'),
            ('city', 'string'),
            ('v1', 'double'),
            ('v2', 'double'),
        ], 'tbl')

        what = table.city.topk(10, by=table.v2.mean())
        filtered = table[what]

        query = to_sql(filtered)
        expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, avg(`v2`) AS `mean`
    FROM tbl
    GROUP BY 1
    ORDER BY `mean` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
        assert query == expected

        # Test the default metric (count)

        what = table.city.topk(10)
        filtered2 = table[what]
        query = to_sql(filtered2)
        expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, count(`city`) AS `count`
    FROM tbl
    GROUP BY 1
    ORDER BY `count` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
        assert query == expected

    def test_topk_predicate_pushdown_bug(self):
        # Observed on TPCH data. The customer/nation/region fixtures lived
        # elsewhere in the original suite; minimal unbound tables (assumed
        # schemas with only the referenced columns) keep the example
        # self-contained.
        customer = ibis.table([('c_nationkey', 'int32'),
                               ('c_acctbal', 'double')], 'customer')
        nation = ibis.table([('n_nationkey', 'int32'),
                             ('n_regionkey', 'int32'),
                             ('n_name', 'string')], 'nation')
        region = ibis.table([('r_regionkey', 'int32'),
                             ('r_name', 'string')], 'region')

        cplusgeo = (customer
                    .inner_join(nation,
                                [customer.c_nationkey == nation.n_nationkey])
                    .inner_join(region,
                                [nation.n_regionkey == region.r_regionkey])
                    [customer, nation.n_name, region.r_name])

        pred = cplusgeo.n_name.topk(10, by=cplusgeo.c_acctbal.sum())
        expr = cplusgeo.filter([pred])

        result = to_sql(expr)
        expected = """\
SELECT t0.*, t1.`n_name`, t2.`r_name`
FROM customer t0
  INNER JOIN nation t1
    ON t0.`c_nationkey` = t1.`n_nationkey`
  INNER JOIN region t2
    ON t1.`n_regionkey` = t2.`r_regionkey`
  LEFT SEMI JOIN (
    SELECT t1.`n_name`, sum(t0.`c_acctbal`) AS `sum`
    FROM customer t0
      INNER JOIN nation t1
        ON t0.`c_nationkey` = t1.`n_nationkey`
      INNER JOIN region t2
        ON t1.`n_regionkey` = t2.`r_regionkey`
    GROUP BY 1
    ORDER BY `sum` DESC
    LIMIT 10
  ) t3
    ON t1.`n_name` = t3.`n_name`"""
        assert result == expected

    def test_topk_analysis_bug(self):
        # GH #398
        airlines = ibis.table([('dest', 'string'), ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
        expr = t[delay_filter].group_by('origin').size()

        result = to_sql(expr)
        expected = """\
SELECT t0.`origin`, count(*) AS `count`
FROM airlines t0
  LEFT SEMI JOIN (
    SELECT `dest`, avg(`arrdelay`) AS `mean`
    FROM airlines
    WHERE `dest` IN ('ORD', 'JFK', 'SFO')
    GROUP BY 1
    ORDER BY `mean` DESC
    LIMIT 10
  ) t1
    ON t0.`dest` = t1.`dest`
WHERE t0.`dest` IN ('ORD', 'JFK', 'SFO')
GROUP BY 1"""

        assert result == expected

    def test_topk_to_aggregate(self):
        t = ibis.table([('dest', 'string'), ('origin', 'string'),
                        ('arrdelay', 'int32')], 'airlines')

        top = t.dest.topk(10, by=t.arrdelay.mean())

        result = to_sql(top)
        expected = to_sql(top.to_aggregation())
        assert result == expected

    def test_bottomk(self):
        pass
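        # A hedged sketch: with no bottomk() API available, the same result
        # can be spelled as an ascending sort over the aggregate plus a limit.
        t = self.con.table('star1')
        bottom = (t.group_by('foo_id')
                  .aggregate(t.f.mean().name('mean'))
                  .sort_by('mean')
                  .limit(10))
        to_sql(bottom)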

    def test_topk_antijoin(self):
        # Get the "other" category somehow
        pass

    def test_case_in_projection(self):
        t = self.con.table('alltypes')

        expr = (t.g.case()
                .when('foo', 'bar')
                .when('baz', 'qux')
                .else_('default')
                .end())

        expr2 = (api.case()
                 .when(t.g == 'foo', 'bar')
                 .when(t.g == 'baz', t.g)
                 .end())

        proj = t[expr.name('col1'), expr2.name('col2'), t]

        result = to_sql(proj)
        expected = """SELECT
  CASE `g`
    WHEN 'foo' THEN 'bar'
    WHEN 'baz' THEN 'qux'
    ELSE 'default'
  END AS `col1`,
  CASE
    WHEN `g` = 'foo' THEN 'bar'
    WHEN `g` = 'baz' THEN `g`
    ELSE NULL
  END AS `col2`, *
FROM alltypes"""
        assert result == expected

    def test_identifier_quoting(self):
        data = api.table([('date', 'int32'), ('explain', 'string')], 'table')

        expr = data[data.date.name('else'), data.explain.name('join')]

        result = to_sql(expr)
        expected = """SELECT `date` AS `else`, `explain` AS `join`
FROM `table`"""
        assert result == expected
Example #26
class TestASTBuilder(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

    def test_ast_with_projection_join_filter(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0

        table3 = table[filter_pred]

        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])
        result = joined[[table3, table2['value']]]

        ast = build_ast(result)
        stmt = ast.queries[0]

        assert len(stmt.select_set) == 2
        assert len(stmt.where) == 1
        assert stmt.where[0] is filter_pred

        # Check that the join has been rebuilt to only include the root tables
        tbl = stmt.table_set
        tbl_node = tbl.op()
        assert isinstance(tbl_node, ops.InnerJoin)
        assert tbl_node.left is table2
        assert tbl_node.right is table

        # table expression substitution has been made in the predicate
        assert tbl_node.predicates[0].equals(table['g'] == table2['key'])

    def test_ast_with_aggregation_join_filter(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        ast = build_ast(result)
        stmt = ast.queries[0]

        # hoisted metrics
        ex_metrics = [(table['f'] - table2['value']).mean().name('foo'),
                      table['f'].sum().name('bar')]
        ex_by = [table['g'], table2['key']]

        # hoisted join and aggregate
        expected_table_set = table2.inner_join(
            table, [table['g'] == table2['key']])
        assert stmt.table_set.equals(expected_table_set)

        # Check various exprs
        for res, ex in zip(stmt.select_set, ex_by + ex_metrics):
            assert res.equals(ex)

        for res, ex in zip(stmt.group_by, ex_by):
            assert stmt.select_set[res].equals(ex)

        # Check we got the filter
        assert len(stmt.where) == 1
        assert stmt.where[0].equals(filter_pred)

    def test_sort_by(self):
        table = self.con.table('star1')

        what = table.sort_by('f')
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `f`"""
        assert result == expected

        what = table.sort_by(('f', 0))
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `f` DESC"""
        assert result == expected

        what = table.sort_by(['c', ('f', 0)])
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `c`, `f` DESC"""
        assert result == expected

    def test_limit(self):
        table = self.con.table('star1').limit(10)
        result = to_sql(table)
        expected = """SELECT *
FROM star1
LIMIT 10"""
        assert result == expected

        table = self.con.table('star1').limit(10, offset=5)
        result = to_sql(table)
        expected = """SELECT *
FROM star1
LIMIT 10 OFFSET 5"""
        assert result == expected

        # Put the limit in a couple places in the stack
        table = self.con.table('star1')
        table = table[table.f > 0].limit(10)
        result = to_sql(table)

        expected = """SELECT *
FROM star1
WHERE `f` > 0
LIMIT 10"""

        assert result == expected

        table = self.con.table('star1')

        # Semantically, this should produce a subquery
        table = table.limit(10)
        table = table[table.f > 0]

        result2 = to_sql(table)

        expected2 = """SELECT *
FROM (
  SELECT *
  FROM star1
  LIMIT 10
) t0
WHERE `f` > 0"""

        assert result2 == expected2

    def test_join_with_limited_table(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        limited = t1.limit(100)
        joined = (limited.inner_join(t2, [limited.foo_id == t2.foo_id])
                  [[limited]])

        result = to_sql(joined)
        expected = """SELECT t0.*
FROM (
  SELECT *
  FROM star1
  LIMIT 100
) t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""

        assert result == expected

    def test_sort_by_on_limit_yield_subquery(self):
        # x.limit(...).sort_by(...)
        #   is semantically different from
        # x.sort_by(...).limit(...)
        #   and will often yield different results
        t = self.con.table('functional_alltypes')
        expr = (t.group_by('string_col')
                .aggregate([t.count().name('nrows')])
                .limit(5)
                .sort_by('string_col'))

        result = to_sql(expr)
        expected = """SELECT *
FROM (
  SELECT `string_col`, count(*) AS `nrows`
  FROM functional_alltypes
  GROUP BY 1
  LIMIT 5
) t0
ORDER BY `string_col`"""
        assert result == expected

    def test_multiple_limits(self):
        t = self.con.table('functional_alltypes')

        expr = t.limit(20).limit(10)
        stmt = build_ast(expr).queries[0]

        assert stmt.limit['n'] == 10

    def test_top_convenience(self):
        # x.top(10, by=field)
        # x.top(10, by=[field1, field2])
        pass
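        # No top() convenience exists yet; topk() already covers the
        # single-field case (compare test_topk_operation_to_semi_join in the
        # previous example):
        t = self.con.table('functional_alltypes')
        to_sql(t[t.string_col.topk(10, by=t.double_col.mean())])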

    def test_self_aggregate_in_predicate(self):
        # Per ibis #43
        pass
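        # Covered concretely by test_subquery_in_filter_predicate in the
        # previous example; the minimal form of the pattern:
        t1 = self.con.table('star1')
        to_sql(t1[t1.f > t1.f.mean()])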
Example #27
class TestWrapping(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        func = api.scalar_function(['string'], 'string', name='Tester')
        func.register('identity', 'udf_testing')

        result = func('hello world')
        assert result == "SELECT udf_testing.identity('hello world')"

    def test_sql_generation_from_infoclass(self):
        func = api.wrap_udf('test.so', ['string'], 'string', 'info_test')
        repr(func)

        func.register('info_test', 'udf_testing')
        result = func('hello world')
        assert result == "SELECT udf_testing.info_test('hello world')"

    def test_udf_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_udf([t], t, 'test')

            ibis_type = validate_type(t)

            expr = func(sv)
            assert type(expr) == ibis_type.scalar_type()
            expr = func(av)
            assert type(expr) == ibis_type.array_type()

    def test_uda_primitive_output_types(self):
        types = [
            ('boolean', True, self.b),
            ('int8', 1, self.i8),
            ('int16', 1, self.i16),
            ('int32', 1, self.i32),
            ('int64', 1, self.i64),
            ('float', 1.0, self.f),
            ('double', 1.0, self.d),
            ('string', '1', self.s),
            ('timestamp', ibis.timestamp('1961-04-10'), self.t)
        ]
        for t, sv, av in types:
            func = self._register_uda([t], t, 'test')

            ibis_type = validate_type(t)

            expr1 = func(sv)
            expr2 = func(sv)
            assert isinstance(expr1, ibis_type.scalar_type())
            assert isinstance(expr2, ibis_type.scalar_type())

    def test_decimal(self):
        func = self._register_udf(['decimal(9,0)'], 'decimal(9,0)', 'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalArray

    def test_udf_invalid_typecasting(self):
        cases = [
            ('int8', self.all_cols[:1], self.all_cols[1:]),
            ('int16', self.all_cols[:2], self.all_cols[2:]),
            ('int32', self.all_cols[:3], self.all_cols[3:]),
            ('int64', self.all_cols[:4], self.all_cols[4:]),
            ('boolean', [], self.all_cols[:8] + self.all_cols[9:]),

            # allowing double here for now
            ('float', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),

            ('double', self.all_cols[:4], [self.s, self.b, self.t, self.dec]),
            ('string', [], self.all_cols[:7] + self.all_cols[8:]),
            ('timestamp', [], self.all_cols[:-1]),
            ('decimal', [], self.all_cols[:4] + self.all_cols[7:])
        ]

        for t, valid_casts, invalid_casts in cases:
            func = self._register_udf([t], 'int32', 'typecast')

            for expr in valid_casts:
                func(expr)

            for expr in invalid_casts:
                self.assertRaises(IbisTypeError, func, expr)

    def test_mult_args(self):
        func = self._register_udf(['int32', 'double', 'string',
                                   'boolean', 'timestamp'],
                                  'int64', 'mult_types')

        expr = func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ArrayExpr)

        expr = func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _register_udf(self, inputs, output, name):
        func = api.scalar_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func

    def _register_uda(self, inputs, output, name):
        func = api.aggregate_function(inputs, output, name=name)
        func.register(name, 'ibis_testing')
        return func
Example #28
class TestDecimal(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.lineitem = self.con.table('tpch_lineitem')

    def test_type_metadata(self):
        col = self.lineitem.l_extendedprice
        assert isinstance(col, ir.DecimalArray)

        assert col._precision == 12
        assert col._scale == 2

    def test_cast_scalar_to_decimal(self):
        val = api.literal('1.2345')

        casted = val.cast('decimal(15,5)')
        assert isinstance(casted, ir.DecimalScalar)
        assert casted._precision == 15
        assert casted._scale == 5

    def test_decimal_aggregate_function_behavior(self):
        # From the Impala documentation: "The result of an aggregate function
        # such as MAX(), SUM(), or AVG() on DECIMAL values is promoted to a
        # scale of 38, with the same precision as the underlying column. Thus,
        # the result can represent the largest possible value at that
        # particular precision."
        col = self.lineitem.l_extendedprice
        functions = ['sum', 'mean', 'max', 'min']

        for func_name in functions:
            result = getattr(col, func_name)()
            assert isinstance(result, ir.DecimalScalar)
            assert result._precision == col._precision
            assert result._scale == 38

    def test_where(self):
        table = self.lineitem

        q = table.l_quantity
        expr = api.where(table.l_discount > 0, q * table.l_discount, api.null)

        assert isinstance(expr, ir.DecimalArray)

        expr = api.where(table.l_discount > 0, (q * table.l_discount).sum(),
                         api.null)
        assert isinstance(expr, ir.DecimalArray)

        expr = api.where(table.l_discount.sum() > 0,
                         (q * table.l_discount).sum(), api.null)
        assert isinstance(expr, ir.DecimalScalar)

    def test_fillna(self):
        expr = self.lineitem.l_extendedprice.fillna(0)
        assert isinstance(expr, ir.DecimalArray)

        expr = self.lineitem.l_extendedprice.fillna(self.lineitem.l_quantity)
        assert isinstance(expr, ir.DecimalArray)

    def test_precision_scale(self):
        col = self.lineitem.l_extendedprice

        p = col.precision()
        s = col.scale()

        assert isinstance(p, ir.IntegerValue)
        assert isinstance(p.op(), ops.DecimalPrecision)

        assert isinstance(s, ir.IntegerValue)
        assert isinstance(s.op(), ops.DecimalScale)

    def test_invalid_precision_scale_combo(self):
        pass
Example #29
class TestCreateTable(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

        self.t = t = self.con.table('functional_alltypes')
        self.expr = t[t.bigint_col > 0]

    def test_create_external_table_as(self):
        path = '/path/to/table'
        select = build_ast(self.con.table('test1')).queries[0]
        statement = ddl.CTAS('another_table',
                             select,
                             external=True,
                             can_exist=False,
                             path=path,
                             database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(path)
        assert result == expected

    def test_create_table_with_location(self):
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'), ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table',
                                              schema,
                                              ddl.NoFormat(),
                                              can_exist=False,
                                              path=path,
                                              database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_table_like_parquet(self):
        directory = '/path/to/'
        path = '/path/to/parquetfile'
        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_file=path,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(path, directory)

        assert result == expected

    def test_create_table_parquet_like_other(self):
        # alternative to "LIKE PARQUET"
        directory = '/path/to/'
        example_table = 'db.other'

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_table=example_table,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(example_table, directory)

        assert result == expected

    def test_create_table_parquet_with_schema(self):
        directory = '/path/to/'

        schema = ibis.schema([('foo', 'string'), ('bar', 'int8'),
                              ('baz', 'int16')])

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           schema=schema,
                                           external=True,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)

        assert result == expected

    def test_create_table_delimited(self):
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'), ('b', 'int32'), ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table',
                                        path,
                                        schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_external_table_avro(self):
        path = '/path/to/files/'

        avro_schema = {
            'fields': [{
                'name': 'a',
                'type': 'string'
            }, {
                'name': 'b',
                'type': 'int'
            }, {
                'name': 'c',
                'type': 'double'
            }, {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": 4,
                "scale": 2,
                'name': 'd'
            }],
            'name':
            'my_record',
            'type':
            'record'
        }

        stmt = ddl.CreateTableAvro('new_table',
                                   path,
                                   avro_schema,
                                   database='foo',
                                   can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES (
  'avro.schema.literal'='{
  "fields": [
    {
      "name": "a",
      "type": "string"
    },
    {
      "name": "b",
      "type": "int"
    },
    {
      "name": "c",
      "type": "double"
    },
    {
      "logicalType": "decimal",
      "name": "d",
      "precision": 4,
      "scale": 2,
      "type": "bytes"
    }
  ],
  "name": "my_record",
  "type": "record"
}'
)""" % path
        assert result == expected

    def test_create_table_parquet(self):
        statement = _create_table('some_table',
                                  self.expr,
                                  database='bar',
                                  can_exist=False)
        result = statement.compile()

        expected = """\
CREATE TABLE bar.`some_table`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_no_overwrite(self):
        statement = _create_table('tname', self.expr, can_exist=True)
        result = statement.compile()

        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_avro_other_formats(self):
        statement = _create_table('tname',
                                  self.t,
                                  format='avro',
                                  can_exist=True)
        result = statement.compile()
        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS AVRO
AS
SELECT *
FROM functional_alltypes"""
        assert result == expected

        self.assertRaises(ValueError,
                          _create_table,
                          'tname',
                          self.t,
                          format='foo')

    def test_partition_by(self):
        pass
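        # Partitioned CTAS is not exercised here; the PARTITION clause itself
        # is covered via ddl.LoadData (see the insert/load-data example
        # below), e.g.:
        part_schema = ibis.schema([('year', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', '/path/to/data',
                            partition={'year': 2007},
                            partition_schema=part_schema)
        stmt.compile()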
Example #30
class TestStringBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_unary_ops(self):
        s = self.table.string_col
        cases = [
            (s.lower(), 'lower(`string_col`)'),
            (s.upper(), 'upper(`string_col`)'),
            (s.reverse(), 'reverse(`string_col`)'),
            (s.strip(), 'trim(`string_col`)'),
            (s.lstrip(), 'ltrim(`string_col`)'),
            (s.rstrip(), 'rtrim(`string_col`)'),
            (s.capitalize(), 'initcap(`string_col`)'),
            (s.length(), 'length(`string_col`)'),
            (s.ascii_str(), 'ascii(`string_col`)')
        ]
        self._check_expr_cases(cases)

    def test_substr(self):
        # Databases index substrings from 1; ibis offsets are 0-based, hence
        # the + 1
        cases = [
            (self.table.string_col.substr(2), 'substr(`string_col`, 2 + 1)'),
            (self.table.string_col.substr(0, 3),
             'substr(`string_col`, 0 + 1, 3)')
        ]
        self._check_expr_cases(cases)

    def test_strright(self):
        cases = [
            (self.table.string_col.right(4), 'strright(`string_col`, 4)')
        ]
        self._check_expr_cases(cases)

    def test_like(self):
        cases = [
            (self.table.string_col.like('foo%'), "`string_col` LIKE 'foo%'")
        ]
        self._check_expr_cases(cases)

    def test_rlike(self):
        ex = r"`string_col` RLIKE '[\d]+'"
        cases = [
            (self.table.string_col.rlike(r'[\d]+'), ex),
            (self.table.string_col.re_search(r'[\d]+'), ex),
        ]
        self._check_expr_cases(cases)

    def test_re_extract(self):
        sql = "regexp_extract(`string_col`, '[\d]+', 0)"
        cases = [
            (self.table.string_col.re_extract('[\d]+', 0), sql)
        ]
        self._check_expr_cases(cases)

    def test_re_replace(self):
        sql = "regexp_replace(`string_col`, '[\d]+', 'aaa')"
        cases = [
            (self.table.string_col.re_replace('[\d]+', 'aaa'), sql)
        ]
        self._check_expr_cases(cases)

    def test_parse_url(self):
        sql = "parse_url(`string_col`, 'HOST')"
        cases = [
            (self.table.string_col.parse_url('HOST'), sql)
        ]
        self._check_expr_cases(cases)

    def test_repeat(self):
        cases = [
            (self.table.string_col.repeat(2), 'repeat(`string_col`, 2)')
        ]
        self._check_expr_cases(cases)

    def test_translate(self):
        cases = [
            (self.table.string_col.translate('a', 'b'),
             "translate(`string_col`, 'a', 'b')")
        ]
        self._check_expr_cases(cases)

    def test_find(self):
        s = self.table.string_col
        i1 = self.table.tinyint_col
        cases = [
            (s.find('a'), "locate('a', `string_col`) - 1"),
            (s.find('a', 2), "locate('a', `string_col`, 3) - 1"),
            (s.find('a', start=i1),
             "locate('a', `string_col`, `tinyint_col` + 1) - 1")
        ]
        self._check_expr_cases(cases)

    def test_lpad(self):
        cases = [
            (self.table.string_col.lpad(1, 'a'), "lpad(`string_col`, 1, 'a')"),
            (self.table.string_col.lpad(25), "lpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_rpad(self):
        cases = [
            (self.table.string_col.rpad(1, 'a'), "rpad(`string_col`, 1, 'a')"),
            (self.table.string_col.rpad(25), "rpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_find_in_set(self):
        cases = [
            (self.table.string_col.find_in_set(['a']),
             "find_in_set(`string_col`, 'a') - 1"),
            (self.table.string_col.find_in_set(['a', 'b']),
             "find_in_set(`string_col`, 'a,b') - 1")
        ]
        self._check_expr_cases(cases)

    def test_string_join(self):
        cases = [
            (L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
        ]
        self._check_expr_cases(cases)
Example #31
class TestInsertLoadData(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_load_data_unpartitioned(self):
        path = '/path/to/data'
        stmt = ddl.LoadData('functional_alltypes', path, database='foo')

        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "INTO TABLE foo.`functional_alltypes`")
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "OVERWRITE INTO TABLE foo.`functional_alltypes`")
        assert result == expected

    def test_load_data_partitioned(self):
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes', path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

    def test_select_overwrite(self):
        pass
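        # Exercised in test_select_basics above; a minimal sketch of the
        # overwrite form:
        select, _ = _get_select(self.t.limit(10))
        stmt = ddl.InsertSelect('testing123456', select,
                                database='foo', overwrite=True)
        assert stmt.compile().startswith('INSERT OVERWRITE')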
Example #32
class TestAnalytics(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')

    def test_category_project(self):
        t = self.alltypes

        tier = t.double_col.bucket([0, 50, 100]).name('tier')
        expr = t[tier, t]

        assert isinstance(expr.tier, ir.CategoryArray)

    def test_bucket(self):
        d = self.alltypes.double_col
        bins = [0, 10, 50, 100]

        expr = d.bucket(bins)
        assert isinstance(expr, ir.CategoryArray)
        assert expr.op().nbuckets == 3

        expr = d.bucket(bins, include_over=True)
        assert expr.op().nbuckets == 4

        expr = d.bucket(bins, include_over=True, include_under=True)
        assert expr.op().nbuckets == 5

    def test_bucket_error_cases(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.bucket, [])
        self.assertRaises(ValueError, d.bucket, [1, 2], closed='foo')

        # it works!
        d.bucket([10], include_under=True, include_over=True)

        self.assertRaises(ValueError, d.bucket, [10])
        self.assertRaises(ValueError, d.bucket, [10], include_under=True)
        self.assertRaises(ValueError, d.bucket, [10], include_over=True)

    def test_histogram(self):
        d = self.alltypes.double_col

        self.assertRaises(ValueError, d.histogram, nbins=10, binwidth=5)
        self.assertRaises(ValueError, d.histogram)
        self.assertRaises(ValueError, d.histogram, 10, closed='foo')

    def test_topk_analysis_bug(self):
        # GH #398
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')],
                              'airlines')
        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.origin.topk(10, by=t.arrdelay.mean())

        filtered = t.filter([delay_filter])

        post_pred = filtered.op().predicates[1]
        assert delay_filter.to_filter().equals(post_pred)

    def test_topk_function_late_bind(self):
        # GH #520
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')],
                              'airlines')
        expr1 = airlines.dest.topk(5, by=lambda x: x.arrdelay.mean())
        expr2 = airlines.dest.topk(5, by=airlines.arrdelay.mean())

        assert_equal(expr1.to_aggregation(), expr2.to_aggregation())
Example #33
class UDFTest(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("functional_alltypes")

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table("tpch_customer").c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        op = udf.scalar_function(["string"], "string", name="Tester")
        udf.add_operation(op, "identity", "udf_testing")

        def _identity_test(value):
            return op(value).to_expr()

        result = _identity_test("hello world")
        assert result == "SELECT udf_testing.identity('hello world')"

    def test_sql_generation_from_infoclass(self):
        udf_info = udf.UDFCreator("test.so", ["string"], "string", "info_test")
        repr(udf_info)
        op = udf_info.to_operation()
        udf.add_operation(op, "info_test", "udf_testing")
        assert op in _operation_registry

        def _infoclass_test(value):
            return op(value).to_expr()

        result = _infoclass_test("hello world")

        assert result == "SELECT udf_testing.info_test('hello world')"

    def test_boolean(self):
        func = self._udf_registration_single_input("boolean", "boolean", "test")
        expr = func(True)
        assert type(expr) == ir.BooleanScalar
        expr = func(self.b)
        assert type(expr) == ir.BooleanArray

    def test_tinyint(self):
        func = self._udf_registration_single_input("int8", "int8", "test")
        expr = func(1)
        assert type(expr) == ir.Int8Scalar
        expr = func(self.i8)
        assert type(expr) == ir.Int8Array

    def test_smallint(self):
        func = self._udf_registration_single_input("int16", "int16", "test")
        expr = func(1)
        assert type(expr) == ir.Int16Scalar
        expr = func(self.i16)
        assert type(expr) == ir.Int16Array

    def test_int(self):
        func = self._udf_registration_single_input("int32", "int32", "test")
        expr = func(1)
        assert type(expr) == ir.Int32Scalar
        expr = func(self.i32)
        assert type(expr) == ir.Int32Array

    def test_bigint(self):
        func = self._udf_registration_single_input("int64", "int64", "test")
        expr = func(1)
        assert type(expr) == ir.Int64Scalar
        expr = func(self.i64)
        assert type(expr) == ir.Int64Array

    def test_float(self):
        func = self._udf_registration_single_input("float", "float", "test")
        expr = func(1.0)
        assert type(expr) == ir.FloatScalar
        expr = func(self.f)
        assert type(expr) == ir.FloatArray

    def test_double(self):
        func = self._udf_registration_single_input("double", "double", "test")
        expr = func(1.0)
        assert type(expr) == ir.DoubleScalar
        expr = func(self.d)
        assert type(expr) == ir.DoubleArray

    def test_decimal(self):
        func = self._udf_registration_single_input("decimal(9,0)", "decimal(9,0)", "test")
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalArray

    def test_string(self):
        func = self._udf_registration_single_input("string", "string", "test")
        expr = func("1")
        assert type(expr) == ir.StringScalar
        expr = func(self.s)
        assert type(expr) == ir.StringArray

    def test_timestamp(self):
        func = self._udf_registration_single_input("timestamp", "timestamp", "test")
        expr = func(ibis.timestamp("1961-04-10"))
        assert type(expr) == ir.TimestampScalar
        expr = func(self.t)
        assert type(expr) == ir.TimestampArray

    def test_invalid_typecasting_tinyint(self):
        self._invalid_typecasts("int8", self.all_cols[1:])

    def test_invalid_typecasting_smallint(self):
        self._invalid_typecasts("int16", self.all_cols[2:])

    def test_invalid_typecasting_int(self):
        self._invalid_typecasts("int32", self.all_cols[3:])

    def test_invalid_typecasting_bigint(self):
        self._invalid_typecasts("int64", self.all_cols[4:])

    def test_invalid_typecasting_boolean(self):
        self._invalid_typecasts("boolean", self.all_cols[:8] + self.all_cols[9:])

    def test_invalid_typecasting_float(self):
        self._invalid_typecasts("float", self.all_cols[:4] + self.all_cols[6:])

    def test_invalid_typecasting_double(self):
        self._invalid_typecasts("double", self.all_cols[:4] + self.all_cols[6:])

    def test_invalid_typecasting_string(self):
        self._invalid_typecasts("string", self.all_cols[:7] + self.all_cols[8:])

    def test_invalid_typecasting_timestamp(self):
        self._invalid_typecasts("timestamp", self.all_cols[:-1])

    def test_invalid_typecasting_decimal(self):
        self._invalid_typecasts("decimal", self.all_cols[:4] + self.all_cols[7:])

    def test_mult_args(self):
        op = self._udf_registration(["int32", "double", "string", "boolean", "timestamp"], "int64", "mult_types")

        def _func(integer, double, string, boolean, timestamp):
            return op(integer, double, string, boolean, timestamp).to_expr()

        expr = _func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ArrayExpr)

        expr = _func(1, 1.0, "a", True, ibis.timestamp("1961-04-10"))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _udf_registration_single_input(self, inputs, output, name):
        op = self._udf_registration([inputs], output, name)

        def _test_func(value):
            return op(value).to_expr()

        return _test_func

    def _udf_registration(self, inputs, output, name):
        op = udf.scalar_function(inputs, output, name=name)
        assert issubclass(op, ValueOp)
        udf.add_operation(op, name, "ibis_testing")
        return op

    def _invalid_typecasts(self, inputs, invalid_casts):
        func = self._udf_registration_single_input(inputs, "int32", "typecast")
        for in_type in invalid_casts:
            self.assertRaises(IbisTypeError, func, in_type)
Example #34
class TestFixedOffsets(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_upconvert(self):
        cases = [
            (T.day(14), 'w', T.week(2)),
            (T.hour(72), 'd', T.day(3)),
            (T.minute(240), 'h', T.hour(4)),
            (T.second(360), 'm', T.minute(6)),
            (T.second(3 * 86400), 'd', T.day(3)),
            (T.millisecond(5000), 's', T.second(5)),
            (T.microsecond(5000000), 's', T.second(5)),
            (T.nanosecond(5000000000), 's', T.second(5)),
        ]

        for offset, unit, expected in cases:
            result = offset.to_unit(unit)
            assert result.equals(expected)

    def test_multiply(self):
        offset = T.day(2)

        assert (offset * 2).equals(T.day(4))
        assert (offset * (-2)).equals(T.day(-4))
        assert (3 * offset).equals(T.day(6))
        assert ((-3) * offset).equals(T.day(-6))

    def test_repr(self):
        assert repr(T.day()) == '<Timedelta: 1 day>'
        assert repr(T.day(2)) == '<Timedelta: 2 days>'
        assert repr(T.year()) == '<Timedelta: 1 year>'
        assert repr(T.month(2)) == '<Timedelta: 2 months>'
        assert repr(T.second(40)) == '<Timedelta: 40 seconds>'

    def test_cannot_upconvert(self):
        cases = [
            (T.day(), 'w'),
            (T.hour(), 'd'),
            (T.minute(), 'h'),
            (T.second(), 'm'),
            (T.second(), 'd'),
            (T.millisecond(), 's'),
            (T.microsecond(), 's'),
            (T.nanosecond(), 's'),
        ]

        for delta, target in cases:
            self.assertRaises(IbisError, delta.to_unit, target)

    def test_downconvert_second_parts(self):
        K = 2

        sec = T.second(K)
        milli = T.millisecond(K)
        micro = T.microsecond(K)
        nano = T.nanosecond(K)

        cases = [
            (sec.to_unit('s'), T.second(K)),
            (sec.to_unit('ms'), T.millisecond(K * 1000)),
            (sec.to_unit('us'), T.microsecond(K * 1000000)),
            (sec.to_unit('ns'), T.nanosecond(K * 1000000000)),

            (milli.to_unit('ms'), T.millisecond(K)),
            (milli.to_unit('us'), T.microsecond(K * 1000)),
            (milli.to_unit('ns'), T.nanosecond(K * 1000000)),

            (micro.to_unit('us'), T.microsecond(K)),
            (micro.to_unit('ns'), T.nanosecond(K * 1000)),

            (nano.to_unit('ns'), T.nanosecond(K))
        ]
        self._check_cases(cases)

    def test_downconvert_hours(self):
        K = 2
        offset = T.hour(K)

        cases = [
            (offset.to_unit('h'), T.hour(K)),
            (offset.to_unit('m'), T.minute(K * 60)),
            (offset.to_unit('s'), T.second(K * 3600)),
            (offset.to_unit('ms'), T.millisecond(K * 3600000)),
            (offset.to_unit('us'), T.microsecond(K * 3600000000)),
            (offset.to_unit('ns'), T.nanosecond(K * 3600000000000))
        ]
        self._check_cases(cases)

    def test_downconvert_day(self):
        K = 2

        week = T.week(K)
        day = T.day(K)

        cases = [
            (week.to_unit('d'), T.day(K * 7)),
            (week.to_unit('h'), T.hour(K * 7 * 24)),

            (day.to_unit('d'), T.day(K)),
            (day.to_unit('h'), T.hour(K * 24)),
            (day.to_unit('m'), T.minute(K * 1440)),
            (day.to_unit('s'), T.second(K * 86400)),
            (day.to_unit('ms'), T.millisecond(K * 86400000)),
            (day.to_unit('us'), T.microsecond(K * 86400000000)),
            (day.to_unit('ns'), T.nanosecond(K * 86400000000000))
        ]
        self._check_cases(cases)

    def test_combine_with_different_kinds(self):
        cases = [
            (T.day() + T.minute(), T.minute(1441)),
            (T.second() + T.millisecond(10), T.millisecond(1010)),
            (T.hour() + T.minute(5) + T.second(10), T.second(3910))
        ]
        self._check_cases(cases)

    def test_timedelta_generic_api(self):
        cases = [
            (T.timedelta(weeks=2), T.week(2)),
            (T.timedelta(days=3), T.day(3)),
            (T.timedelta(hours=4), T.hour(4)),
            (T.timedelta(minutes=5), T.minute(5)),
            (T.timedelta(seconds=6), T.second(6)),
            (T.timedelta(milliseconds=7), T.millisecond(7)),
            (T.timedelta(microseconds=8), T.microsecond(8)),
            (T.timedelta(nanoseconds=9), T.nanosecond(9)),
        ]
        self._check_cases(cases)

    def _check_cases(self, cases):
        for x, y in cases:
            assert x.equals(y)

    def test_offset_timestamp_expr(self):
        c = self.table.i
        x = T.timedelta(days=1)

        expr = x + c
        assert isinstance(expr, ir.TimestampArray)
        assert isinstance(expr.op(), ops.TimestampDelta)

        # test radd
        expr = c + x
        assert isinstance(expr, ir.TimestampArray)
        assert isinstance(expr.op(), ops.TimestampDelta)
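A rule implicit in the two conversion tests above: to_unit only converts to a coarser unit when the value divides evenly, so T.day(14) becomes two weeks while a bare T.day() raises. A compact restatement:

# Upconversion requires an exact multiple of the coarser unit.
assert T.day(14).to_unit('w').equals(T.week(2))

# One day is not a whole number of weeks, so this raises IbisError.
try:
    T.day().to_unit('w')
except IbisError:
    pass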
Example #35
class TestDecimal(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.lineitem = self.con.table('tpch_lineitem')

    def test_type_metadata(self):
        col = self.lineitem.l_extendedprice
        assert isinstance(col, ir.DecimalArray)

        assert col._precision == 12
        assert col._scale == 2

    def test_cast_scalar_to_decimal(self):
        val = api.literal('1.2345')

        casted = val.cast('decimal(15,5)')
        assert isinstance(casted, ir.DecimalScalar)
        assert casted._precision == 15
        assert casted._scale == 5

    def test_decimal_aggregate_function_behavior(self):
        # From the Impala documentation: "The result of an aggregate function
        # such as MAX(), SUM(), or AVG() on DECIMAL values is promoted to a
        # scale of 38, with the same precision as the underlying column. Thus,
        # the result can represent the largest possible value at that
        # particular precision."
        col = self.lineitem.l_extendedprice
        functions = ['sum', 'mean', 'max', 'min']

        for func_name in functions:
            result = getattr(col, func_name)()
            assert isinstance(result, ir.DecimalScalar)
            assert result._precision == col._precision
            assert result._scale == 38

    def test_where(self):
        table = self.lineitem

        q = table.l_quantity
        expr = api.where(table.l_discount > 0,
                         q * table.l_discount, api.null)

        assert isinstance(expr, ir.DecimalArray)

        expr = api.where(table.l_discount > 0,
                         (q * table.l_discount).sum(), api.null)
        assert isinstance(expr, ir.DecimalArray)

        expr = api.where(table.l_discount.sum() > 0,
                         (q * table.l_discount).sum(), api.null)
        assert isinstance(expr, ir.DecimalScalar)

    def test_fillna(self):
        expr = self.lineitem.l_extendedprice.fillna(0)
        assert isinstance(expr, ir.DecimalArray)

        expr = self.lineitem.l_extendedprice.fillna(
            self.lineitem.l_quantity)
        assert isinstance(expr, ir.DecimalArray)

    def test_precision_scale(self):
        col = self.lineitem.l_extendedprice

        p = col.precision()
        s = col.scale()

        assert isinstance(p, ir.IntegerValue)
        assert isinstance(p.op(), ops.DecimalPrecision)

        assert isinstance(s, ir.IntegerValue)
        assert isinstance(s.op(), ops.DecimalScale)

    def test_invalid_precision_scale_combo(self):
        pass
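test_invalid_precision_scale_combo is left as a stub; one plausible sketch, assuming a decimal type whose scale exceeds its precision should be rejected at cast time (the exact exception type is an assumption, not confirmed by the source):

# Hypothetical body for the stub above: scale larger than precision
# is not a meaningful decimal type, so the cast should fail.
val = api.literal('1.2345')
self.assertRaises(Exception, val.cast, 'decimal(5,10)')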
Example #36
class TestASTBuilder(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

    def test_ast_with_projection_join_filter(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0

        table3 = table[filter_pred]

        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])
        result = joined[[table3, table2['value']]]

        ast = build_ast(result)
        stmt = ast.queries[0]

        assert len(stmt.select_set) == 2
        assert len(stmt.where) == 1
        assert stmt.where[0] is filter_pred

        # Check that the join has been rebuilt to only include the root tables
        tbl = stmt.table_set
        tbl_node = tbl.op()
        assert isinstance(tbl_node, ops.InnerJoin)
        assert tbl_node.left is table2
        assert tbl_node.right is table

        # table expression substitution has been made in the predicate
        assert tbl_node.predicates[0].equals(table['g'] == table2['key'])

    def test_ast_with_aggregation_join_filter(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        ast = build_ast(result)
        stmt = ast.queries[0]

        # hoisted metrics
        ex_metrics = [(table['f'] - table2['value']).mean().name('foo'),
                      table['f'].sum().name('bar')]
        ex_by = [table['g'], table2['key']]

        # hoisted join and aggregate
        expected_table_set = \
            table2.inner_join(table, [table['g'] == table2['key']])
        assert stmt.table_set.equals(expected_table_set)

        # Check various exprs
        for res, ex in zip(stmt.select_set, ex_by + ex_metrics):
            assert res.equals(ex)

        for res, ex in zip(stmt.group_by, ex_by):
            assert stmt.select_set[res].equals(ex)

        # Check we got the filter
        assert len(stmt.where) == 1
        assert stmt.where[0].equals(filter_pred)

    def test_sort_by(self):
        table = self.con.table('star1')

        what = table.sort_by('f')
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `f`"""
        assert result == expected

        what = table.sort_by(('f', 0))
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `f` DESC"""
        assert result == expected

        what = table.sort_by(['c', ('f', 0)])
        result = to_sql(what)
        expected = """SELECT *
FROM star1
ORDER BY `c`, `f` DESC"""
        assert result == expected

    def test_limit(self):
        table = self.con.table('star1').limit(10)
        result = to_sql(table)
        expected = """SELECT *
FROM star1
LIMIT 10"""
        assert result == expected

        table = self.con.table('star1').limit(10, offset=5)
        result = to_sql(table)
        expected = """SELECT *
FROM star1
LIMIT 10 OFFSET 5"""
        assert result == expected

        # Put the limit in a couple places in the stack
        table = self.con.table('star1')
        table = table[table.f > 0].limit(10)
        result = to_sql(table)

        expected = """SELECT *
FROM star1
WHERE `f` > 0
LIMIT 10"""

        assert result == expected

        table = self.con.table('star1')

        # Semantically, this should produce a subquery
        table = table.limit(10)
        table = table[table.f > 0]

        result2 = to_sql(table)

        expected2 = """SELECT *
FROM (
  SELECT *
  FROM star1
  LIMIT 10
) t0
WHERE `f` > 0"""

        assert result2 == expected2

    def test_join_with_limited_table(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        limited = t1.limit(100)
        joined = (limited.inner_join(t2,
                                     [limited.foo_id == t2.foo_id])[[limited]])

        result = to_sql(joined)
        expected = """SELECT t0.*
FROM (
  SELECT *
  FROM star1
  LIMIT 100
) t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""

        assert result == expected

    def test_sort_by_on_limit_yield_subquery(self):
        # x.limit(...).sort_by(...)
        #   is semantically different from
        # x.sort_by(...).limit(...)
        #   and will often yield different results
        t = self.con.table('functional_alltypes')
        expr = (t.group_by('string_col').aggregate(
            [t.count().name('nrows')]).limit(5).sort_by('string_col'))

        result = to_sql(expr)
        expected = """SELECT *
FROM (
  SELECT `string_col`, count(*) AS `nrows`
  FROM functional_alltypes
  GROUP BY 1
  LIMIT 5
) t0
ORDER BY `string_col`"""
        assert result == expected

    def test_multiple_limits(self):
        t = self.con.table('functional_alltypes')

        expr = t.limit(20).limit(10)
        stmt = build_ast(expr).queries[0]

        assert stmt.limit['n'] == 10

    def test_top_convenience(self):
        # x.top(10, by=field)
        # x.top(10, by=[field1, field2])
        pass

    def test_self_aggregate_in_predicate(self):
        # Per ibis #43
        pass
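Both trailing tests are stubs. For test_top_convenience, the column-level topk API (used by test_memoize_filtered_table later in this file) suggests a plausible sketch; the table-level x.top(10, by=field) spelling in the comment should be treated as aspirational:

# Sketch only: topk on a column, usable as a filter predicate.
t = self.con.table('functional_alltypes')
top_strings = t.string_col.topk(3, by=t.int_col.sum())
filtered = t[top_strings]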
Example #37
class TestBucketHistogram(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_bucket_to_case(self):
        buckets = [0, 10, 25, 50]

        expr1 = self.table.f.bucket(buckets)
        expected1 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr2 = self.table.f.bucket(buckets, close_extreme=False)
        expected2 = """\
CASE
  WHEN (`f` >= 0) AND (`f` < 10) THEN 0
  WHEN (`f` >= 10) AND (`f` < 25) THEN 1
  WHEN (`f` >= 25) AND (`f` < 50) THEN 2
  ELSE NULL
END"""

        expr3 = self.table.f.bucket(buckets, closed='right')
        expected3 = """\
CASE
  WHEN (`f` >= 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr4 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False)
        expected4 = """\
CASE
  WHEN (`f` > 0) AND (`f` <= 10) THEN 0
  WHEN (`f` > 10) AND (`f` <= 25) THEN 1
  WHEN (`f` > 25) AND (`f` <= 50) THEN 2
  ELSE NULL
END"""

        expr5 = self.table.f.bucket(buckets, include_under=True)
        expected5 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr6 = self.table.f.bucket(buckets,
                                    include_under=True,
                                    include_over=True)
        expected6 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
  WHEN `f` > 50 THEN 4
  ELSE NULL
END"""

        expr7 = self.table.f.bucket(buckets,
                                    close_extreme=False,
                                    include_under=True,
                                    include_over=True)
        expected7 = """\
CASE
  WHEN `f` < 0 THEN 0
  WHEN (`f` >= 0) AND (`f` < 10) THEN 1
  WHEN (`f` >= 10) AND (`f` < 25) THEN 2
  WHEN (`f` >= 25) AND (`f` < 50) THEN 3
  WHEN `f` >= 50 THEN 4
  ELSE NULL
END"""

        expr8 = self.table.f.bucket(buckets, closed='right',
                                    close_extreme=False,
                                    include_under=True)
        expected8 = """\
CASE
  WHEN `f` <= 0 THEN 0
  WHEN (`f` > 0) AND (`f` <= 10) THEN 1
  WHEN (`f` > 10) AND (`f` <= 25) THEN 2
  WHEN (`f` > 25) AND (`f` <= 50) THEN 3
  ELSE NULL
END"""

        expr9 = self.table.f.bucket([10], closed='right',
                                    include_over=True,
                                    include_under=True)
        expected9 = """\
CASE
  WHEN `f` <= 10 THEN 0
  WHEN `f` > 10 THEN 1
  ELSE NULL
END"""

        expr10 = self.table.f.bucket([10], include_over=True,
                                     include_under=True)
        expected10 = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        cases = [
            (expr1, expected1),
            (expr2, expected2),
            (expr3, expected3),
            (expr4, expected4),
            (expr5, expected5),
            (expr6, expected6),
            (expr7, expected7),
            (expr8, expected8),
            (expr9, expected9),
            (expr10, expected10),
        ]
        self._check_expr_cases(cases)

    def test_cast_category_to_int_noop(self):
        # Because the bucket result is an integer, no explicit cast is
        # necessary
        expr = (self.table.f.bucket([10], include_over=True,
                                    include_under=True)
                .cast('int32'))

        expected = """\
CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END"""

        expr2 = (self.table.f.bucket([10], include_over=True,
                                     include_under=True)
                 .cast('double'))

        expected2 = """\
CAST(CASE
  WHEN `f` < 10 THEN 0
  WHEN `f` >= 10 THEN 1
  ELSE NULL
END AS double)"""

        self._check_expr_cases([(expr, expected),
                                (expr2, expected2)])

    def test_bucket_assign_labels(self):
        buckets = [0, 10, 25, 50]
        bucket = self.table.f.bucket(buckets, include_under=True)

        size = self.table.group_by(bucket.name('tier')).size()
        labelled = size.tier.label(['Under 0', '0 to 10',
                                    '10 to 25', '25 to 50'],
                                   nulls='error').name('tier2')
        expr = size[labelled, size['count']]

        expected = """\
SELECT
  CASE `tier`
    WHEN 0 THEN 'Under 0'
    WHEN 1 THEN '0 to 10'
    WHEN 2 THEN '10 to 25'
    WHEN 3 THEN '25 to 50'
    ELSE 'error'
  END AS `tier2`, `count`
FROM (
  SELECT
    CASE
      WHEN `f` < 0 THEN 0
      WHEN (`f` >= 0) AND (`f` < 10) THEN 1
      WHEN (`f` >= 10) AND (`f` < 25) THEN 2
      WHEN (`f` >= 25) AND (`f` <= 50) THEN 3
      ELSE NULL
    END AS `tier`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""

        result = to_sql(expr)

        assert result == expected

        self.assertRaises(ValueError, size.tier.label, ['a', 'b', 'c'])
        self.assertRaises(ValueError, size.tier.label,
                          ['a', 'b', 'c', 'd', 'e'])
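The ten bucket cases pivot on three knobs, all visible in the expected CASE statements above: closed selects which bin edge is inclusive, close_extreme closes the outermost edge, and include_under/include_over prepend and append catch-all bins, shifting the labels accordingly. In brief:

buckets = [0, 10, 25, 50]
f = self.table.f

f.bucket(buckets)                  # [0,10) [10,25) [25,50]  -> 0,1,2
f.bucket(buckets, closed='right')  # [0,10] (10,25] (25,50]  -> 0,1,2
f.bucket(buckets, include_under=True, include_over=True)
#   f < 0 -> 0, the three bins -> 1..3, f > 50 -> 4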
Example #38
class TestCreateTable(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()

        self.t = t = self.con.table("functional_alltypes")
        self.expr = t[t.bigint_col > 0]

    def test_create_external_table_as(self):
        path = "/path/to/table"
        select = build_ast(self.con.table("test1")).queries[0]
        statement = ddl.CTAS("another_table", select, external=True, can_exist=False, path=path, database="foo")
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(
            path
        )
        assert result == expected

    def test_create_table_with_location(self):
        path = "/path/to/table"
        schema = ibis.schema([("foo", "string"), ("bar", "int8"), ("baz", "int16")])
        statement = ddl.CreateTableWithSchema(
            "another_table", schema, ddl.NoFormat(), can_exist=False, path=path, database="foo"
        )
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
LOCATION '{0}'""".format(
            path
        )
        assert result == expected

    def test_create_table_like_parquet(self):
        directory = "/path/to/"
        path = "/path/to/parquetfile"
        statement = ddl.CreateTableParquet("new_table", directory, example_file=path, can_exist=True, database="foo")

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(
            path, directory
        )

        assert result == expected

    def test_create_table_parquet_like_other(self):
        # alternative to "LIKE PARQUET"
        directory = "/path/to/"
        example_table = "db.other"

        statement = ddl.CreateTableParquet(
            "new_table", directory, example_table=example_table, can_exist=True, database="foo"
        )

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(
            example_table, directory
        )

        assert result == expected

    def test_create_table_parquet_with_schema(self):
        directory = "/path/to/"

        schema = ibis.schema([("foo", "string"), ("bar", "int8"), ("baz", "int16")])

        statement = ddl.CreateTableParquet(
            "new_table", directory, schema=schema, external=True, can_exist=True, database="foo"
        )

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(
            directory
        )

        assert result == expected

    def test_create_table_delimited(self):
        path = "/path/to/files/"
        schema = ibis.schema([("a", "string"), ("b", "int32"), ("c", "double"), ("d", "decimal(12,2)")])

        stmt = ddl.CreateTableDelimited(
            "new_table",
            path,
            schema,
            delimiter="|",
            escapechar="\\",
            lineterminator="\0",
            database="foo",
            can_exist=True,
        )

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(
            path
        )
        assert result == expected

    def test_create_external_table_avro(self):
        path = "/path/to/files/"

        avro_schema = {
            "fields": [
                {"name": "a", "type": "string"},
                {"name": "b", "type": "int"},
                {"name": "c", "type": "double"},
                {"type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2, "name": "d"},
            ],
            "name": "my_record",
            "type": "record",
        }

        stmt = ddl.CreateTableAvro("new_table", path, avro_schema, database="foo", can_exist=True)

        result = stmt.compile()
        expected = (
            """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES ('avro.schema.literal'='{
  "fields": [
    {
      "name": "a",
      "type": "string"
    },
    {
      "name": "b",
      "type": "int"
    },
    {
      "name": "c",
      "type": "double"
    },
    {
      "logicalType": "decimal",
      "name": "d",
      "precision": 4,
      "scale": 2,
      "type": "bytes"
    }
  ],
  "name": "my_record",
  "type": "record"
}')"""
            % path
        )
        assert result == expected

    def test_create_table_parquet(self):
        statement = _create_table("some_table", self.expr, database="bar", can_exist=False)
        result = statement.compile()

        expected = """\
CREATE TABLE bar.`some_table`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_no_overwrite(self):
        statement = _create_table("tname", self.expr, can_exist=True)
        result = statement.compile()

        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_avro_other_formats(self):
        statement = _create_table("tname", self.t, format="avro", can_exist=True)
        result = statement.compile()
        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS AVRO
AS
SELECT *
FROM functional_alltypes"""
        assert result == expected

        self.assertRaises(ValueError, _create_table, "tname", self.t, format="foo")

    def test_partition_by(self):
        pass
Example #39
class TestNonTabularResults(unittest.TestCase):
    """

    """
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_simple_scalar_aggregates(self):
        from pandas import DataFrame

        # Things like table.column.{sum, mean, ...}()
        table = self.con.table('alltypes')

        expr = table[table.c > 0].f.sum()

        ast = build_ast(expr)
        query = ast.queries[0]

        sql_query = query.compile()
        expected = """SELECT sum(`f`) AS `tmp`
FROM alltypes
WHERE `c` > 0"""

        assert sql_query == expected

        # Maybe the result handler should act on the cursor. Not sure.
        handler = query.result_handler
        output = DataFrame({'tmp': [5]})
        assert handler(output) == 5

    def test_table_column_unbox(self):
        from pandas import DataFrame

        table = self.table
        m = table.f.sum().name('total')
        agged = table[table.c > 0].group_by('g').aggregate([m])
        expr = agged.g

        ast = build_ast(expr)
        query = ast.queries[0]

        sql_query = query.compile()
        expected = """SELECT `g`, sum(`f`) AS `total`
FROM alltypes
WHERE `c` > 0
GROUP BY 1"""

        assert sql_query == expected

        # Maybe the result handler should act on the cursor. Not sure.
        handler = query.result_handler
        output = DataFrame({'g': ['foo', 'bar', 'baz']})
        assert (handler(output) == output['g']).all()

    def test_complex_array_expr_projection(self):
        # May require finding the base table and forming a projection.
        expr = (self.table.group_by('g').aggregate(
            [self.table.count().name('count')]))
        expr2 = expr.g.cast('double')

        query = to_sql(expr2)
        expected = """SELECT CAST(`g` AS double) AS `tmp`
FROM (
  SELECT `g`, count(*) AS `count`
  FROM alltypes
  GROUP BY 1
) t0"""
        assert query == expected

    def test_scalar_exprs_no_table_refs(self):
        expr1 = ibis.now()
        expected1 = """\
SELECT now() AS `tmp`"""

        expr2 = ibis.literal(1) + ibis.literal(2)
        expected2 = """\
SELECT 1 + 2 AS `tmp`"""

        cases = [(expr1, expected1), (expr2, expected2)]

        for expr, expected in cases:
            result = to_sql(expr)
            assert result == expected

    def test_expr_list_no_table_refs(self):
        exlist = ibis.api.expr_list([
            ibis.literal(1).name('a'),
            ibis.now().name('b'),
            ibis.literal(2).log().name('c')
        ])
        result = to_sql(exlist)
        expected = """\
SELECT 1 AS `a`, now() AS `b`, ln(2) AS `c`"""
        assert result == expected

    def test_isnull_case_expr_rewrite_failure(self):
        # #172, case expression that was not being properly converted into an
        # aggregation
        reduction = self.table.g.isnull().ifelse(1, 0).sum()

        result = to_sql(reduction)
        expected = """\
SELECT sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END) AS `tmp`
FROM alltypes"""
        assert result == expected
Example #40
class TestCreateTable(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

        self.t = t = self.con.table('functional_alltypes')
        self.expr = t[t.bigint_col > 0]

    def test_create_external_table_as(self):
        path = '/path/to/table'
        select = build_ast(self.con.table('test1')).queries[0]
        statement = ddl.CTAS('another_table',
                             select,
                             external=True,
                             can_exist=False,
                             path=path,
                             database='foo')
        result = statement.compile()

        expected = """\
CREATE EXTERNAL TABLE foo.`another_table`
STORED AS PARQUET
LOCATION '{0}'
AS
SELECT *
FROM test1""".format(path)
        assert result == expected

    def test_create_table_with_location(self):
        path = '/path/to/table'
        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])
        statement = ddl.CreateTableWithSchema('another_table', schema,
                                              ddl.NoFormat(),
                                              can_exist=False,
                                              path=path, database='foo')
        result = statement.compile()

        expected = """\
CREATE TABLE foo.`another_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_table_like_parquet(self):
        directory = '/path/to/'
        path = '/path/to/parquetfile'
        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_file=path,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(path, directory)

        assert result == expected

    def test_create_table_parquet_like_other(self):
        # alternative to "LIKE PARQUET"
        directory = '/path/to/'
        example_table = 'db.other'

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           example_table=example_table,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(example_table, directory)

        assert result == expected

    def test_create_table_parquet_with_schema(self):
        directory = '/path/to/'

        schema = ibis.schema([('foo', 'string'),
                              ('bar', 'int8'),
                              ('baz', 'int16')])

        statement = ddl.CreateTableParquet('new_table',
                                           directory,
                                           schema=schema,
                                           external=True,
                                           can_exist=True,
                                           database='foo')

        result = statement.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(directory)

        assert result == expected

    def test_create_table_delimited(self):
        path = '/path/to/files/'
        schema = ibis.schema([('a', 'string'),
                              ('b', 'int32'),
                              ('c', 'double'),
                              ('d', 'decimal(12,2)')])

        stmt = ddl.CreateTableDelimited('new_table', path, schema,
                                        delimiter='|',
                                        escapechar='\\',
                                        lineterminator='\0',
                                        database='foo',
                                        can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`a` string,
 `b` int,
 `c` double,
 `d` decimal(12,2))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
ESCAPED BY '\\'
LINES TERMINATED BY '\0'
LOCATION '{0}'""".format(path)
        assert result == expected

    def test_create_external_table_avro(self):
        path = '/path/to/files/'

        avro_schema = {
            'fields': [
                {'name': 'a', 'type': 'string'},
                {'name': 'b', 'type': 'int'},
                {'name': 'c', 'type': 'double'},
                {"type": "bytes",
                 "logicalType": "decimal",
                 "precision": 4,
                 "scale": 2,
                 'name': 'd'}
            ],
            'name': 'my_record',
            'type': 'record'
        }

        stmt = ddl.CreateTableAvro('new_table', path, avro_schema,
                                   database='foo', can_exist=True)

        result = stmt.compile()
        expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
STORED AS AVRO
LOCATION '%s'
TBLPROPERTIES ('avro.schema.literal'='{
  "fields": [
    {
      "name": "a",
      "type": "string"
    },
    {
      "name": "b",
      "type": "int"
    },
    {
      "name": "c",
      "type": "double"
    },
    {
      "logicalType": "decimal",
      "name": "d",
      "precision": 4,
      "scale": 2,
      "type": "bytes"
    }
  ],
  "name": "my_record",
  "type": "record"
}')""" % path
        assert result == expected

    def test_create_table_parquet(self):
        statement = _create_table('some_table', self.expr,
                                  database='bar',
                                  can_exist=False)
        result = statement.compile()

        expected = """\
CREATE TABLE bar.`some_table`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_no_overwrite(self):
        statement = _create_table('tname', self.expr, can_exist=True)
        result = statement.compile()

        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS PARQUET
AS
SELECT *
FROM functional_alltypes
WHERE `bigint_col` > 0"""
        assert result == expected

    def test_avro_other_formats(self):
        statement = _create_table('tname', self.t, format='avro',
                                  can_exist=True)
        result = statement.compile()
        expected = """\
CREATE TABLE IF NOT EXISTS `tname`
STORED AS AVRO
AS
SELECT *
FROM functional_alltypes"""
        assert result == expected

        self.assertRaises(ValueError, _create_table, 'tname', self.t,
                          format='foo')

    def test_partition_by(self):
        pass
Example #41
class TestBuiltins(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')
        self.lineitem = self.con.table('tpch_lineitem')

    def test_abs(self):
        colnames = ['tinyint_col', 'smallint_col', 'int_col', 'bigint_col',
                    'float_col', 'double_col']

        fname = 'abs'
        op = ops.Abs

        for col in colnames:
            expr = self.alltypes[col]
            self._check_unary_op(expr, fname, op, type(expr))

        expr = self.lineitem.l_extendedprice
        self._check_unary_op(expr, fname, op, type(expr))

    def test_group_concat(self):
        col = self.alltypes.string_col

        expr = col.group_concat()
        assert isinstance(expr.op(), ops.GroupConcat)
        arg, sep = expr.op().args
        assert sep == ','

        expr = col.group_concat('|')
        arg, sep = expr.op().args
        assert sep == '|'

    def test_zeroifnull(self):
        dresult = self.alltypes.double_col.zeroifnull()
        iresult = self.alltypes.int_col.zeroifnull()

        assert type(dresult.op()) == ops.ZeroIfNull
        assert type(dresult) == ir.DoubleArray

        # Impala upconverts all ints to bigint.
        assert type(iresult) == ir.Int64Array

    def test_fillna(self):
        result = self.alltypes.double_col.fillna(5)
        assert isinstance(result, ir.DoubleArray)

        assert isinstance(result.op(), ops.IfNull)

        result = self.alltypes.bool_col.fillna(True)
        assert isinstance(result, ir.BooleanArray)

        # Retains type of caller (for now)
        result = self.alltypes.int_col.fillna(self.alltypes.bigint_col)
        assert isinstance(result, ir.Int32Array)

    def test_ceil_floor(self):
        cresult = self.alltypes.double_col.ceil()
        fresult = self.alltypes.double_col.floor()
        assert isinstance(cresult, ir.Int64Array)
        assert isinstance(fresult, ir.Int64Array)
        assert type(cresult.op()) == ops.Ceil
        assert type(fresult.op()) == ops.Floor

        cresult = api.literal(1.2345).ceil()
        fresult = api.literal(1.2345).floor()
        assert isinstance(cresult, ir.Int64Scalar)
        assert isinstance(fresult, ir.Int64Scalar)

        dec_col = self.lineitem.l_extendedprice
        cresult = dec_col.ceil()
        fresult = dec_col.floor()
        assert isinstance(cresult, ir.DecimalArray)
        assert cresult.meta == dec_col.meta

        assert isinstance(fresult, ir.DecimalArray)
        assert fresult.meta == dec_col.meta

    def test_sign(self):
        result = self.alltypes.double_col.sign()
        assert isinstance(result, ir.FloatArray)
        assert type(result.op()) == ops.Sign

        result = api.literal(1.2345).sign()
        assert isinstance(result, ir.FloatScalar)

        dec_col = self.lineitem.l_extendedprice
        result = dec_col.sign()
        assert isinstance(result, ir.FloatArray)

    def test_round(self):
        result = self.alltypes.double_col.round()
        assert isinstance(result, ir.Int64Array)
        assert result.op().args[1] is None

        result = self.alltypes.double_col.round(2)
        assert isinstance(result, ir.DoubleArray)
        assert result.op().args[1] == 2

        # Rounding an integer with explicit digits yields a double (at least
        # in Impala; check with other DB implementations)
        result = self.alltypes.int_col.round(2)
        assert isinstance(result, ir.DoubleArray)

        dec = self.lineitem.l_extendedprice
        result = dec.round()
        assert isinstance(result, ir.DecimalArray)

        result = dec.round(2)
        assert isinstance(result, ir.DecimalArray)

        result = api.literal(1.2345).round()
        assert isinstance(result, ir.Int64Scalar)

    def _check_unary_op(self, expr, fname, ex_op, ex_type):
        result = getattr(expr, fname)()
        assert type(result.op()) == ex_op
        assert type(result) == ex_type
Example #42
class TestStringOps(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_lower_upper(self):
        lresult = self.table.g.lower()
        uresult = self.table.g.upper()

        assert isinstance(lresult, ir.StringArray)
        assert isinstance(uresult, ir.StringArray)

        assert isinstance(lresult.op(), ops.Lowercase)
        assert isinstance(uresult.op(), ops.Uppercase)

        lit = literal('FoO')

        lresult = lit.lower()
        uresult = lit.upper()
        assert isinstance(lresult, ir.StringScalar)
        assert isinstance(uresult, ir.StringScalar)

    def test_substr(self):
        lit = literal('FoO')

        result = self.table.g.substr(2, 4)
        lit_result = lit.substr(0, 2)

        assert isinstance(result, ir.StringArray)
        assert isinstance(lit_result, ir.StringScalar)

        op = result.op()
        assert isinstance(op, ops.Substring)

        start, length = op.args[1:]

        assert start.equals(literal(2))
        assert length.equals(literal(4))

    def test_left_right(self):
        result = self.table.g.left(5)
        expected = self.table.g.substr(0, 5)
        assert result.equals(expected)

        result = self.table.g.right(5)
        op = result.op()
        assert isinstance(op, ops.StrRight)
        assert op.args[1].equals(literal(5))

    def test_length(self):
        lit = literal('FoO')
        result = self.table.g.length()
        lit_result = lit.length()

        assert isinstance(result, ir.Int32Array)
        assert isinstance(lit_result, ir.Int32Scalar)
        assert isinstance(result.op(), ops.StringLength)

    def test_join(self):
        dash = literal('-')

        expr = dash.join([self.table.f.cast('string'),
                          self.table.g])
        assert isinstance(expr, ir.StringArray)

        expr = dash.join([literal('ab'), literal('cd')])
        assert isinstance(expr, ir.StringScalar)

    def test_contains(self):
        expr = self.table.g.contains('foo')
        expected = self.table.g.find('foo') >= 0
        assert_equal(expr, expected)

        self.assertRaises(Exception, lambda: 'foo' in self.table.g)

    def test_getitem_slice(self):
        cases = [
            (self.table.g[:3], self.table.g.substr(0, 3)),
            (self.table.g[2:6], self.table.g.substr(2, 4)),
        ]

        for case, expected in cases:
            assert_equal(case, expected)
Example #43
class TestTimestamp(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('alltypes')
        self.col = self.alltypes.i

    def test_field_select(self):
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        casted = self.alltypes.g.cast('timestamp')
        assert isinstance(casted, ir.TimestampArray)

        string = api.literal('2000-01-01')
        casted = string.cast('timestamp')
        assert isinstance(casted, ir.TimestampScalar)

    def test_extract_fields(self):
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array)
        ]

        for attr, ex_op, ex_type in cases:
            result = getattr(self.col, attr)()
            assert isinstance(result, ex_type)
            assert isinstance(result.op(), ex_op)

    def test_extract_no_propagate_name(self):
        # see #146
        table = self.con.table('functional_alltypes')

        expr = table.timestamp_col.hour()
        self.assertRaises(com.ExpressionError, expr.get_name)

    def test_now(self):
        result = api.now()
        assert isinstance(result, ir.TimestampScalar)
        assert isinstance(result.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        ts_str = '2015-01-01 00:00:00'
        val = pd.Timestamp(ts_str)

        expr = ibis.literal(val)
        assert isinstance(expr, ir.TimestampScalar)

        expr = ibis.timestamp(ts_str)
        assert isinstance(expr, ir.TimestampScalar)

        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        val = '2015-01-01 00:00:00'
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)

        expr2 = val < self.col
        op = expr2.op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        val = pd.Timestamp('2015-01-01 00:00:00')
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)
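The test_integer_to_timestamp stub (#246) can be sketched with the to_timestamp API exercised by the value-expression tests later in this file; assuming integer columns accept an optional unit:

# Sketch for the stub above; 'ms' is one of the units translated
# in the to_timestamp cases further down in this file.
col = self.alltypes.c
expr = col.to_timestamp('ms')
assert isinstance(expr, ir.TimestampArray)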
Example #44
class TestInsertLoadData(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.t = self.con.table('functional_alltypes')

    def test_select_basics(self):
        name = 'testing123456'

        expr = self.t.limit(10)
        select, _ = _get_select(expr)

        stmt = ddl.InsertSelect(name, select, database='foo')
        result = stmt.compile()

        expected = """\
INSERT INTO foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

        stmt = ddl.InsertSelect(name, select, database='foo', overwrite=True)
        result = stmt.compile()

        expected = """\
INSERT OVERWRITE foo.`testing123456`
SELECT *
FROM functional_alltypes
LIMIT 10"""
        assert result == expected

    def test_load_data_unpartitioned(self):
        path = '/path/to/data'
        stmt = ddl.LoadData('functional_alltypes', path, database='foo')

        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "INTO TABLE foo.`functional_alltypes`")
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = ("LOAD DATA INPATH '/path/to/data' "
                    "OVERWRITE INTO TABLE foo.`functional_alltypes`")
        assert result == expected

    def test_load_data_partitioned(self):
        path = '/path/to/data'
        part = {'year': 2007, 'month': 7}
        part_schema = ibis.schema([('year', 'int32'), ('month', 'int32')])
        stmt = ddl.LoadData('functional_alltypes',
                            path,
                            database='foo',
                            partition=part,
                            partition_schema=part_schema)

        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

        stmt.overwrite = True
        result = stmt.compile()
        expected = """\
LOAD DATA INPATH '/path/to/data' OVERWRITE INTO TABLE foo.`functional_alltypes`
PARTITION (year=2007, month=7)"""
        assert result == expected

    def test_select_overwrite(self):
        pass
Example #45
class TestCaseExprs(unittest.TestCase, ExprSQLTest, ExprTestCases):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_isnull_1_0(self):
        expr = self.table.g.isnull().ifelse(1, 0)

        result = self._translate(expr)
        expected = 'CASE WHEN `g` IS NULL THEN 1 ELSE 0 END'
        assert result == expected

        # inside some other function
        result = self._translate(expr.sum())
        expected = 'sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END)'
        assert result == expected

    def test_simple_case(self):
        expr = self._case_simple_case()
        result = self._translate(expr)
        expected = """CASE `g`
  WHEN 'foo' THEN 'bar'
  WHEN 'baz' THEN 'qux'
  ELSE 'default'
END"""
        assert result == expected

    def test_search_case(self):
        expr = self._case_search_case()
        result = self._translate(expr)
        expected = """CASE
  WHEN `f` > 0 THEN `d` * 2
  WHEN `c` < 0 THEN `a` * 2
  ELSE NULL
END"""
        assert result == expected

    def test_where_use_if(self):
        expr = ibis.where(self.table.f > 0, self.table.e, self.table.a)
        assert isinstance(expr, ir.FloatValue)

        result = self._translate(expr)
        expected = "if(`f` > 0, `e`, `a`)"
        assert result == expected

    def test_nullif_ifnull(self):
        table = self.con.table('tpch_lineitem')

        f = table.l_quantity

        cases = [
            (f.nullif(f == 0),
             'nullif(`l_quantity`, `l_quantity` = 0)'),
            (f.fillna(0),
             'isnull(`l_quantity`, CAST(0 AS decimal(12,2)))'),
        ]
        self._check_expr_cases(cases)

    def test_decimal_fillna_cast_arg(self):
        table = self.con.table('tpch_lineitem')
        f = table.l_extendedprice

        cases = [
            (f.fillna(0),
             'isnull(`l_extendedprice`, CAST(0 AS decimal(12,2)))'),
            (f.fillna(0.0), 'isnull(`l_extendedprice`, 0.0)'),
        ]
        self._check_expr_cases(cases)
Example #46
class TestExprFormatting(unittest.TestCase):
    # Uncertain about how much we want to commit to unit tests around the
    # particulars of the output at the moment.

    def setUp(self):
        self.schema = [
            ('a', 'int8'),
            ('b', 'int16'),
            ('c', 'int32'),
            ('d', 'int64'),
            ('e', 'float'),
            ('f', 'double'),
            ('g', 'string'),
            ('h', 'boolean')
        ]
        self.schema_dict = dict(self.schema)
        self.table = ibis.table(self.schema)
        self.con = MockConnection()

    def test_format_table_column(self):
        # GH #507
        result = repr(self.table.f)
        assert 'Column[array(double)]' in result

    def test_format_projection(self):
        # This should produce a ref to the projection
        proj = self.table[['c', 'a', 'f']]
        repr(proj['a'])

    def test_table_type_output(self):
        foo = ibis.table(
            [
                ('job', 'string'),
                ('dept_id', 'string'),
                ('year', 'int32'),
                ('y', 'double')
            ], 'foo')

        expr = foo.dept_id == foo.view().dept_id
        result = repr(expr)
        assert 'SelfReference[table]' in result
        assert 'UnboundTable[table]' in result

    def test_memoize_aggregate_correctly(self):
        table = self.table

        agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis')
        agg_exprs = [table['a'].sum().name('sum(a)'),
                     table['b'].mean().name('mean(b)'), agg_expr]

        result = table.aggregate(agg_exprs, by=['g'])

        formatter = ExprFormatter(result)
        formatted = formatter.get_result()

        alias = formatter.memo.get_alias(table.op())
        assert formatted.count(alias) == 7

    def test_aggregate_arg_names(self):
        # Not sure how to test this *well*

        t = self.table

        by_exprs = [t.g.name('key1'), t.f.round().name('key2')]
        agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')]

        expr = self.table.group_by(by_exprs).aggregate(agg_exprs)
        result = repr(expr)
        assert 'metrics' in result
        assert 'by' in result

    def test_format_multiple_join_with_projection(self):
        # Star schema with fact table
        table = ibis.table([
            ('c', 'int32'),
            ('f', 'double'),
            ('foo_id', 'string'),
            ('bar_id', 'string'),
        ])

        table2 = ibis.table([
            ('foo_id', 'string'),
            ('value1', 'double')
        ])

        table3 = ibis.table([
            ('bar_id', 'string'),
            ('value2', 'double')
        ])

        filtered = table[table['f'] > 0]

        pred1 = table['foo_id'] == table2['foo_id']
        pred2 = filtered['bar_id'] == table3['bar_id']

        j1 = filtered.left_join(table2, [pred1])
        j2 = j1.inner_join(table3, [pred2])

        # Project out the desired fields
        view = j2[[table, table2['value1'], table3['value2']]]

        # it works!
        repr(view)

    def test_memoize_database_table(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        formatted = repr(result)
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1

    def test_memoize_filtered_table(self):
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())

        result = repr(delay_filter)
        assert result.count('Filter') == 1

    def test_memoize_insert_sort_key(self):
        table = self.con.table('airlines')

        t = table['arrdelay', 'dest']
        expr = (t.group_by('dest')
                .mutate(dest_avg=t.arrdelay.mean(),
                        dev=t.arrdelay - t.arrdelay.mean()))

        worst = expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10)

        result = repr(worst)
        assert result.count('airlines') == 1

    def test_named_value_expr_show_name(self):
        expr = self.table.f * 2
        expr2 = expr.name('baz')

        # it works!
        repr(expr)

        result2 = repr(expr2)

        # not really committing to a particular output yet
        assert 'baz' in result2

    def test_memoize_filtered_tables_in_join(self):
        # related: GH #667
        purchases = ibis.table([('region', 'string'),
                                ('kind', 'string'),
                                ('user', 'int64'),
                                ('amount', 'double')], 'purchases')

        metric = purchases.amount.sum().name('total')
        agged = (purchases.group_by(['region', 'kind'])
                 .aggregate(metric))

        left = agged[agged.kind == 'foo']
        right = agged[agged.kind == 'bar']

        cond = left.region == right.region
        joined = left.join(right, cond)

        result = repr(joined)
        assert result.count('Filter') == 2
Example #47
class TestValueExprs(unittest.TestCase, ExprSQLTest):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table("alltypes")

        self.int_cols = ["a", "b", "c", "d"]
        self.bool_cols = ["h"]
        self.float_cols = ["e", "f"]

    def _check_literals(self, cases):
        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_string_literals(self):
        cases = [("simple", "'simple'"), ("I can't", "'I can\\'t'"), ('An "escape"', "'An \"escape\"'")]

        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_decimal_builtins(self):
        t = self.con.table("tpch_lineitem")
        col = t.l_extendedprice
        cases = [(col.precision(), "precision(`l_extendedprice`)"), (col.scale(), "scale(`l_extendedprice`)")]
        self._check_expr_cases(cases)

    def test_number_boolean_literals(self):
        cases = [(5, "5"), (1.5, "1.5"), (True, "TRUE"), (False, "FALSE")]
        self._check_literals(cases)

    def test_column_ref_table_aliases(self):
        context = ImpalaContext()

        table1 = ibis.table([("key1", "string"), ("value1", "double")])

        table2 = ibis.table([("key2", "string"), ("value and2", "double")])

        context.set_ref(table1, "t0")
        context.set_ref(table2, "t1")

        expr = table1["value1"] - table2["value and2"]

        result = self._translate(expr, context=context)
        expected = "t0.`value1` - t1.`value and2`"
        assert result == expected

    def test_column_ref_quoting(self):
        schema = [("has a space", "double")]
        table = ibis.table(schema)
        self._translate(table["has a space"], "`has a space`")

    def test_identifier_quoting(self):
        schema = [("date", "double"), ("table", "string")]
        table = ibis.table(schema)
        self._translate(table["date"], "`date`")
        self._translate(table["table"], "`table`")

    def test_named_expressions(self):
        a, b, g = self.table.get_columns(["a", "b", "g"])

        cases = [
            (g.cast("double").name("g_dub"), "CAST(`g` AS double) AS `g_dub`"),
            (g.name("has a space"), "`g` AS `has a space`"),
            (((a - b) * a).name("expr"), "(`a` - `b`) * `a` AS `expr`"),
        ]

        return self._check_expr_cases(cases, named=True)

    def test_binary_infix_operators(self):
        # For each function, verify that the generated code is what we expect
        a, b, h = self.table.get_columns(["a", "b", "h"])
        bool_col = a > 0

        cases = [
            (a + b, "`a` + `b`"),
            (a - b, "`a` - `b`"),
            (a * b, "`a` * `b`"),
            (a / b, "`a` / `b`"),
            (a ** b, "pow(`a`, `b`)"),
            (a < b, "`a` < `b`"),
            (a <= b, "`a` <= `b`"),
            (a > b, "`a` > `b`"),
            (a >= b, "`a` >= `b`"),
            (a == b, "`a` = `b`"),
            (a != b, "`a` != `b`"),
            (h & bool_col, "`h` AND (`a` > 0)"),
            (h | bool_col, "`h` OR (`a` > 0)"),
            # XOR has no direct operator; it is expanded by brute force
            (h ^ bool_col, "(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))"),
        ]
        self._check_expr_cases(cases)

    def test_binary_infix_parenthesization(self):
        a, b, c = self.table.get_columns(["a", "b", "c"])

        cases = [
            ((a + b) + c, "(`a` + `b`) + `c`"),
            (a.log() + c, "ln(`a`) + `c`"),
            (b + (-(a + c)), "`b` + (-(`a` + `c`))"),
        ]

        self._check_expr_cases(cases)

    def test_between(self):
        cases = [(self.table.f.between(0, 1), "`f` BETWEEN 0 AND 1")]
        self._check_expr_cases(cases)

    def test_isnull_notnull(self):
        cases = [
            (self.table["g"].isnull(), "`g` IS NULL"),
            (self.table["a"].notnull(), "`a` IS NOT NULL"),
            ((self.table["a"] + self.table["b"]).isnull(), "`a` + `b` IS NULL"),
        ]
        self._check_expr_cases(cases)

    def test_casts(self):
        a, d, g = self.table.get_columns(["a", "d", "g"])
        cases = [
            (a.cast("int16"), "CAST(`a` AS smallint)"),
            (a.cast("int32"), "CAST(`a` AS int)"),
            (a.cast("int64"), "CAST(`a` AS bigint)"),
            (a.cast("float"), "CAST(`a` AS float)"),
            (a.cast("double"), "CAST(`a` AS double)"),
            (a.cast("string"), "CAST(`a` AS string)"),
            (d.cast("int8"), "CAST(`d` AS tinyint)"),
            (g.cast("double"), "CAST(`g` AS double)"),
            (g.cast("timestamp"), "CAST(`g` AS timestamp)"),
        ]
        self._check_expr_cases(cases)

    def test_misc_conditionals(self):
        a = self.table.a
        cases = [(a.nullif(0), "nullif(`a`, 0)")]
        self._check_expr_cases(cases)

    def test_decimal_casts(self):
        cases = [
            (L("9.9999999").cast("decimal(38,5)"), "CAST('9.9999999' AS decimal(38,5))"),
            (self.table.f.cast("decimal(12,2)"), "CAST(`f` AS decimal(12,2))"),
        ]
        self._check_expr_cases(cases)

    def test_negate(self):
        cases = [(-self.table["a"], "-`a`"), (-self.table["f"], "-`f`"), (-self.table["h"], "NOT `h`")]
        self._check_expr_cases(cases)

    def test_timestamp_extract_field(self):
        fields = ["year", "month", "day", "hour", "minute", "second", "millisecond"]

        cases = [(getattr(self.table.i, field)(), "extract(`i`, '{0}')".format(field)) for field in fields]
        self._check_expr_cases(cases)

        # integration with SQL translation
        expr = self.table[
            self.table.i.year().name("year"), self.table.i.month().name("month"), self.table.i.day().name("day")
        ]

        result = to_sql(expr)
        expected = """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`,
       extract(`i`, 'day') AS `day`
FROM alltypes"""
        assert result == expected

    def test_timestamp_now(self):
        cases = [(ibis.now(), "now()")]
        self._check_expr_cases(cases)

    def test_timestamp_deltas(self):
        units = ["year", "month", "week", "day", "hour", "minute", "second", "millisecond", "microsecond"]

        t = self.table.i
        f = "`i`"

        cases = []
        for unit in units:
            K = 5
            offset = getattr(ibis, unit)(K)
            template = "{0}s_add({1}, {2})"

            cases.append((t + offset, template.format(unit, f, K)))
            cases.append((t - offset, template.format(unit, f, -K)))

        self._check_expr_cases(cases)

    def test_timestamp_literals(self):
        from pandas import Timestamp

        tv1 = "2015-01-01 12:34:56"
        ex1 = "'2015-01-01 12:34:56'"

        cases = [(L(Timestamp(tv1)), ex1), (L(Timestamp(tv1).to_pydatetime()), ex1), (ibis.timestamp(tv1), ex1)]
        self._check_expr_cases(cases)

    def test_timestamp_from_integer(self):
        col = self.table.c

        cases = [
            (
                col.to_timestamp(),
                'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") AS timestamp)',
            ),
            (
                col.to_timestamp("ms"),
                'CAST(from_unixtime(CAST(`c` / 1000 AS int), "yyyy-MM-dd HH:mm:ss") AS timestamp)',
            ),
            (
                col.to_timestamp("us"),
                'CAST(from_unixtime(CAST(`c` / 1000000 AS int), "yyyy-MM-dd HH:mm:ss") AS timestamp)',
            ),
        ]
        self._check_expr_cases(cases)

    def test_correlated_predicate_subquery(self):
        t0 = self.table
        t1 = t0.view()

        expr = t0.g == t1.g

        ctx = ImpalaContext()
        ctx.make_alias(t0)

        # Grab alias from parent context
        subctx = ctx.subcontext()
        subctx.make_alias(t1)
        subctx.make_alias(t0)

        result = self._translate(expr, context=subctx)
        expected = "t0.`g` = t1.`g`"
        assert result == expected

    def test_any_all(self):
        t = self.table

        bool_expr = t.f == 0

        cases = [
            (bool_expr.any(), "sum(`f` = 0) > 0"),
            (-bool_expr.any(), "sum(`f` = 0) = 0"),
            (bool_expr.all(), "sum(`f` = 0) = count(*)"),
            (-bool_expr.all(), "sum(`f` = 0) < count(*)"),
        ]
        self._check_expr_cases(cases)
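
A note on the harness shared by these examples: every `cases` list above runs
through the same table-driven helper from the ExprSQLTest mixin. A minimal
sketch of that pattern, with `translate` standing in for the suite's Impala
translator (the name and signature here are assumptions, not the real API):

def check_expr_cases(translate, cases, named=False):
    # Each case pairs an ibis expression with the SQL fragment it is
    # expected to compile to; a mismatch reports both strings.
    for expr, expected in cases:
        result = translate(expr, named=named)
        assert result == expected, (result, expected)
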
Example #48
0
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_numeric_unary_builtins(self):
        # No-argument functions
        functions = ['abs', 'ceil', 'floor', 'exp', 'sqrt', 'sign',
                     ('log', 'ln'),
                     ('approx_median', 'appx_median'),
                     ('approx_nunique', 'ndv'),
                     'ln', 'log2', 'log10', 'nullifzero', 'zeroifnull']

        cases = []
        for what in functions:
            if isinstance(what, tuple):
                ibis_name, sql_name = what
            else:
                ibis_name = sql_name = what

            for cname in ['double_col', 'int_col']:
                expr = getattr(self.table[cname], ibis_name)()
                cases.append((expr, '{0}({1})'.format(
                    sql_name, '`{0}`'.format(cname))))

        self._check_expr_cases(cases)

    def test_log_other_bases(self):
        cases = [
            (self.table.double_col.log(5), 'log(`double_col`, 5)')
        ]
        self._check_expr_cases(cases)

    def test_round(self):
        cases = [
            (self.table.double_col.round(), 'round(`double_col`)'),
            (self.table.double_col.round(0), 'round(`double_col`, 0)'),
            (self.table.double_col.round(2), 'round(`double_col`, 2)'),
            (self.table.double_col.round(self.table.tinyint_col),
             'round(`double_col`, `tinyint_col`)')
        ]
        self._check_expr_cases(cases)

    def test_hash(self):
        expr = self.table.int_col.hash()
        assert isinstance(expr, ir.Int64Array)
        assert isinstance(self.table.int_col.sum().hash(),
                          ir.Int64Scalar)

        cases = [
            (self.table.int_col.hash(), 'fnv_hash(`int_col`)')
        ]
        self._check_expr_cases(cases)

    def test_reduction_where(self):
        cond = self.table.bigint_col < 70
        c = self.table.double_col
        tmp = ('{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` '
               'ELSE NULL END)')
        cases = [
            (c.sum(where=cond), tmp.format('sum')),
            (c.count(where=cond), tmp.format('count')),
            (c.mean(where=cond), tmp.format('avg')),
            (c.max(where=cond), tmp.format('max')),
            (c.min(where=cond), tmp.format('min')),
            (c.std(where=cond), tmp.format('stddev')),
            (c.std(where=cond, how='pop'), tmp.format('stddev_pop')),
            (c.var(where=cond), tmp.format('variance')),
            (c.var(where=cond, how='pop'), tmp.format('variance_pop')),
        ]
        self._check_expr_cases(cases)

    def test_reduction_invalid_where(self):
        condbad_literal = L('T')
        c = self.table.double_col
        for reduction in [c.sum, c.count, c.mean, c.max, c.min]:
            with self.assertRaises(TypeError):
                reduction(where=condbad_literal)
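
The `where=` reductions above all share one SQL shape: the filter becomes a
CASE expression inside the aggregate, so filtered-out rows contribute NULL and
are ignored. A sketch of that rendering (`reduction_with_filter` is a
hypothetical helper, not part of ibis):

def reduction_with_filter(func, column, predicate):
    # sum/count/avg/... over only the rows satisfying `predicate`.
    return '{0}(CASE WHEN {1} THEN {2} ELSE NULL END)'.format(
        func, predicate, column)

assert (reduction_with_filter('sum', '`double_col`', '`bigint_col` < 70') ==
        'sum(CASE WHEN `bigint_col` < 70 THEN `double_col` ELSE NULL END)')
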
Example #49
0
class TestExprFormatting(unittest.TestCase):
    # Uncertain about how much we want to commit to unit tests around the
    # particulars of the output at the moment.

    def setUp(self):
        self.schema = [('a', 'int8'), ('b', 'int16'), ('c', 'int32'),
                       ('d', 'int64'), ('e', 'float'), ('f', 'double'),
                       ('g', 'string'), ('h', 'boolean')]
        self.schema_dict = dict(self.schema)
        self.table = ibis.table(self.schema)
        self.con = MockConnection()

    def test_format_table_column(self):
        # GH #507
        result = repr(self.table.f)
        assert 'Column[array(double)]' in result

    def test_format_projection(self):
        # This should produce a ref to the projection
        proj = self.table[['c', 'a', 'f']]
        repr(proj['a'])

    def test_table_type_output(self):
        foo = ibis.table([('job', 'string'), ('dept_id', 'string'),
                          ('year', 'int32'), ('y', 'double')], 'foo')

        expr = foo.dept_id == foo.view().dept_id
        result = repr(expr)
        assert 'SelfReference[table]' in result
        assert 'UnboundTable[table]' in result

    def test_memoize_aggregate_correctly(self):
        table = self.table

        agg_expr = (table['c'].sum() / table['c'].mean() - 1).name('analysis')
        agg_exprs = [
            table['a'].sum().name('sum(a)'), table['b'].mean().name('mean(b)'),
            agg_expr
        ]

        result = table.aggregate(agg_exprs, by=['g'])

        formatter = ExprFormatter(result)
        formatted = formatter.get_result()

        alias = formatter.memo.get_alias(table)
        assert formatted.count(alias) == 7

    def test_aggregate_arg_names(self):
        # Not sure how to test this *well*

        t = self.table

        by_exprs = [t.g.name('key1'), t.f.round().name('key2')]
        agg_exprs = [t.c.sum().name('c'), t.d.mean().name('d')]

        expr = self.table.group_by(by_exprs).aggregate(agg_exprs)
        result = repr(expr)
        assert 'metrics' in result
        assert 'by' in result

    def test_format_multiple_join_with_projection(self):
        # Star schema with fact table
        table = ibis.table([
            ('c', 'int32'),
            ('f', 'double'),
            ('foo_id', 'string'),
            ('bar_id', 'string'),
        ], 'one')

        table2 = ibis.table([('foo_id', 'string'), ('value1', 'double')],
                            'two')

        table3 = ibis.table([('bar_id', 'string'), ('value2', 'double')],
                            'three')

        filtered = table[table['f'] > 0]

        pred1 = filtered['foo_id'] == table2['foo_id']
        pred2 = filtered['bar_id'] == table3['bar_id']

        j1 = filtered.left_join(table2, [pred1])
        j2 = j1.inner_join(table3, [pred2])

        # Project out the desired fields
        view = j2[[filtered, table2['value1'], table3['value2']]]

        # it works!
        repr(view)

    def test_memoize_database_table(self):
        table = self.con.table('test1')
        table2 = self.con.table('test2')

        filter_pred = table['f'] > 0
        table3 = table[filter_pred]
        join_pred = table3['g'] == table2['key']

        joined = table2.inner_join(table3, [join_pred])

        met1 = (table3['f'] - table2['value']).mean().name('foo')
        result = joined.aggregate([met1, table3['f'].sum().name('bar')],
                                  by=[table3['g'], table2['key']])

        formatted = repr(result)
        assert formatted.count('test1') == 1
        assert formatted.count('test2') == 1

    def test_memoize_filtered_table(self):
        airlines = ibis.table([('dest', 'string'), ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())

        result = repr(delay_filter)
        assert result.count('Selection') == 1

    def test_memoize_insert_sort_key(self):
        table = self.con.table('airlines')

        t = table['arrdelay', 'dest']
        expr = (t.group_by('dest').mutate(dest_avg=t.arrdelay.mean(),
                                          dev=t.arrdelay - t.arrdelay.mean()))

        worst = (expr[expr.dev.notnull()].sort_by(ibis.desc('dev')).limit(10))

        result = repr(worst)
        assert result.count('airlines') == 1

    def test_named_value_expr_show_name(self):
        expr = self.table.f * 2
        expr2 = expr.name('baz')

        # it works!
        repr(expr)

        result2 = repr(expr2)

        # not really committing to a particular output yet
        assert 'baz' in result2

    def test_memoize_filtered_tables_in_join(self):
        # related: GH #667
        purchases = ibis.table([('region', 'string'), ('kind', 'string'),
                                ('user', 'int64'), ('amount', 'double')],
                               'purchases')

        metric = purchases.amount.sum().name('total')
        agged = (purchases.group_by(['region', 'kind']).aggregate(metric))

        left = agged[agged.kind == 'foo']
        right = agged[agged.kind == 'bar']

        cond = left.region == right.region
        joined = (left.join(right, cond)[left,
                                         right.total.name('right_total')])

        result = repr(joined)

        # Join, and one for each aggregation
        assert result.count('predicates') == 3
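
The memoization tests above hinge on one property: repeated references to the
same underlying table are printed once and referred to by a stable alias
afterwards, which is why the repr counts come out to exactly one occurrence
per table. A minimal sketch of that bookkeeping (names are hypothetical):

def assign_aliases(nodes):
    # The first occurrence of a node gets a fresh alias; later
    # occurrences reuse it instead of re-printing the node.
    aliases = {}
    for node in nodes:
        if id(node) not in aliases:
            aliases[id(node)] = 'ref_{0}'.format(len(aliases))
    return aliases
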
Example #50
0
class TestValueExprs(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

        self.int_cols = ['a', 'b', 'c', 'd']
        self.bool_cols = ['h']
        self.float_cols = ['e', 'f']

    def _check_literals(self, cases):
        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_string_literals(self):
        cases = [
            ('simple', "'simple'"),
            ('I can\'t', "'I can\\'t'"),
            ('An "escape"', "'An \"escape\"'")
        ]

        for value, expected in cases:
            lit_expr = L(value)
            result = self._translate(lit_expr)
            assert result == expected

    def test_decimal_builtins(self):
        t = self.con.table('tpch_lineitem')
        col = t.l_extendedprice
        cases = [
            (col.precision(), 'precision(`l_extendedprice`)'),
            (col.scale(), 'scale(`l_extendedprice`)'),
        ]
        self._check_expr_cases(cases)

    def test_number_boolean_literals(self):
        cases = [
            (5, '5'),
            (1.5, '1.5'),
            (True, 'TRUE'),
            (False, 'FALSE')
        ]
        self._check_literals(cases)

    def test_column_ref_table_aliases(self):
        context = ImpalaContext()

        table1 = ibis.table([
            ('key1', 'string'),
            ('value1', 'double')
        ])

        table2 = ibis.table([
            ('key2', 'string'),
            ('value and2', 'double')
        ])

        context.set_ref(table1, 't0')
        context.set_ref(table2, 't1')

        expr = table1['value1'] - table2['value and2']

        result = self._translate(expr, context=context)
        expected = 't0.`value1` - t1.`value and2`'
        assert result == expected

    def test_column_ref_quoting(self):
        schema = [('has a space', 'double')]
        table = ibis.table(schema)
        self._translate(table['has a space'], '`has a space`')

    def test_identifier_quoting(self):
        schema = [('date', 'double'), ('table', 'string')]
        table = ibis.table(schema)
        self._translate(table['date'], '`date`')
        self._translate(table['table'], '`table`')

    def test_named_expressions(self):
        a, b, g = self.table.get_columns(['a', 'b', 'g'])

        cases = [
            (g.cast('double').name('g_dub'), 'CAST(`g` AS double) AS `g_dub`'),
            (g.name('has a space'), '`g` AS `has a space`'),
            (((a - b) * a).name('expr'), '(`a` - `b`) * `a` AS `expr`')
        ]

        return self._check_expr_cases(cases, named=True)

    def test_binary_infix_operators(self):
        # For each function, verify that the generated code is what we expect
        a, b, h = self.table.get_columns(['a', 'b', 'h'])
        bool_col = a > 0

        cases = [
            (a + b, '`a` + `b`'),
            (a - b, '`a` - `b`'),
            (a * b, '`a` * `b`'),
            (a / b, '`a` / `b`'),
            (a ** b, 'pow(`a`, `b`)'),
            (a < b, '`a` < `b`'),
            (a <= b, '`a` <= `b`'),
            (a > b, '`a` > `b`'),
            (a >= b, '`a` >= `b`'),
            (a == b, '`a` = `b`'),
            (a != b, '`a` != `b`'),
            (h & bool_col, '`h` AND (`a` > 0)'),
            (h | bool_col, '`h` OR (`a` > 0)'),
            # xor is brute force
            (h ^ bool_col, '(`h` OR (`a` > 0)) AND NOT (`h` AND (`a` > 0))')
        ]
        self._check_expr_cases(cases)

    def test_binary_infix_parenthesization(self):
        a, b, c = self.table.get_columns(['a', 'b', 'c'])

        cases = [
            ((a + b) + c, '(`a` + `b`) + `c`'),
            (a.log() + c, 'ln(`a`) + `c`'),
            (b + (-(a + c)), '`b` + (-(`a` + `c`))')
        ]

        self._check_expr_cases(cases)

    def test_between(self):
        cases = [
            (self.table.f.between(0, 1), '`f` BETWEEN 0 AND 1')
        ]
        self._check_expr_cases(cases)

    def test_isnull_notnull(self):
        cases = [
            (self.table['g'].isnull(), '`g` IS NULL'),
            (self.table['a'].notnull(), '`a` IS NOT NULL'),
            ((self.table['a'] + self.table['b']).isnull(),
             '`a` + `b` IS NULL')
        ]
        self._check_expr_cases(cases)

    def test_casts(self):
        a, d, g = self.table.get_columns(['a', 'd', 'g'])
        cases = [
            (a.cast('int16'), 'CAST(`a` AS smallint)'),
            (a.cast('int32'), 'CAST(`a` AS int)'),
            (a.cast('int64'), 'CAST(`a` AS bigint)'),
            (a.cast('float'), 'CAST(`a` AS float)'),
            (a.cast('double'), 'CAST(`a` AS double)'),
            (a.cast('string'), 'CAST(`a` AS string)'),
            (d.cast('int8'), 'CAST(`d` AS tinyint)'),
            (g.cast('double'), 'CAST(`g` AS double)'),
            (g.cast('timestamp'), 'CAST(`g` AS timestamp)')
        ]
        self._check_expr_cases(cases)

    def test_misc_conditionals(self):
        a = self.table.a
        cases = [
            (a.nullif(0), 'nullif(`a`, 0)')
        ]
        self._check_expr_cases(cases)

    def test_decimal_casts(self):
        cases = [
            (L('9.9999999').cast('decimal(38,5)'),
             "CAST('9.9999999' AS decimal(38,5))"),
            (self.table.f.cast('decimal(12,2)'), "CAST(`f` AS decimal(12,2))")
        ]
        self._check_expr_cases(cases)

    def test_negate(self):
        cases = [
            (-self.table['a'], '-`a`'),
            (-self.table['f'], '-`f`'),
            (-self.table['h'], 'NOT `h`')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_extract_field(self):
        fields = ['year', 'month', 'day', 'hour', 'minute',
                  'second', 'millisecond']

        cases = [(getattr(self.table.i, field)(),
                  "extract(`i`, '{0}')".format(field))
                 for field in fields]
        self._check_expr_cases(cases)

        # integration with SQL translation
        expr = self.table[self.table.i.year().name('year'),
                          self.table.i.month().name('month'),
                          self.table.i.day().name('day')]

        result = to_sql(expr)
        expected = \
            """SELECT extract(`i`, 'year') AS `year`, extract(`i`, 'month') AS `month`,
       extract(`i`, 'day') AS `day`
FROM alltypes"""
        assert result == expected

    def test_timestamp_now(self):
        cases = [
            (ibis.now(), 'now()')
        ]
        self._check_expr_cases(cases)

    def test_timestamp_deltas(self):
        units = ['year', 'month', 'week', 'day',
                 'hour', 'minute', 'second',
                 'millisecond', 'microsecond']

        t = self.table.i
        f = '`i`'

        cases = []
        for unit in units:
            K = 5
            offset = getattr(ibis, unit)(K)
            template = '{0}s_add({1}, {2})'

            cases.append((t + offset, template.format(unit, f, K)))
            cases.append((t - offset, template.format(unit, f, -K)))

        self._check_expr_cases(cases)

    def test_timestamp_literals(self):
        from pandas import Timestamp

        tv1 = '2015-01-01 12:34:56'
        ex1 = "'2015-01-01 12:34:56'"

        cases = [
            (L(Timestamp(tv1)), ex1),
            (L(Timestamp(tv1).to_pydatetime()), ex1),
            (ibis.timestamp(tv1), ex1)
        ]
        self._check_expr_cases(cases)

    def test_timestamp_from_integer(self):
        col = self.table.c

        cases = [
            (col.to_timestamp(),
             'CAST(from_unixtime(`c`, "yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('ms'),
             'CAST(from_unixtime(CAST(`c` / 1000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
            (col.to_timestamp('us'),
             'CAST(from_unixtime(CAST(`c` / 1000000 AS int), '
             '"yyyy-MM-dd HH:mm:ss") '
             'AS timestamp)'),
        ]
        self._check_expr_cases(cases)

    def test_correlated_predicate_subquery(self):
        t0 = self.table
        t1 = t0.view()

        expr = t0.g == t1.g

        ctx = ImpalaContext()
        ctx.make_alias(t0)

        # Grab alias from parent context
        subctx = ctx.subcontext()
        subctx.make_alias(t1)
        subctx.make_alias(t0)

        result = self._translate(expr, context=subctx)
        expected = "t0.`g` = t1.`g`"
        assert result == expected

    def test_any_all(self):
        t = self.table

        bool_expr = t.f == 0

        cases = [
            (bool_expr.any(), 'sum(`f` = 0) > 0'),
            (-bool_expr.any(), 'sum(`f` = 0) = 0'),
            (bool_expr.all(), 'sum(`f` = 0) = count(*)'),
            (-bool_expr.all(), 'sum(`f` = 0) < count(*)'),
        ]
        self._check_expr_cases(cases)
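
The any/all rewrites that close this example reduce boolean aggregation to
arithmetic on a summed match count, which is why no ANY/ALL keyword appears in
the generated SQL. A sketch of the mapping (`any_all_sql` is a hypothetical
name):

def any_all_sql(pred, kind):
    counted = 'sum({0})'.format(pred)
    # any: at least one match; all: every row matches; negations invert.
    return {
        'any': counted + ' > 0',
        'not_any': counted + ' = 0',
        'all': counted + ' = count(*)',
        'not_all': counted + ' < count(*)',
    }[kind]

assert any_all_sql('`f` = 0', 'any') == 'sum(`f` = 0) > 0'
assert any_all_sql('`f` = 0', 'not_all') == 'sum(`f` = 0) < count(*)'
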
Example #51
0
class TestFixedOffsets(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_upconvert(self):
        cases = [
            (T.day(14), 'w', T.week(2)),
            (T.hour(72), 'd', T.day(3)),
            (T.minute(240), 'h', T.hour(4)),
            (T.second(360), 'm', T.minute(6)),
            (T.second(3 * 86400), 'd', T.day(3)),
            (T.millisecond(5000), 's', T.second(5)),
            (T.microsecond(5000000), 's', T.second(5)),
            (T.nanosecond(5000000000), 's', T.second(5)),
        ]

        for offset, unit, expected in cases:
            result = offset.to_unit(unit)
            assert result.equals(expected)

    def test_multiply(self):
        offset = T.day(2)

        assert (offset * 2).equals(T.day(4))
        assert (offset * (-2)).equals(T.day(-4))
        assert (3 * offset).equals(T.day(6))
        assert ((-3) * offset).equals(T.day(-6))

    def test_repr(self):
        assert repr(T.day()) == '<Timedelta: 1 day>'
        assert repr(T.day(2)) == '<Timedelta: 2 days>'
        assert repr(T.year()) == '<Timedelta: 1 year>'
        assert repr(T.month(2)) == '<Timedelta: 2 months>'
        assert repr(T.second(40)) == '<Timedelta: 40 seconds>'

    def test_cannot_upconvert(self):
        cases = [
            (T.day(), 'w'),
            (T.hour(), 'd'),
            (T.minute(), 'h'),
            (T.second(), 'm'),
            (T.second(), 'd'),
            (T.millisecond(), 's'),
            (T.microsecond(), 's'),
            (T.nanosecond(), 's'),
        ]

        for delta, target in cases:
            self.assertRaises(IbisError, delta.to_unit, target)

    def test_downconvert_second_parts(self):
        K = 2

        sec = T.second(K)
        milli = T.millisecond(K)
        micro = T.microsecond(K)
        nano = T.nanosecond(K)

        cases = [(sec.to_unit('s'), T.second(K)),
                 (sec.to_unit('ms'), T.millisecond(K * 1000)),
                 (sec.to_unit('us'), T.microsecond(K * 1000000)),
                 (sec.to_unit('ns'), T.nanosecond(K * 1000000000)),
                 (milli.to_unit('ms'), T.millisecond(K)),
                 (milli.to_unit('us'), T.microsecond(K * 1000)),
                 (milli.to_unit('ns'), T.nanosecond(K * 1000000)),
                 (micro.to_unit('us'), T.microsecond(K)),
                 (micro.to_unit('ns'), T.nanosecond(K * 1000)),
                 (nano.to_unit('ns'), T.nanosecond(K))]
        self._check_cases(cases)

    def test_downconvert_hours(self):
        K = 2
        offset = T.hour(K)

        cases = [(offset.to_unit('h'), T.hour(K)),
                 (offset.to_unit('m'), T.minute(K * 60)),
                 (offset.to_unit('s'), T.second(K * 3600)),
                 (offset.to_unit('ms'), T.millisecond(K * 3600000)),
                 (offset.to_unit('us'), T.microsecond(K * 3600000000)),
                 (offset.to_unit('ns'), T.nanosecond(K * 3600000000000))]
        self._check_cases(cases)

    def test_downconvert_day(self):
        K = 2

        week = T.week(K)
        day = T.day(K)

        cases = [(week.to_unit('d'), T.day(K * 7)),
                 (week.to_unit('h'), T.hour(K * 7 * 24)),
                 (day.to_unit('d'), T.day(K)),
                 (day.to_unit('h'), T.hour(K * 24)),
                 (day.to_unit('m'), T.minute(K * 1440)),
                 (day.to_unit('s'), T.second(K * 86400)),
                 (day.to_unit('ms'), T.millisecond(K * 86400000)),
                 (day.to_unit('us'), T.microsecond(K * 86400000000)),
                 (day.to_unit('ns'), T.nanosecond(K * 86400000000000))]
        self._check_cases(cases)

    def test_combine_with_different_kinds(self):
        cases = [(T.day() + T.minute(), T.minute(1441)),
                 (T.second() + T.millisecond(10), T.millisecond(1010)),
                 (T.hour() + T.minute(5) + T.second(10), T.second(3910))]
        self._check_cases(cases)

    def test_timedelta_generic_api(self):
        cases = [
            (T.timedelta(weeks=2), T.week(2)),
            (T.timedelta(days=3), T.day(3)),
            (T.timedelta(hours=4), T.hour(4)),
            (T.timedelta(minutes=5), T.minute(5)),
            (T.timedelta(seconds=6), T.second(6)),
            (T.timedelta(milliseconds=7), T.millisecond(7)),
            (T.timedelta(microseconds=8), T.microsecond(8)),
            (T.timedelta(nanoseconds=9), T.nanosecond(9)),
        ]
        self._check_cases(cases)

    def _check_cases(self, cases):
        for x, y in cases:
            assert x.equals(y)

    def test_offset_timestamp_expr(self):
        c = self.table.i
        x = T.timedelta(days=1)

        expr = x + c
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)

        # test radd
        expr = c + x
        assert isinstance(expr, ir.TimestampColumn)
        assert isinstance(expr.op(), ops.TimestampDelta)
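
The up- and down-conversions in this example all follow from fixed unit
factors (a week is treated as exactly 7 days; year and month have no fixed
length and are excluded from conversion). A sketch of the rule, assuming
nanosecond-based factors:

# Factors in nanoseconds, so any pair of fixed units can be compared.
_NANOS = {
    'w': 7 * 86400 * 10 ** 9,
    'd': 86400 * 10 ** 9,
    'h': 3600 * 10 ** 9,
    'm': 60 * 10 ** 9,
    's': 10 ** 9,
    'ms': 10 ** 6,
    'us': 10 ** 3,
    'ns': 1,
}

def convert(count, unit, target):
    total = count * _NANOS[unit]
    if total % _NANOS[target]:
        # mirrors test_cannot_upconvert: lossy conversions are refused
        raise ValueError('cannot convert without remainder')
    return total // _NANOS[target]

assert convert(72, 'h', 'd') == 3
assert convert(5000, 'ms', 's') == 5
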
Example #52
0
class UDFTest(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

        self.i8 = self.table.tinyint_col
        self.i16 = self.table.smallint_col
        self.i32 = self.table.int_col
        self.i64 = self.table.bigint_col
        self.d = self.table.double_col
        self.f = self.table.float_col
        self.s = self.table.string_col
        self.b = self.table.bool_col
        self.t = self.table.timestamp_col
        self.dec = self.con.table('tpch_customer').c_acctbal
        self.all_cols = [self.i8, self.i16, self.i32, self.i64, self.d,
                         self.f, self.dec, self.s, self.b, self.t]

    def test_sql_generation(self):
        op = udf.scalar_function(['string'], 'string', name='Tester')
        udf.add_impala_operation(op, 'identity', 'udf_testing')

        def _identity_test(value):
            return op(value).to_expr()
        result = _identity_test('hello world')
        assert result == "SELECT udf_testing.identity('hello world')"

    def test_sql_generation_from_infoclass(self):
        udf_info = udf.UDFCreator('test.so', ['string'], 'string', 'info_test')
        op = udf_info.to_operation()
        udf.add_impala_operation(op, 'info_test', 'udf_testing')
        assert op in _operation_registry

        def _infoclass_test(value):
            return op(value).to_expr()
        result = _infoclass_test('hello world')

        assert result == "SELECT udf_testing.info_test('hello world')"

    def test_boolean_wrapping(self):
        func = self._udf_registration_single_input('boolean',
                                                   'boolean',
                                                   'test')
        expr = func(True)
        assert type(expr) == ir.BooleanScalar
        expr = func(self.b)
        assert type(expr) == ir.BooleanArray

    def test_tinyint_wrapping(self):
        func = self._udf_registration_single_input('int8',
                                                   'int8',
                                                   'test')
        expr = func(1)
        assert type(expr) == ir.Int8Scalar
        expr = func(self.i8)
        assert type(expr) == ir.Int8Array

    def test_smallint_wrapping(self):
        func = self._udf_registration_single_input('int16',
                                                   'int16',
                                                   'test')
        expr = func(1)
        assert type(expr) == ir.Int16Scalar
        expr = func(self.i16)
        assert type(expr) == ir.Int16Array

    def test_int_wrapping(self):
        func = self._udf_registration_single_input('int32',
                                                   'int32',
                                                   'test')
        expr = func(1)
        assert type(expr) == ir.Int32Scalar
        expr = func(self.i32)
        assert type(expr) == ir.Int32Array

    def test_bigint_wrapping(self):
        func = self._udf_registration_single_input('int64',
                                                   'int64',
                                                   'test')
        expr = func(1)
        assert type(expr) == ir.Int64Scalar
        expr = func(self.i64)
        assert type(expr) == ir.Int64Array

    def test_float_wrapping(self):
        func = self._udf_registration_single_input('float',
                                                   'float',
                                                   'test')
        expr = func(1.0)
        assert type(expr) == ir.FloatScalar
        expr = func(self.f)
        assert type(expr) == ir.FloatArray

    def test_double_wrapping(self):
        func = self._udf_registration_single_input('double',
                                                   'double',
                                                   'test')
        expr = func(1.0)
        assert type(expr) == ir.DoubleScalar
        expr = func(self.d)
        assert type(expr) == ir.DoubleArray

    def test_decimal_wrapping(self):
        func = self._udf_registration_single_input('decimal(9,0)',
                                                   'decimal(9,0)',
                                                   'test')
        expr = func(1.0)
        assert type(expr) == ir.DecimalScalar
        expr = func(self.dec)
        assert type(expr) == ir.DecimalArray

    def test_string_wrapping(self):
        func = self._udf_registration_single_input('string',
                                                   'string',
                                                   'test')
        expr = func('1')
        assert type(expr) == ir.StringScalar
        expr = func(self.s)
        assert type(expr) == ir.StringArray

    def test_timestamp_wrapping(self):
        func = self._udf_registration_single_input('timestamp',
                                                   'timestamp',
                                                   'test')
        expr = func(ibis.timestamp('1961-04-10'))
        assert type(expr) == ir.TimestampScalar
        expr = func(self.t)
        assert type(expr) == ir.TimestampArray

    def test_invalid_typecasting_tinyint(self):
        self._invalid_typecasts('int8', self.all_cols[1:])

    def test_invalid_typecasting_smallint(self):
        self._invalid_typecasts('int16', self.all_cols[2:])

    def test_invalid_typecasting_int(self):
        self._invalid_typecasts('int32', self.all_cols[3:])

    def test_invalid_typecasting_bigint(self):
        self._invalid_typecasts('int64', self.all_cols[4:])

    def test_invalid_typecasting_boolean(self):
        self._invalid_typecasts('boolean', self.all_cols[:8] +
                                self.all_cols[9:])

    def test_invalid_typecasting_float(self):
        self._invalid_typecasts('float', self.all_cols[:4] +
                                self.all_cols[6:])

    def test_invalid_typecasting_double(self):
        self._invalid_typecasts('double', self.all_cols[:4] +
                                self.all_cols[6:])

    def test_invalid_typecasting_string(self):
        self._invalid_typecasts('string', self.all_cols[:7] +
                                self.all_cols[8:])

    def test_invalid_typecasting_timestamp(self):
        self._invalid_typecasts('timestamp', self.all_cols[:-1])

    def test_invalid_typecasting_decimal(self):
        self._invalid_typecasts('decimal', self.all_cols[:4] +
                                self.all_cols[7:])

    def test_mult_args(self):
        op = self._udf_registration(['int32', 'double', 'string',
                                     'boolean', 'timestamp'],
                                    'int64', 'mult_types')

        def _func(integer, double, string, boolean, timestamp):
            return op(integer, double, string, boolean, timestamp).to_expr()

        expr = _func(self.i32, self.d, self.s, self.b, self.t)
        assert issubclass(type(expr), ir.ArrayExpr)

        expr = _func(1, 1.0, 'a', True, ibis.timestamp('1961-04-10'))
        assert issubclass(type(expr), ir.ScalarExpr)

    def _udf_registration_single_input(self, inputs, output, name):
        op = self._udf_registration([inputs], output, name)

        def _test_func(value):
            return op(value).to_expr()
        return _test_func

    def _udf_registration(self, inputs, output, name):
        op = udf.scalar_function(inputs, output, name=name)
        assert issubclass(op, ValueOp)
        udf.add_impala_operation(op, name, 'ibis_testing')
        return op

    def _invalid_typecasts(self, inputs, invalid_casts):
        func = self._udf_registration_single_input(inputs,
                                                   'int32',
                                                   'typecast')
        for in_type in invalid_casts:
            self.assertRaises(IbisTypeError, func, in_type)
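
The `_invalid_typecasts` checks above encode an implicit-cast rule: a UDF
argument is accepted only when its type can widen to the declared input type.
A rough sketch of the acceptance table implied by the tests (an approximation
only; the real rules live in ibis's type system):

_CASTABLE = {
    'int8': ['int8'],
    'int16': ['int8', 'int16'],
    'int32': ['int8', 'int16', 'int32'],
    'int64': ['int8', 'int16', 'int32', 'int64'],
    'float': ['float', 'double'],
    'double': ['float', 'double'],
    'decimal': ['float', 'double', 'decimal'],
}

def accepts(declared, actual):
    # string, boolean, and timestamp only match themselves in this sketch.
    return actual in _CASTABLE.get(declared, [declared])

assert accepts('int64', 'int8')
assert not accepts('int8', 'int16')
assert accepts('decimal', 'double')
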
Example #53
0
class TestCaseExprs(unittest.TestCase, ExprSQLTest, ExprTestCases):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('alltypes')

    def test_isnull_1_0(self):
        expr = self.table.g.isnull().ifelse(1, 0)

        result = self._translate(expr)
        expected = 'CASE WHEN `g` IS NULL THEN 1 ELSE 0 END'
        assert result == expected

        # inside some other function
        result = self._translate(expr.sum())
        expected = 'sum(CASE WHEN `g` IS NULL THEN 1 ELSE 0 END)'
        assert result == expected

    def test_simple_case(self):
        expr = self._case_simple_case()
        result = self._translate(expr)
        expected = """CASE `g`
  WHEN 'foo' THEN 'bar'
  WHEN 'baz' THEN 'qux'
  ELSE 'default'
END"""
        assert result == expected

    def test_search_case(self):
        expr = self._case_search_case()
        result = self._translate(expr)
        expected = """CASE
  WHEN `f` > 0 THEN `d` * 2
  WHEN `c` < 0 THEN `a` * 2
  ELSE NULL
END"""
        assert result == expected

    def test_where_use_if(self):
        expr = ibis.where(self.table.f > 0, self.table.e, self.table.a)
        assert isinstance(expr, ir.FloatValue)

        result = self._translate(expr)
        expected = "if(`f` > 0, `e`, `a`)"
        assert result == expected

    def test_nullif_ifnull(self):
        table = self.con.table('tpch_lineitem')

        f = table.l_quantity

        cases = [
            (f.nullif(f == 0),
             'nullif(`l_quantity`, `l_quantity` = 0)'),
            (f.fillna(0),
             'isnull(`l_quantity`, CAST(0 AS decimal(12,2)))'),
        ]
        self._check_expr_cases(cases)

    def test_decimal_fillna_cast_arg(self):
        table = self.con.table('tpch_lineitem')
        f = table.l_extendedprice

        cases = [
            (f.fillna(0),
             'isnull(`l_extendedprice`, CAST(0 AS decimal(12,2)))'),
            (f.fillna(0.0), 'isnull(`l_extendedprice`, 0.0)'),
        ]
        self._check_expr_cases(cases)
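
The CASE tests above pin down not just the clauses but the exact multi-line
layout the translator emits. A sketch of that rendering (`simple_case_sql` is
a hypothetical helper):

def simple_case_sql(base, whens, default):
    lines = ['CASE {0}'.format(base)]
    for cond, value in whens:
        lines.append('  WHEN {0} THEN {1}'.format(cond, value))
    lines.append('  ELSE {0}'.format(default))
    lines.append('END')
    return '\n'.join(lines)

assert simple_case_sql('`g`', [("'foo'", "'bar'"), ("'baz'", "'qux'")],
                       "'default'") == """CASE `g`
  WHEN 'foo' THEN 'bar'
  WHEN 'baz' THEN 'qux'
  ELSE 'default'
END"""
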
Example #54
0
class TestStringBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_unary_ops(self):
        s = self.table.string_col
        cases = [
            (s.lower(), 'lower(`string_col`)'),
            (s.upper(), 'upper(`string_col`)'),
            (s.reverse(), 'reverse(`string_col`)'),
            (s.strip(), 'trim(`string_col`)'),
            (s.lstrip(), 'ltrim(`string_col`)'),
            (s.rstrip(), 'rtrim(`string_col`)'),
            (s.capitalize(), 'initcap(`string_col`)'),
            (s.length(), 'length(`string_col`)'),
            (s.ascii_str(), 'ascii(`string_col`)')
        ]
        self._check_expr_cases(cases)

    def test_substr(self):
        # Databases number string positions starting from 1
        cases = [
            (self.table.string_col.substr(2), 'substr(`string_col`, 2 + 1)'),
            (self.table.string_col.substr(0, 3),
             'substr(`string_col`, 0 + 1, 3)')
        ]
        self._check_expr_cases(cases)

    def test_strright(self):
        cases = [
            (self.table.string_col.right(4), 'strright(`string_col`, 4)')
        ]
        self._check_expr_cases(cases)

    def test_like(self):
        cases = [
            (self.table.string_col.like('foo%'), "`string_col` LIKE 'foo%'")
        ]
        self._check_expr_cases(cases)

    def test_rlike(self):
        ex = r"`string_col` RLIKE '[\d]+'"
        cases = [
            (self.table.string_col.rlike(r'[\d]+'), ex),
            (self.table.string_col.re_search(r'[\d]+'), ex),
        ]
        self._check_expr_cases(cases)

    def test_re_extract(self):
        sql = r"regexp_extract(`string_col`, '[\d]+', 0)"
        cases = [
            (self.table.string_col.re_extract(r'[\d]+', 0), sql)
        ]
        self._check_expr_cases(cases)

    def test_re_replace(self):
        sql = r"regexp_replace(`string_col`, '[\d]+', 'aaa')"
        cases = [
            (self.table.string_col.re_replace(r'[\d]+', 'aaa'), sql)
        ]
        self._check_expr_cases(cases)

    def test_parse_url(self):
        sql = "parse_url(`string_col`, 'HOST')"
        cases = [
            (self.table.string_col.parse_url('HOST'), sql)
        ]
        self._check_expr_cases(cases)

    def test_repeat(self):
        cases = [
            (self.table.string_col.repeat(2), 'repeat(`string_col`, 2)')
        ]
        self._check_expr_cases(cases)

    def test_translate(self):
        cases = [
            (self.table.string_col.translate('a', 'b'),
             "translate(`string_col`, 'a', 'b')")
        ]
        self._check_expr_cases(cases)

    def test_find(self):
        s = self.table.string_col
        i1 = self.table.tinyint_col
        cases = [
            (s.find('a'), "locate('a', `string_col`) - 1"),
            (s.find('a', 2), "locate('a', `string_col`, 3) - 1"),
            (s.find('a', start=i1),
             "locate('a', `string_col`, `tinyint_col` + 1) - 1")
        ]
        self._check_expr_cases(cases)

    def test_lpad(self):
        cases = [
            (self.table.string_col.lpad(1, 'a'), "lpad(`string_col`, 1, 'a')"),
            (self.table.string_col.lpad(25), "lpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_rpad(self):
        cases = [
            (self.table.string_col.rpad(1, 'a'), "rpad(`string_col`, 1, 'a')"),
            (self.table.string_col.rpad(25), "rpad(`string_col`, 25, ' ')")
        ]
        self._check_expr_cases(cases)

    def test_find_in_set(self):
        cases = [
            (self.table.string_col.find_in_set(['a']),
             "find_in_set(`string_col`, 'a') - 1"),
            (self.table.string_col.find_in_set(['a', 'b']),
             "find_in_set(`string_col`, 'a,b') - 1")
        ]
        self._check_expr_cases(cases)

    def test_string_join(self):
        cases = [
            (L(',').join(['a', 'b']), "concat_ws(',', 'a', 'b')")
        ]
        self._check_expr_cases(cases)
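
A recurring detail above is the off-by-one between ibis's 0-based string
positions and the database's 1-based functions (substr, locate, find_in_set).
A sketch of the adjustment for substr (`substr_sql` is a hypothetical name):

def substr_sql(column, start, length=None):
    # ibis positions are 0-based; the backend's substr() is 1-based, so
    # the translator emits the position as "start + 1".
    if length is None:
        return 'substr({0}, {1} + 1)'.format(column, start)
    return 'substr({0}, {1} + 1, {2})'.format(column, start, length)

assert substr_sql('`string_col`', 2) == 'substr(`string_col`, 2 + 1)'
assert substr_sql('`string_col`', 0, 3) == 'substr(`string_col`, 0 + 1, 3)'
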
Example #56
0
class TestTimestamp(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('alltypes')
        self.col = self.alltypes.i

    def test_field_select(self):
        assert isinstance(self.col, ir.TimestampArray)

    def test_string_cast_to_timestamp(self):
        casted = self.alltypes.g.cast('timestamp')
        assert isinstance(casted, ir.TimestampArray)

        string = api.literal('2000-01-01')
        casted = string.cast('timestamp')
        assert isinstance(casted, ir.TimestampScalar)

    def test_extract_fields(self):
        # type-size may be database specific
        cases = [
            ('year', ops.ExtractYear, ir.Int32Array),
            ('month', ops.ExtractMonth, ir.Int32Array),
            ('day', ops.ExtractDay, ir.Int32Array),
            ('hour', ops.ExtractHour, ir.Int32Array),
            ('minute', ops.ExtractMinute, ir.Int32Array),
            ('second', ops.ExtractSecond, ir.Int32Array),
            ('millisecond', ops.ExtractMillisecond, ir.Int32Array),
        ]

        for attr, ex_op, ex_type in cases:
            result = getattr(self.col, attr)()
            assert result.get_name() == attr
            assert isinstance(result, ex_type)
            assert isinstance(result.op(), ex_op)

    def test_now(self):
        result = api.now()
        assert isinstance(result, ir.TimestampScalar)
        assert isinstance(result.op(), ops.TimestampNow)

    def test_timestamp_literals(self):
        ts_str = '2015-01-01 00:00:00'
        val = pd.Timestamp(ts_str)

        expr = ibis.literal(val)
        assert isinstance(expr, ir.TimestampScalar)

        expr = ibis.timestamp(ts_str)
        assert isinstance(expr, ir.TimestampScalar)

        self.assertRaises(ValueError, ibis.timestamp, '2015-01-01 00:71')

    def test_integer_to_timestamp(self):
        # #246
        pass

    def test_comparison_timestamp(self):
        expr = self.col > (self.col.min() + ibis.day(3))
        assert isinstance(expr, ir.BooleanArray)

    def test_comparisons_string(self):
        val = '2015-01-01 00:00:00'
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)

        expr2 = val < self.col
        op = expr2.op()
        assert isinstance(op, ops.Greater)
        assert isinstance(op.right, ir.TimestampScalar)

    def test_comparisons_pandas_timestamp(self):
        val = pd.Timestamp('2015-01-01 00:00:00')
        expr = self.col > val
        op = expr.op()
        assert isinstance(op.right, ir.TimestampScalar)
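
The comparison tests above rely on operand coercion: a string (or pandas
Timestamp) compared against a timestamp column is promoted to a timestamp
literal first, and a reflected comparison such as `val < col` is rebuilt as
`col > val`, which is why the op comes back as Greater. A minimal sketch of
the promotion, assuming one accepted format (the real parser is more
permissive):

from datetime import datetime

def coerce_timestamp_operand(value):
    if isinstance(value, str):
        return datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
    return value

assert coerce_timestamp_operand('2015-01-01 00:00:00') == datetime(2015, 1, 1)
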
Example #57
0
class TestSelectSQL(unittest.TestCase):

    def setUp(self):
        self.con = MockConnection()

    def test_nameless_table(self):
        # Ensure that the user gets some kind of sensible error
        nameless = api.table([('key', 'string')])
        self.assertRaises(com.RelationError, to_sql, nameless)

        with_name = api.table([('key', 'string')], name='baz')
        result = to_sql(with_name)
        assert result == 'SELECT *\nFROM baz'

    def test_physical_table_reference_translate(self):
        # If all of an expression's table leaves reference database tables,
        # verify we translate correctly
        table = self.con.table('alltypes')

        query = _get_query(table)
        sql_string = query.compile()
        expected = "SELECT *\nFROM alltypes"
        assert sql_string == expected

    def test_simple_join_formatting(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        pred = t1['foo_id'] == t2['foo_id']
        pred2 = t1['bar_id'] == t2['foo_id']
        cases = [
            (t1.inner_join(t2, [pred])[[t1]],
             """SELECT t0.*
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            (t1.left_join(t2, [pred])[[t1]],
             """SELECT t0.*
FROM star1 t0
  LEFT OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            (t1.outer_join(t2, [pred])[[t1]],
             """SELECT t0.*
FROM star1 t0
  FULL OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""),
            # multiple predicates
            (t1.inner_join(t2, [pred, pred2])[[t1]],
             """SELECT t0.*
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id` AND
       t0.`bar_id` = t1.`foo_id`"""),
        ]

        for expr, expected_sql in cases:
            result_sql = to_sql(expr)
            assert result_sql == expected_sql

    def test_multiple_join_cases(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')
        t3 = self.con.table('star3')

        predA = t1['foo_id'] == t2['foo_id']
        predB = t1['bar_id'] == t3['bar_id']

        what = (t1.left_join(t2, [predA])
                .inner_join(t3, [predB])
                .projection([t1, t2['value1'], t3['value2']]))
        result_sql = to_sql(what)
        expected_sql = """SELECT t0.*, t1.`value1`, t2.`value2`
FROM star1 t0
  LEFT OUTER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`
  INNER JOIN star3 t2
    ON t0.`bar_id` = t2.`bar_id`"""
        assert result_sql == expected_sql

    def test_join_between_joins(self):
        t1 = api.table([
            ('key1', 'string'),
            ('key2', 'string'),
            ('value1', 'double'),
        ], 'first')

        t2 = api.table([
            ('key1', 'string'),
            ('value2', 'double'),
        ], 'second')

        t3 = api.table([
            ('key2', 'string'),
            ('key3', 'string'),
            ('value3', 'double'),
        ], 'third')

        t4 = api.table([
            ('key3', 'string'),
            ('value4', 'double')
        ], 'fourth')

        left = t1.inner_join(t2, [('key1', 'key1')])[t1, t2.value2]
        right = t3.inner_join(t4, [('key3', 'key3')])[t3, t4.value4]

        joined = left.inner_join(right, [('key2', 'key2')])

        # At one point, the expression simplification was resulting in bad refs
        # here (right.value3 referencing the table inside the right join)
        exprs = [left, right.value3, right.value4]
        projected = joined.projection(exprs)

        result = to_sql(projected)
        expected = """SELECT t0.*, t1.`value3`, t1.`value4`
FROM (
  SELECT t2.*, t3.`value2`
  FROM `first` t2
    INNER JOIN second t3
      ON t2.`key1` = t3.`key1`
) t0
  INNER JOIN (
    SELECT t2.*, t3.`value4`
    FROM third t2
      INNER JOIN fourth t3
        ON t2.`key3` = t3.`key3`
  ) t1
    ON t0.`key2` = t1.`key2`"""
        assert result == expected

    def test_join_just_materialized(self):
        t1 = self.con.table('tpch_nation')
        t2 = self.con.table('tpch_region')
        t3 = self.con.table('tpch_customer')

        # GH #491
        joined = (t1.inner_join(t2, t1.n_regionkey == t2.r_regionkey)
                  .inner_join(t3, t1.n_nationkey == t3.c_nationkey))
        result = to_sql(joined)
        expected = """SELECT *
FROM tpch_nation t0
  INNER JOIN tpch_region t1
    ON t0.`n_regionkey` = t1.`r_regionkey`
  INNER JOIN tpch_customer t2
    ON t0.`n_nationkey` = t2.`c_nationkey`"""
        assert result == expected

        result = to_sql(joined.materialize())
        assert result == expected

    def test_join_no_predicates_for_impala(self):
        # Impala requires that joins without predicates be written explicitly
        # as CROSS JOIN, since result sets can accidentally get too large if a
        # query is executed before predicates are written
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined2 = t1.cross_join(t2)[[t1]]

        expected = """SELECT t0.*
FROM star1 t0
  CROSS JOIN star2 t1"""
        result2 = to_sql(joined2)
        assert result2 == expected

        for jtype in ['inner_join', 'left_join', 'outer_join']:
            joined = getattr(t1, jtype)(t2)[[t1]]

            result = to_sql(joined)
            assert result == expected
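
        # With no ON clause there is nothing to distinguish the join types,
        # so the translator normalizes inner/left/outer joins without
        # predicates to the same explicit CROSS JOIN asserted above.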

    def test_semi_anti_joins(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined = t1.semi_join(t2, [t1.foo_id == t2.foo_id])[[t1]]

        result = to_sql(joined)
        expected = """SELECT t0.*
FROM star1 t0
  LEFT SEMI JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

        joined = t1.anti_join(t2, [t1.foo_id == t2.foo_id])[[t1]]
        result = to_sql(joined)
        expected = """SELECT t0.*
FROM star1 t0
  LEFT ANTI JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

    def test_self_reference_simple(self):
        t1 = self.con.table('star1')

        result_sql = to_sql(t1.view())
        expected_sql = "SELECT *\nFROM star1"
        assert result_sql == expected_sql

    def test_join_self_reference(self):
        t1 = self.con.table('star1')
        t2 = t1.view()

        result = t1.inner_join(t2, [t1.foo_id == t2.bar_id])[[t1]]

        result_sql = to_sql(result)
        expected_sql = """SELECT t0.*
FROM star1 t0
  INNER JOIN star1 t1
    ON t0.`foo_id` = t1.`bar_id`"""
        assert result_sql == expected_sql

    def test_join_projection_subquery_broken_alias(self):
        # From an observed bug, derived from tpch tables
        geo = (nation.inner_join(region, [('n_regionkey', 'r_regionkey')])
               [nation.n_nationkey,
                nation.n_name.name('nation'),
                region.r_name.name('region')])

        expr = (geo.inner_join(customer, [('n_nationkey', 'c_nationkey')])
                [customer, geo])

        result = to_sql(expr)
        expected = """SELECT t1.*, t0.*
FROM (
  SELECT t2.`n_nationkey`, t2.`n_name` AS `nation`, t3.`r_name` AS `region`
  FROM nation t2
    INNER JOIN region t3
      ON t2.`n_regionkey` = t3.`r_regionkey`
) t0
  INNER JOIN customer t1
    ON t0.`n_nationkey` = t1.`c_nationkey`"""
        assert result == expected

    def test_where_simple_comparisons(self):
        t1 = self.con.table('star1')

        what = t1.filter([t1.f > 0, t1.c < t1.f * 2])

        result = to_sql(what)
        expected = """SELECT *
FROM star1
WHERE `f` > 0 AND
      `c` < (`f` * 2)"""
        assert result == expected

    def test_where_in_array_literal(self):
        # e.g.
        # where string_col in (v1, v2, v3)
        raise unittest.SkipTest
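        # A hedged sketch of what this would exercise (not asserted here):
        # `isin` compiles to an IN list, as test_topk_analysis_bug below
        # shows, so something like
        #
        #   t1 = self.con.table('star1')
        #   expr = t1[t1.foo_id.isin(['a', 'b', 'c'])]
        #
        # should render roughly as
        #
        #   SELECT * FROM star1 WHERE `foo_id` IN ('a', 'b', 'c')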

    def test_where_with_join(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        # This also tests some cases of predicate pushdown
        what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])
                .projection([t1, t2.value1, t2.value3])
                .filter([t1.f > 0, t2.value3 < 1000]))

        what2 = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])
                 .filter([t1.f > 0, t2.value3 < 1000])
                 .projection([t1, t2.value1, t2.value3]))

        expected_sql = """SELECT t0.*, t1.`value1`, t1.`value3`
FROM star1 t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`
WHERE t0.`f` > 0 AND
      t1.`value3` < 1000"""

        result_sql = to_sql(what)
        assert result_sql == expected_sql

        result2_sql = to_sql(what2)
        assert result2_sql == expected_sql

    def test_where_no_pushdown_possible(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        joined = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])
                  [t1, (t1.f - t2.value1).name('diff')])

        filtered = joined[joined.diff > 1]

        # TODO: I'm not sure if this is exactly what we want
        expected_sql = """SELECT *
FROM (
  SELECT t0.*, t0.`f` - t1.`value1` AS `diff`
  FROM star1 t0
    INNER JOIN star2 t1
      ON t0.`foo_id` = t1.`foo_id`
  WHERE t0.`f` > 0 AND
        t1.`value3` < 1000
)
WHERE `diff` > 1"""

        raise unittest.SkipTest

        result_sql = to_sql(filtered)
        assert result_sql == expected_sql

    def test_where_with_between(self):
        t = self.con.table('alltypes')

        what = t.filter([t.a > 0, t.f.between(0, 1)])
        result = to_sql(what)
        expected = """SELECT *
FROM alltypes
WHERE `a` > 0 AND
      `f` BETWEEN 0 AND 1"""
        assert result == expected

    def test_where_analyze_scalar_op(self):
        # root cause of #310

        table = self.con.table('functional_alltypes')

        expr = (table.filter([table.timestamp_col <
                             (ibis.timestamp('2010-01-01') + ibis.month(3)),
                             table.timestamp_col < (ibis.now() +
                                                    ibis.day(10))])
                .count())

        result = to_sql(expr)
        expected = """\
SELECT count(*) AS `tmp`
FROM functional_alltypes
WHERE `timestamp_col` < months_add('2010-01-01 00:00:00', 3) AND
      `timestamp_col` < days_add(now(), 10)"""
        assert result == expected

    def test_simple_aggregate_query(self):
        t1 = self.con.table('star1')

        cases = [
            (t1.aggregate([t1['f'].sum().name('total')],
                          [t1['foo_id']]),
             """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1"""),
            (t1.aggregate([t1['f'].sum().name('total')],
                          ['foo_id', 'bar_id']),
             """SELECT `foo_id`, `bar_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1, 2""")
        ]
        for expr, expected_sql in cases:
            result_sql = to_sql(expr)
            assert result_sql == expected_sql

    def test_aggregate_having(self):
        # Filtering post-aggregation predicate
        t1 = self.con.table('star1')

        total = t1.f.sum().name('total')
        metrics = [total]

        expr = t1.aggregate(metrics, by=['foo_id'],
                            having=[total > 10])
        result = to_sql(expr)
        expected = """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1
HAVING sum(`f`) > 10"""
        assert result == expected

        expr = t1.aggregate(metrics, by=['foo_id'],
                            having=[t1.count() > 100])
        result = to_sql(expr)
        expected = """SELECT `foo_id`, sum(`f`) AS `total`
FROM star1
GROUP BY 1
HAVING count(*) > 100"""
        assert result == expected

    def test_aggregate_table_count_metric(self):
        expr = self.con.table('star1').count()

        result = to_sql(expr)
        expected = """SELECT count(*) AS `tmp`
FROM star1"""
        assert result == expected

        # count on a more complicated table
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        join_expr = region.r_regionkey == nation.n_regionkey
        joined = region.inner_join(nation, join_expr)
        table_ref = joined[nation, region.r_name.name('region')]

        expr = table_ref.count()
        result = to_sql(expr)
        expected = """SELECT count(*) AS `tmp`
FROM (
  SELECT t2.*, t1.`r_name` AS `region`
  FROM tpch_region t1
    INNER JOIN tpch_nation t2
      ON t1.`r_regionkey` = t2.`n_regionkey`
) t0"""
        assert result == expected

    def test_expr_template_field_name_binding(self):
        # Given an expression with no concrete links to actual database tables,
        # indicate a mapping between the distinct unbound table leaves of the
        # expression and some database tables with compatible schemas but
        # potentially different column names
        pass
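        # A hedged sketch of the intended API (hypothetical; no such method
        # exists in this suite): given an unbound table
        #
        #   t = api.table([('key', 'string'), ('value', 'double')])
        #
        # one could imagine a binding step along the lines of
        #
        #   bound = t.bind(self.con.table('star1'))  # hypothetical method
        #
        # mapping the unbound leaves onto concrete tables with compatible
        # schemas, even when the column names differ.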

    def test_no_aliases_needed(self):
        table = api.table([
            ('key1', 'string'),
            ('key2', 'string'),
            ('value', 'double')
        ])

        expr = table.aggregate([table['value'].sum().name('total')],
                               by=['key1', 'key2'])

        query = _get_query(expr)
        context = query.context
        assert not context.need_aliases()

    def test_table_names_overlap_default_aliases(self):
        # see discussion in #104; this is not actually needed for query
        # correctness, and only makes the generated SQL nicer
        raise unittest.SkipTest

        t0 = api.table([
            ('key', 'string'),
            ('v1', 'double')
        ], 't1')

        t1 = api.table([
            ('key', 'string'),
            ('v2', 'double')
        ], 't0')

        expr = t0.join(t1, t0.key == t1.key)[t0.key, t0.v1, t1.v2]

        result = to_sql(expr)
        expected = """\
SELECT t2.`key`, t2.`v1`, t3.`v2`
FROM t0 t2
  INNER JOIN t1 t3
    ON t2.`key` = t3.`key`"""

        assert result == expected

    def test_context_aliases_multiple_join(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')
        t3 = self.con.table('star3')

        expr = (t1.left_join(t2, [t1['foo_id'] == t2['foo_id']])
                .inner_join(t3, [t1['bar_id'] == t3['bar_id']])
                [[t1, t2['value1'], t3['value2']]])

        query = _get_query(expr)
        context = query.context

        assert context.get_alias(t1) == 't0'
        assert context.get_alias(t2) == 't1'
        assert context.get_alias(t3) == 't2'

    def test_fuse_projections(self):
        table = api.table([
            ('foo', 'int32'),
            ('bar', 'int64'),
            ('value', 'double')
        ], name='tbl')

        # Cases where both projection steps use the base table reference
        f1 = (table['foo'] + table['bar']).name('baz')
        pred = table['value'] > 0

        table2 = table[table, f1]
        table2_filtered = table2[pred]

        f2 = (table2['foo'] * 2).name('qux')
        f3 = (table['foo'] * 2).name('qux')

        table3 = table2.projection([table2, f2])

        # fusion works even if there's a filter
        table3_filtered = table2_filtered.projection([table2, f2])

        expected = table[table, f1, f3]
        expected2 = table[pred][table, f1, f3]

        assert table3.equals(expected)
        assert table3_filtered.equals(expected2)

        ex_sql = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl"""

        ex_sql2 = """SELECT *, `foo` + `bar` AS `baz`, `foo` * 2 AS `qux`
FROM tbl
WHERE `value` > 0"""

        table3_sql = to_sql(table3)
        table3_filt_sql = to_sql(table3_filtered)

        assert table3_sql == ex_sql
        assert table3_filt_sql == ex_sql2

    def test_bug_project_multiple_times(self):
        # 108
        customer = self.con.table('tpch_customer')
        nation = self.con.table('tpch_nation')
        region = self.con.table('tpch_region')

        joined = (
            customer.inner_join(nation,
                                [customer.c_nationkey == nation.n_nationkey])
            .inner_join(region,
                        [nation.n_regionkey == region.r_regionkey])
        )
        proj1 = [customer, nation.n_name, region.r_name]
        step1 = joined[proj1]

        topk_by = step1.c_acctbal.cast('double').sum()
        pred = step1.n_name.topk(10, by=topk_by)

        proj_exprs = [step1.c_name, step1.r_name, step1.n_name]
        step2 = step1[pred]
        expr = step2.projection(proj_exprs)

        # it works!
        result = to_sql(expr)
        expected = """\
SELECT `c_name`, `r_name`, `n_name`
FROM (
  SELECT t1.*, t2.`n_name`, t3.`r_name`
  FROM tpch_customer t1
    INNER JOIN tpch_nation t2
      ON t1.`c_nationkey` = t2.`n_nationkey`
    INNER JOIN tpch_region t3
      ON t2.`n_regionkey` = t3.`r_regionkey`
    LEFT SEMI JOIN (
      SELECT t2.`n_name`, sum(CAST(t1.`c_acctbal` AS double)) AS `sum`
      FROM tpch_customer t1
        INNER JOIN tpch_nation t2
          ON t1.`c_nationkey` = t2.`n_nationkey`
        INNER JOIN tpch_region t3
          ON t2.`n_regionkey` = t3.`r_regionkey`
      GROUP BY 1
      ORDER BY `sum` DESC
      LIMIT 10
    ) t4
      ON t2.`n_name` = t4.`n_name`
) t0"""
        assert result == expected

    def test_aggregate_projection_subquery(self):
        t = self.con.table('alltypes')

        proj = t[t.f > 0][t, (t.a + t.b).name('foo')]

        def agg(x):
            return x.aggregate([x.foo.sum().name('foo total')], by=['g'])

        # predicate gets pushed down
        filtered = proj[proj.g == 'bar']

        result = to_sql(filtered)
        expected = """SELECT *, `a` + `b` AS `foo`
FROM alltypes
WHERE `f` > 0 AND
      `g` = 'bar'"""
        assert result == expected

        agged = agg(filtered)
        result = to_sql(agged)
        expected = """SELECT `g`, sum(`foo`) AS `foo total`
FROM (
  SELECT *, `a` + `b` AS `foo`
  FROM alltypes
  WHERE `f` > 0 AND
        `g` = 'bar'
) t0
GROUP BY 1"""
        assert result == expected

        # Pushdown is not possible (in Impala, Postgres, others)
        agged2 = agg(proj[proj.foo < 10])

        result = to_sql(agged2)
        expected = """SELECT t0.`g`, sum(t0.`foo`) AS `foo total`
FROM (
  SELECT *, `a` + `b` AS `foo`
  FROM alltypes
  WHERE `f` > 0
) t0
WHERE t0.`foo` < 10
GROUP BY 1"""
        assert result == expected

    def test_subquery_aliased(self):
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        agged = t1.aggregate([t1.f.sum().name('total')], by=['foo_id'])
        what = (agged.inner_join(t2, [agged.foo_id == t2.foo_id])
                [agged, t2.value1])

        result = to_sql(what)
        expected = """SELECT t0.*, t1.`value1`
FROM (
  SELECT `foo_id`, sum(`f`) AS `total`
  FROM star1
  GROUP BY 1
) t0
  INNER JOIN star2 t1
    ON t0.`foo_id` = t1.`foo_id`"""
        assert result == expected

    def test_double_nested_subquery_no_aliases(self):
        # We don't require any table aliasing anywhere
        t = api.table([
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            ('value', 'double')
        ], 'foo_table')

        agg1 = t.aggregate([t.value.sum().name('total')],
                           by=['key1', 'key2', 'key3'])
        agg2 = agg1.aggregate([agg1.total.sum().name('total')],
                              by=['key1', 'key2'])
        agg3 = agg2.aggregate([agg2.total.sum().name('total')],
                              by=['key1'])

        result = to_sql(agg3)
        expected = """SELECT `key1`, sum(`total`) AS `total`
FROM (
  SELECT `key1`, `key2`, sum(`total`) AS `total`
  FROM (
    SELECT `key1`, `key2`, `key3`, sum(`value`) AS `total`
    FROM foo_table
    GROUP BY 1, 2, 3
  ) t1
  GROUP BY 1, 2
) t0
GROUP BY 1"""
        assert result == expected

    def test_aggregate_projection_alias_bug(self):
        # Observed in use
        t1 = self.con.table('star1')
        t2 = self.con.table('star2')

        what = (t1.inner_join(t2, [t1.foo_id == t2.foo_id])
                [[t1, t2.value1]])

        what = what.aggregate([what.value1.sum().name('total')],
                              by=[what.foo_id])

        # TODO: Not fusing the aggregation with the projection yet
        result = to_sql(what)
        expected = """SELECT `foo_id`, sum(`value1`) AS `total`
FROM (
  SELECT t1.*, t2.`value1`
  FROM star1 t1
    INNER JOIN star2 t2
      ON t1.`foo_id` = t2.`foo_id`
) t0
GROUP BY 1"""
        assert result == expected

    def test_aggregate_fuse_with_projection(self):
        # see above test case
        pass
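        # A hedged sketch of the fused output (assumption, not implemented
        # yet): the aggregate-over-projection in the previous test would
        # collapse into a single query with no derived table, e.g.
        #
        #   SELECT t0.`foo_id`, sum(t1.`value1`) AS `total`
        #   FROM star1 t0
        #     INNER JOIN star2 t1
        #       ON t0.`foo_id` = t1.`foo_id`
        #   GROUP BY 1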

    def test_subquery_used_for_self_join(self):
        # There could be cases that should look in SQL like
        # WITH t0 as (some subquery)
        # select ...
        # from t0 t1
        #   join t0 t2
        #     on t1.kind = t2.subkind
        # ...
        # However, the Ibis code will simply have an expression (projection or
        # aggregation, say) built on top of the subquery expression, so we need
        # to extract the subquery unit (we see that it appears multiple times
        # in the tree).
        t = self.con.table('alltypes')

        agged = t.aggregate([t.f.sum().name('total')], by=['g', 'a', 'b'])
        view = agged.view()
        metrics = [(agged.total - view.total).max().name('metric')]
        reagged = (agged.inner_join(view, [agged.a == view.b])
                   .aggregate(metrics, by=[agged.g]))

        result = to_sql(reagged)
        expected = """WITH t0 AS (
  SELECT `g`, `a`, `b`, sum(`f`) AS `total`
  FROM alltypes
  GROUP BY 1, 2, 3
)
SELECT t0.`g`, max(t0.`total` - t1.`total`) AS `metric`
FROM t0
  INNER JOIN t0 t1
    ON t0.`a` = t1.`b`
GROUP BY 1"""
        assert result == expected

    def test_subquery_factor_correlated_subquery(self):
        # #173, #183 and other issues
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        customer = self.con.table('tpch_customer')
        orders = self.con.table('tpch_orders')

        fields_of_interest = [customer,
                              region.r_name.name('region'),
                              orders.o_totalprice.name('amount'),
                              orders.o_orderdate
                              .cast('timestamp').name('odate')]

        tpch = (region.join(nation, region.r_regionkey == nation.n_regionkey)
                .join(customer, customer.c_nationkey == nation.n_nationkey)
                .join(orders, orders.o_custkey == customer.c_custkey)
                [fields_of_interest])

        # Self-reference + correlated subquery complicates things
        t2 = tpch.view()
        conditional_avg = t2[t2.region == tpch.region].amount.mean()
        amount_filter = tpch.amount > conditional_avg

        expr = tpch[amount_filter].limit(10)

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT t5.*, t1.`r_name` AS `region`, t3.`o_totalprice` AS `amount`,
         CAST(t3.`o_orderdate` AS timestamp) AS `odate`
  FROM tpch_region t1
    INNER JOIN tpch_nation t2
      ON t1.`r_regionkey` = t2.`n_regionkey`
    INNER JOIN tpch_customer t5
      ON t5.`c_nationkey` = t2.`n_nationkey`
    INNER JOIN tpch_orders t3
      ON t3.`o_custkey` = t5.`c_custkey`
)
SELECT t0.*
FROM t0
WHERE t0.`amount` > (
  SELECT avg(t4.`amount`) AS `tmp`
  FROM t0 t4
  WHERE t4.`region` = t0.`region`
)
LIMIT 10"""
        assert result == expected

    def test_self_join_subquery_distinct_equal(self):
        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')

        j1 = (region.join(nation, region.r_regionkey == nation.n_regionkey)
              [region, nation])

        j2 = (region.join(nation, region.r_regionkey == nation.n_regionkey)
              [region, nation].view())

        expr = (j1.join(j2, j1.r_regionkey == j2.r_regionkey)
                [j1.r_name, j2.n_name])

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT t2.*, t3.*
  FROM tpch_region t2
    INNER JOIN tpch_nation t3
      ON t2.`r_regionkey` = t3.`n_regionkey`
)
SELECT t0.`r_name`, t1.`n_name`
FROM t0
  INNER JOIN t0 t1
    ON t0.`r_regionkey` = t1.`r_regionkey`"""

        assert result == expected

    def test_limit_with_self_join(self):
        t = self.con.table('functional_alltypes')
        t2 = t.view()

        expr = t.join(t2, t.tinyint_col < t2.timestamp_col.minute()).count()

        # it works
        result = to_sql(expr)
        expected = """\
SELECT count(*) AS `tmp`
FROM functional_alltypes t0
  INNER JOIN functional_alltypes t1
    ON t0.`tinyint_col` < extract(t1.`timestamp_col`, 'minute')"""
        assert result == expected

    def test_cte_factor_distinct_but_equal(self):
        t = self.con.table('alltypes')
        tt = self.con.table('alltypes')

        expr1 = t.group_by('g').aggregate(t.f.sum().name('metric'))
        expr2 = tt.group_by('g').aggregate(tt.f.sum().name('metric')).view()

        expr = expr1.join(expr2, expr1.g == expr2.g)[[expr1]]

        result = to_sql(expr)
        expected = """\
WITH t0 AS (
  SELECT `g`, sum(`f`) AS `metric`
  FROM alltypes
  GROUP BY 1
)
SELECT t0.*
FROM t0
  INNER JOIN t0 t1
    ON t0.`g` = t1.`g`"""

        assert result == expected

    def test_tpch_self_join_failure(self):
        # duplicating the integration test here

        region = self.con.table('tpch_region')
        nation = self.con.table('tpch_nation')
        customer = self.con.table('tpch_customer')
        orders = self.con.table('tpch_orders')

        fields_of_interest = [
            region.r_name.name('region'),
            nation.n_name.name('nation'),
            orders.o_totalprice.name('amount'),
            orders.o_orderdate.cast('timestamp').name('odate')]

        joined_all = (
            region.join(nation, region.r_regionkey == nation.n_regionkey)
            .join(customer, customer.c_nationkey == nation.n_nationkey)
            .join(orders, orders.o_custkey == customer.c_custkey)
            [fields_of_interest])

        year = joined_all.odate.year().name('year')
        total = joined_all.amount.sum().cast('double').name('total')
        annual_amounts = (joined_all
                          .group_by(['region', year])
                          .aggregate(total))

        current = annual_amounts
        prior = annual_amounts.view()

        yoy_change = (current.total - prior.total).name('yoy_change')
        yoy = (current.join(prior, current.year == (prior.year - 1))
               [current.region, current.year, yoy_change])
        to_sql(yoy)

    def test_extract_subquery_nested_lower(self):
        # We may have a join between two tables requiring subqueries, and
        # buried inside these there may be a common subquery. Let's test that
        # we find it and pull it out to the top level to avoid repeating
        # ourselves.
        pass
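        # A hedged sketch (not asserted here), reusing the pattern from
        # test_subquery_used_for_self_join:
        #
        #   t = self.con.table('alltypes')
        #   agged = t.aggregate([t.f.sum().name('total')], by=['g'])
        #   other = agged.view()
        #   expr = agged.inner_join(other, [agged.g == other.g])[[agged]]
        #
        # Both join sides contain the same aggregate, which should surface
        # once as a `WITH t0 AS (...)` block rather than being inlined twice.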

    def test_subquery_in_filter_predicate(self):
        # E.g. comparing against some scalar aggregate value. See Ibis #43
        t1 = self.con.table('star1')

        pred = t1.f > t1.f.mean()
        expr = t1[pred]

        # This brought out another expression rewriting bug, since the filtered
        # table isn't found elsewhere in the expression.
        pred2 = t1.f > t1[t1.foo_id == 'foo'].f.mean()
        expr2 = t1[pred2]

        result = to_sql(expr)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT avg(`f`) AS `tmp`
  FROM star1
)"""
        assert result == expected

        result = to_sql(expr2)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT avg(`f`) AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

    def test_filter_subquery_derived_reduction(self):
        t1 = self.con.table('star1')

        # Reduction can be nested inside some scalar expression
        pred3 = t1.f > t1[t1.foo_id == 'foo'].f.mean().log()
        pred4 = t1.f > (t1[t1.foo_id == 'foo'].f.mean().log() + 1)

        expr3 = t1[pred3]
        result = to_sql(expr3)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT ln(avg(`f`)) AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

        expr4 = t1[pred4]

        result = to_sql(expr4)
        expected = """SELECT *
FROM star1
WHERE `f` > (
  SELECT ln(avg(`f`)) + 1 AS `tmp`
  FROM star1
  WHERE `foo_id` = 'foo'
)"""
        assert result == expected

    def test_topk_operation_to_semi_join(self):
        # TODO: top K with filter in place

        table = api.table([
            ('foo', 'string'),
            ('bar', 'string'),
            ('city', 'string'),
            ('v1', 'double'),
            ('v2', 'double'),
        ], 'tbl')

        what = table.city.topk(10, by=table.v2.mean())
        filtered = table[what]

        query = to_sql(filtered)
        expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, avg(`v2`) AS `mean`
    FROM tbl
    GROUP BY 1
    ORDER BY `mean` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
        assert query == expected

        # Test the default metric (count)

        what = table.city.topk(10)
        filtered2 = table[what]
        query = to_sql(filtered2)
        expected = """SELECT t0.*
FROM tbl t0
  LEFT SEMI JOIN (
    SELECT `city`, count(`city`) AS `count`
    FROM tbl
    GROUP BY 1
    ORDER BY `count` DESC
    LIMIT 10
  ) t1
    ON t0.`city` = t1.`city`"""
        assert query == expected

    def test_topk_predicate_pushdown_bug(self):
        # Observed on TPCH data
        cplusgeo = (
            customer.inner_join(nation, [customer.c_nationkey ==
                                         nation.n_nationkey])
                    .inner_join(region, [nation.n_regionkey ==
                                         region.r_regionkey])
            [customer, nation.n_name, region.r_name])

        pred = cplusgeo.n_name.topk(10, by=cplusgeo.c_acctbal.sum())
        expr = cplusgeo.filter([pred])

        result = to_sql(expr)
        expected = """\
SELECT t0.*, t1.`n_name`, t2.`r_name`
FROM customer t0
  INNER JOIN nation t1
    ON t0.`c_nationkey` = t1.`n_nationkey`
  INNER JOIN region t2
    ON t1.`n_regionkey` = t2.`r_regionkey`
  LEFT SEMI JOIN (
    SELECT t1.`n_name`, sum(t0.`c_acctbal`) AS `sum`
    FROM customer t0
      INNER JOIN nation t1
        ON t0.`c_nationkey` = t1.`n_nationkey`
      INNER JOIN region t2
        ON t1.`n_regionkey` = t2.`r_regionkey`
    GROUP BY 1
    ORDER BY `sum` DESC
    LIMIT 10
  ) t3
    ON t1.`n_name` = t3.`n_name`"""
        assert result == expected

    def test_topk_analysis_bug(self):
        # GH #398
        airlines = ibis.table([('dest', 'string'),
                               ('origin', 'string'),
                               ('arrdelay', 'int32')], 'airlines')

        dests = ['ORD', 'JFK', 'SFO']
        t = airlines[airlines.dest.isin(dests)]
        delay_filter = t.dest.topk(10, by=t.arrdelay.mean())
        expr = t[delay_filter].group_by('origin').size()

        result = to_sql(expr)
        expected = """\
SELECT t0.`origin`, count(*) AS `count`
FROM airlines t0
  LEFT SEMI JOIN (
    SELECT `dest`, avg(`arrdelay`) AS `mean`
    FROM airlines
    WHERE `dest` IN ('ORD', 'JFK', 'SFO')
    GROUP BY 1
    ORDER BY `mean` DESC
    LIMIT 10
  ) t1
    ON t0.`dest` = t1.`dest`
WHERE t0.`dest` IN ('ORD', 'JFK', 'SFO')
GROUP BY 1"""

        assert result == expected

    def test_topk_to_aggregate(self):
        t = ibis.table([('dest', 'string'),
                        ('origin', 'string'),
                        ('arrdelay', 'int32')], 'airlines')

        top = t.dest.topk(10, by=t.arrdelay.mean())

        result = to_sql(top)
        expected = to_sql(top.to_aggregation())
        assert result == expected

    def test_bottomk(self):
        pass
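        # A hedged sketch (assumption; no `bottomk` is exercised in this
        # suite): a bottom-k counterpart to `topk` would presumably compile
        # to the same LEFT SEMI JOIN shape with the sort order reversed:
        #
        #   SELECT t0.*
        #   FROM tbl t0
        #     LEFT SEMI JOIN (
        #       SELECT `city`, count(`city`) AS `count`
        #       FROM tbl
        #       GROUP BY 1
        #       ORDER BY `count` ASC
        #       LIMIT 10
        #     ) t1
        #       ON t0.`city` = t1.`city`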

    def test_topk_antijoin(self):
        # Get the "other" category somehow
        pass
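        # A hedged sketch (assumption): selecting the rows *outside* the top
        # k would reuse the topk aggregate subquery with LEFT ANTI JOIN (see
        # test_semi_anti_joins) in place of LEFT SEMI JOIN:
        #
        #   SELECT t0.*
        #   FROM tbl t0
        #     LEFT ANTI JOIN (
        #       SELECT `city`, count(`city`) AS `count`
        #       FROM tbl
        #       GROUP BY 1
        #       ORDER BY `count` DESC
        #       LIMIT 10
        #     ) t1
        #       ON t0.`city` = t1.`city`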

    def test_case_in_projection(self):
        t = self.con.table('alltypes')

        expr = (t.g.case()
                .when('foo', 'bar')
                .when('baz', 'qux')
                .else_('default').end())

        expr2 = (api.case()
                 .when(t.g == 'foo', 'bar')
                 .when(t.g == 'baz', t.g)
                 .end())

        proj = t[expr.name('col1'), expr2.name('col2'), t]

        result = to_sql(proj)
        expected = """SELECT
  CASE `g`
    WHEN 'foo' THEN 'bar'
    WHEN 'baz' THEN 'qux'
    ELSE 'default'
  END AS `col1`,
  CASE
    WHEN `g` = 'foo' THEN 'bar'
    WHEN `g` = 'baz' THEN `g`
    ELSE NULL
  END AS `col2`, *
FROM alltypes"""
        assert result == expected

    def test_identifier_quoting(self):
        data = api.table([
            ('date', 'int32'),
            ('explain', 'string')
        ], 'table')

        expr = data[data.date.name('else'), data.explain.name('join')]

        result = to_sql(expr)
        expected = """SELECT `date` AS `else`, `explain` AS `join`
FROM `table`"""
        assert result == expected
Example #58
class TestBuiltins(unittest.TestCase):
    def setUp(self):
        self.con = MockConnection()
        self.alltypes = self.con.table('functional_alltypes')
        self.lineitem = self.con.table('tpch_lineitem')

    def test_abs(self):
        colnames = [
            'tinyint_col', 'smallint_col', 'int_col', 'bigint_col',
            'float_col', 'double_col'
        ]

        fname = 'abs'
        op = ops.Abs

        for col in colnames:
            expr = self.alltypes[col]
            self._check_unary_op(expr, fname, op, type(expr))

        expr = self.lineitem.l_extendedprice
        self._check_unary_op(expr, fname, op, type(expr))

    def test_group_concat(self):
        col = self.alltypes.string_col

        expr = col.group_concat()
        assert isinstance(expr.op(), ops.GroupConcat)
        arg, sep = expr.op().args
        assert sep == ','

        expr = col.group_concat('|')
        arg, sep = expr.op().args
        assert sep == '|'

    def test_zeroifnull(self):
        dresult = self.alltypes.double_col.zeroifnull()
        iresult = self.alltypes.int_col.zeroifnull()

        assert type(dresult.op()) == ops.ZeroIfNull
        assert type(dresult) == ir.DoubleArray

        # Impala upconverts all ints to bigint
        assert type(iresult) == ir.Int64Array

    def test_fillna(self):
        result = self.alltypes.double_col.fillna(5)
        assert isinstance(result, ir.DoubleArray)

        assert isinstance(result.op(), ops.IfNull)

        result = self.alltypes.bool_col.fillna(True)
        assert isinstance(result, ir.BooleanArray)

        # Retains type of caller (for now)
        result = self.alltypes.int_col.fillna(self.alltypes.bigint_col)
        assert isinstance(result, ir.Int32Array)

    def test_ceil_floor(self):
        cresult = self.alltypes.double_col.ceil()
        fresult = self.alltypes.double_col.floor()
        assert isinstance(cresult, ir.Int64Array)
        assert isinstance(fresult, ir.Int64Array)
        assert type(cresult.op()) == ops.Ceil
        assert type(fresult.op()) == ops.Floor

        cresult = ibis.literal(1.2345).ceil()
        fresult = ibis.literal(1.2345).floor()
        assert isinstance(cresult, ir.Int64Scalar)
        assert isinstance(fresult, ir.Int64Scalar)

        dec_col = self.lineitem.l_extendedprice
        cresult = dec_col.ceil()
        fresult = dec_col.floor()
        assert isinstance(cresult, ir.DecimalArray)
        assert cresult.meta == dec_col.meta

        assert isinstance(fresult, ir.DecimalArray)
        assert fresult.meta == dec_col.meta

    def test_sign(self):
        result = self.alltypes.double_col.sign()
        assert isinstance(result, ir.FloatArray)
        assert type(result.op()) == ops.Sign

        result = ibis.literal(1.2345).sign()
        assert isinstance(result, ir.FloatScalar)

        dec_col = self.lineitem.l_extendedprice
        result = dec_col.sign()
        assert isinstance(result, ir.FloatArray)

    def test_round(self):
        result = self.alltypes.double_col.round()
        assert isinstance(result, ir.Int64Array)
        assert result.op().args[1] is None

        result = self.alltypes.double_col.round(2)
        assert isinstance(result, ir.DoubleArray)
        assert result.op().args[1] == 2

        # Rounding an integer column yields a double (at least in Impala;
        # check with other DB implementations)
        result = self.alltypes.int_col.round(2)
        assert isinstance(result, ir.DoubleArray)

        dec = self.lineitem.l_extendedprice
        result = dec.round()
        assert isinstance(result, ir.DecimalArray)

        result = dec.round(2)
        assert isinstance(result, ir.DecimalArray)

        result = ibis.literal(1.2345).round()
        assert isinstance(result, ir.Int64Scalar)

    def _check_unary_op(self, expr, fname, ex_op, ex_type):
        result = getattr(expr, fname)()
        assert type(result.op()) == ex_op
        assert type(result) == ex_type
Example #59
class TestUnaryBuiltins(unittest.TestCase, ExprSQLTest):

    def setUp(self):
        self.con = MockConnection()
        self.table = self.con.table('functional_alltypes')

    def test_numeric_unary_builtins(self):
        # Functions taking no arguments beyond the column itself
        functions = ['abs', 'ceil', 'floor', 'exp', 'sqrt', 'sign',
                     ('log', 'ln'),
                     ('approx_median', 'appx_median'),
                     ('approx_nunique', 'ndv'),
                     'ln', 'log2', 'log10', 'nullifzero', 'zeroifnull']

        cases = []
        for what in functions:
            if isinstance(what, tuple):
                ibis_name, sql_name = what
            else:
                ibis_name = sql_name = what

            for cname in ['double_col', 'int_col']:
                expr = getattr(self.table[cname], ibis_name)()
                cases.append((expr, '{0}({1})'.format(
                    sql_name, '`{0}`'.format(cname))))

        self._check_expr_cases(cases)

    def test_log_other_bases(self):
        cases = [
            (self.table.double_col.log(5), 'log(`double_col`, 5)')
        ]
        self._check_expr_cases(cases)

    def test_round(self):
        cases = [
            (self.table.double_col.round(), 'round(`double_col`)'),
            (self.table.double_col.round(0), 'round(`double_col`, 0)'),
            (self.table.double_col.round(2), 'round(`double_col`, 2)'),
            (self.table.double_col.round(self.table.tinyint_col),
             'round(`double_col`, `tinyint_col`)')
        ]
        self._check_expr_cases(cases)

    def test_hash(self):
        expr = self.table.int_col.hash()
        assert isinstance(expr, ir.Int64Array)
        assert isinstance(self.table.int_col.sum().hash(),
                          ir.Int64Scalar)

        cases = [
            (self.table.int_col.hash(), 'fnv_hash(`int_col`)')
        ]
        self._check_expr_cases(cases)

    def test_reduction_where(self):
        cond = self.table.bigint_col < 70
        c = self.table.double_col
        tmp = ('{0}(CASE WHEN `bigint_col` < 70 THEN `double_col` '
               'ELSE NULL END)')
        cases = [
            (c.sum(where=cond), tmp.format('sum')),
            (c.count(where=cond), tmp.format('count')),
            (c.mean(where=cond), tmp.format('avg')),
            (c.max(where=cond), tmp.format('max')),
            (c.min(where=cond), tmp.format('min')),
            (c.std(where=cond), tmp.format('stddev')),
            (c.std(where=cond, how='pop'), tmp.format('stddev_pop')),
            (c.var(where=cond), tmp.format('variance')),
            (c.var(where=cond, how='pop'), tmp.format('variance_pop')),
        ]
        self._check_expr_cases(cases)

    def test_reduction_invalid_where(self):
        condbad_literal = L('T')
        c = self.table.double_col
        for reduction in [c.sum, c.count, c.mean, c.max, c.min]:
            with self.assertRaises(TypeError):
                reduction(where=condbad_literal)