def test_join(how):
    """Build a projected join of the given kind and render it as a graph.

    Passes when ``key(result)`` for the projected expression occurs in the
    source text of the graph produced by ``viz.to_graph``.
    """
    lhs = ibis.table([('a', 'int64'), ('b', 'string')])
    rhs = ibis.table([('b', 'string'), ('c', 'int64')])

    same_b = lhs.b == rhs.b
    projection = lhs.join(rhs, same_b, how=how)[lhs.a, rhs.c]

    dot = viz.to_graph(projection)
    assert key(projection) in dot.source
# Builds `ratings` joined to `movies`, filters the join twice, then filters
# again with an `isin` against the `movieid` column of the second filter, and
# compares the `to_sql` output with an expected query that uses two CTEs
# (`t0`, `t1`).
# NOTE(review): this definition sits on a single physical line, so the line
# breaks and indentation inside the `expected` SQL literal cannot be recovered
# from here -- confirm them against the upstream file before reformatting.
def test_nested_join_multiple_ctes(): ratings = ibis.table( [ ('userid', 'int64'), ('movieid', 'int64'), ('rating', 'int8'), ('timestamp', 'string'), ], name='ratings', ) movies = ibis.table( [('movieid', 'int64'), ('title', 'string')], name='movies' ) expr = ratings.timestamp.cast('timestamp') ratings2 = ratings['userid', 'movieid', 'rating', expr.name('datetime')] joined2 = ratings2.join(movies, ['movieid'])[ratings2, movies['title']] joined3 = joined2.filter( [joined2.userid == 118205, joined2.datetime.year() > 2001] ) top_user_old_movie_ids = joined3.filter( [joined3.userid == 118205, joined3.datetime.year() < 2009] )[['movieid']] # projection from a filter was hiding an insidious bug, so we're disabling # that for now see issue #1295 cond = joined3.movieid.isin(top_user_old_movie_ids.movieid) result = joined3[cond] expected = """\ WITH t0 AS ( SELECT `userid`, `movieid`, `rating`, CAST(`timestamp` AS timestamp) AS `datetime` FROM ratings ), t1 AS ( SELECT t0.*, t5.`title` FROM t0 INNER JOIN movies t5 ON t0.`movieid` = t5.`movieid` ) SELECT t2.* FROM ( SELECT t1.* FROM t1 WHERE (t1.`userid` = 118205) AND (extract(t1.`datetime`, 'year') > 2001) ) t2 WHERE t2.`movieid` IN ( SELECT `movieid` FROM ( SELECT t1.* FROM t1 WHERE (t1.`userid` = 118205) AND (extract(t1.`datetime`, 'year') > 2001) AND (t1.`userid` = 118205) AND (extract(t1.`datetime`, 'year') < 2009) ) t4 )""" compiled_result = to_sql(result) assert compiled_result == expected
def test_join_project_after(self):
    """Project an unmaterialized join onto one whole table plus one column.

    The queries being modelled look like::

        SELECT L.foo, L.bar, R.baz, R.qux
        FROM table1 L
          INNER JOIN table2 R
            ON L.key = R.key

    or ``SELECT L.*, R.baz ...``.
    """
    # The default for a join is selecting all fields if possible
    lhs = ibis.table([('key1', 'string'), ('value1', 'double')])
    rhs = ibis.table([('key2', 'string'), ('stuff', 'int32')])

    keys_match = lhs['key1'] == rhs['key2']
    unmaterialized = lhs.left_join(rhs, [keys_match])

    # Whole left table first, then a single right-hand column.
    left_first = unmaterialized.projection([lhs, rhs['stuff']])
    assert left_first.schema().names == ['key1', 'value1', 'stuff']

    # Whole right table first, then a single left-hand column.
    right_first = unmaterialized.projection([rhs, lhs['key1']])
    assert right_first.schema().names == ['key2', 'stuff', 'key1']
def test_unravel_compound_equijoin(table):
    """A single ``&``-chained predicate joins like the list of its parts."""
    foo = ibis.table(
        [
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            ('value1', 'double'),
        ],
        'foo_table',
    )
    bar = ibis.table(
        [
            ('key1', 'string'),
            ('key2', 'string'),
            ('key3', 'string'),
            ('value2', 'double'),
        ],
        'bar_table',
    )

    on_key1 = foo.key1 == bar.key1
    on_key2 = foo.key2 == bar.key2
    on_key3 = foo.key3 == bar.key3

    compound = foo.inner_join(bar, [on_key1 & on_key2 & on_key3])
    unravelled = foo.inner_join(bar, [on_key1, on_key2, on_key3])

    assert_equal(compound, unravelled)
def test_format_multiple_join_with_projection(self):
    """Smoke test: ``repr`` of a projection over two chained joins works."""
    # Star schema with fact table
    fact = ibis.table([
        ('c', 'int32'),
        ('f', 'double'),
        ('foo_id', 'string'),
        ('bar_id', 'string'),
    ])
    foo_dim = ibis.table([('foo_id', 'string'), ('value1', 'double')])
    bar_dim = ibis.table([('bar_id', 'string'), ('value2', 'double')])

    positive_f = fact[fact['f'] > 0]

    # The first predicate is built from the unfiltered fact table, the
    # second from the filtered one.
    foo_pred = fact['foo_id'] == foo_dim['foo_id']
    bar_pred = positive_f['bar_id'] == bar_dim['bar_id']

    chained = (
        positive_f
        .left_join(foo_dim, [foo_pred])
        .inner_join(bar_dim, [bar_pred])
    )

    # Project out the desired fields
    view = chained[[fact, foo_dim['value1'], bar_dim['value2']]]

    # it works!
    repr(view)
def test_asof_join():
    """Graph an as-of join whose right side carries a mutated column.

    Passes when ``key(result)`` occurs in the rendered graph source.
    """
    lhs = ibis.table([('time', 'int32'), ('value', 'double')])
    rhs = ibis.table([('time', 'int32'), ('value2', 'double')]).mutate(foo=1)

    asof = api.asof_join(lhs, rhs, 'time')
    projection = asof[lhs, rhs.foo]

    dot = viz.to_graph(projection)
    assert key(projection) in dot.source
def test_filter_join_unmaterialized(self):
    """Smoke test: filtering an unmaterialized inner join can be repr'd."""
    lhs = ibis.table(
        {'key1': 'string', 'key2': 'string', 'value1': 'double'}
    )
    rhs = ibis.table({'key3': 'string', 'value2': 'double'})

    # It works!
    keys_match = lhs['key1'] == rhs['key3']
    unmaterialized = lhs.inner_join(rhs, [keys_match])
    positive_only = unmaterialized.filter([lhs.value1 > 0])
    repr(positive_only)
def test_compile_with_one_unnamed_table():
    """Compile a join of an unnamed and a named table for SQLite.

    The compiled statement must stringify identically to a hand-built
    SQLAlchemy select over tables aliased ``t0`` and ``t1``.
    """
    unnamed = ibis.table([('a', 'string')])
    named = ibis.table([('b', 'string')], name='s')
    compiled = ibis.sqlite.compile(unnamed.join(named, unnamed.a == named.b))

    # Hand-written SQLAlchemy equivalent.
    left_alias = sa.table('t0', sa.column('a', sa.String)).alias('t0')
    right_alias = sa.table('s', sa.column('b', sa.String)).alias('t1')
    joined_aliases = left_alias.join(
        right_alias, left_alias.c.a == right_alias.c.b
    )
    reference = sa.select([left_alias.c.a, right_alias.c.b]).select_from(
        joined_aliases
    )

    assert str(compiled) == str(reference)
def test_asof_join_with_by():
    """The first ``by`` entry of an as-of join compares ``key`` with ``key``."""
    lhs = ibis.table(
        [('time', 'int32'), ('key', 'int32'), ('value', 'double')]
    )
    rhs = ibis.table(
        [('time', 'int32'), ('key', 'int32'), ('value2', 'double')]
    )

    asof = api.asof_join(lhs, rhs, 'time', by='key')
    first_by = asof.op().by[0].op()

    assert first_by.left.op().name == first_by.right.op().name == 'key'
def test_semi_join_schema(self):
    """A materialized semi join has exactly the left table's schema."""
    # A left semi join discards the schema of the right table
    lhs = ibis.table([('key1', 'string'), ('value1', 'double')])
    rhs = ibis.table([('key2', 'string'), ('stuff', 'double')])

    keys_match = lhs['key1'] == rhs['key2']
    materialized = lhs.semi_join(rhs, [keys_match]).materialize()

    assert_equal(materialized.schema(), lhs.schema())
def setUp(self):
    """Create the mock connection and the ``foo`` / ``bar`` tables."""
    self.con = MockConnection()

    foo_schema = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ]
    self.t1 = ibis.table(foo_schema, 'foo')

    bar_schema = [('key1', 'string'), ('key2', 'string')]
    self.t2 = ibis.table(bar_schema, 'bar')
def test_multiple_join_deeper_reference(self):
    """Smoke test: a second join may use a predicate on the first root table."""
    # Join predicates down the chain might reference one or more root
    # tables in the hierarchy.
    first = ibis.table(
        {'key1': 'string', 'key2': 'string', 'value1': 'double'}
    )
    second = ibis.table({'key3': 'string', 'value2': 'double'})
    third = ibis.table({'key4': 'string', 'value3': 'double'})

    inner = first.inner_join(second, [first['key1'] == second['key3']])
    outer = inner.inner_join(third, [first['key2'] == third['key4']])

    # it works, what more should we test here?
    repr(outer.materialize())
def test_equijoin_schema_merge(self):
    """Inner, left and outer joins all materialize to the merged schema."""
    lhs = ibis.table([('key1', 'string'), ('value1', 'double')])
    rhs = ibis.table([('key2', 'string'), ('stuff', 'int32')])
    keys_match = lhs['key1'] == rhs['key2']

    merged_schema = api.Schema(
        ['key1', 'value1', 'key2', 'stuff'],
        ['string', 'double', 'string', 'int32'],
    )

    for method_name in ['inner_join', 'left_join', 'outer_join']:
        join_method = getattr(lhs, method_name)
        materialized = join_method(rhs, [keys_match]).materialize()
        assert_equal(materialized.schema(), merged_schema)
def test_column_ref_table_aliases(self):
    """Column references are prefixed with the aliases set on the context."""
    ctx = ImpalaContext()

    lhs = ibis.table([("key1", "string"), ("value1", "double")])
    rhs = ibis.table([("key2", "string"), ("value and2", "double")])
    ctx.set_ref(lhs, "t0")
    ctx.set_ref(rhs, "t1")

    difference = lhs["value1"] - rhs["value and2"]
    translated = self._translate(difference, context=ctx)

    assert translated == "t0.`value1` - t1.`value and2`"
def test_join_overlapping_column_names(table):
    """Joining on column-name strings equals joining on explicit equalities."""
    lhs = ibis.table(
        [('foo', 'string'), ('bar', 'string'), ('value1', 'double')]
    )
    rhs = ibis.table(
        [('foo', 'string'), ('bar', 'string'), ('value2', 'double')]
    )

    # A single shared column name.
    by_name = lhs.join(rhs, 'foo')
    by_predicate = lhs.join(rhs, lhs.foo == rhs.foo)
    assert_equal(by_name, by_predicate)

    # A list of shared column names.
    by_names = lhs.join(rhs, ['foo', 'bar'])
    by_predicates = lhs.join(rhs, [lhs.foo == rhs.foo, lhs.bar == rhs.bar])
    assert_equal(by_names, by_predicates)
# Adds a NULL `filter` column to table `x`, filters on it being null,
# re-projects, adds a `count()` column named `analytic`, re-projects once more,
# and compares `ibis.impala.compile` output with the expected nested SELECT.
# NOTE(review): this definition sits on a single physical line, so the line
# breaks and indentation inside the `expected` SQL literal cannot be recovered
# from here -- confirm them against the upstream file before reformatting.
def test_filter_with_analytic(): x = ibis.table(ibis.schema([('col', 'int32')]), 'x') with_filter_col = x[x.columns + [ibis.null().name('filter')]] filtered = with_filter_col[with_filter_col['filter'].isnull()] subquery = filtered[filtered.columns] with_analytic = subquery[['col', subquery.count().name('analytic')]] expr = with_analytic[with_analytic.columns] result = ibis.impala.compile(expr) expected = """\ SELECT `col`, `analytic` FROM ( SELECT `col`, count(*) OVER () AS `analytic` FROM ( SELECT `col`, `filter` FROM ( SELECT * FROM ( SELECT `col`, NULL AS `filter` FROM x ) t3 WHERE `filter` IS NULL ) t2 ) t1 ) t0""" assert result == expected
def test_filter_self_join(self):
    """Self-joining two filters of one aggregate keeps the given expressions.

    Checks that the join predicate and the projected expressions stored on
    the resulting ops equal the ones that were passed in.
    """
    # GH #667
    purchases = ibis.table(
        [
            ('region', 'string'),
            ('kind', 'string'),
            ('user', 'int64'),
            ('amount', 'double'),
        ],
        'purchases',
    )

    total = purchases.amount.sum().name('total')
    by_region_kind = purchases.group_by(['region', 'kind']).aggregate(total)

    foo_rows = by_region_kind[by_region_kind.kind == 'foo']
    bar_rows = by_region_kind[by_region_kind.kind == 'bar']

    same_region = foo_rows.region == bar_rows.region
    self_join = foo_rows.join(bar_rows, same_region)

    # unmodified by analysis
    assert_equal(self_join.op().predicates[0], same_region)

    diff = (foo_rows.total - bar_rows.total).name('diff')
    selections = self_join.projection([foo_rows.region, diff]).op().selections

    # proj exprs unaffected by analysis
    assert_equal(selections[0], foo_rows.region)
    assert_equal(selections[1], diff)
# Filters `airlines` to three destinations, applies a `topk(10)` filter on
# `dest` ordered by mean `arrdelay`, groups by `origin` with `size()`, and
# compares the `to_sql` output with the expected LEFT SEMI JOIN query.
# NOTE(review): this definition sits on a single physical line, so the line
# breaks and indentation inside the `expected` SQL literal cannot be recovered
# from here -- confirm them against the upstream file before reformatting.
def test_topk_analysis_bug(self): # GH #398 airlines = ibis.table([('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')], 'airlines') dests = ['ORD', 'JFK', 'SFO'] t = airlines[airlines.dest.isin(dests)] delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) expr = t[delay_filter].group_by('origin').size() result = to_sql(expr) expected = """\ SELECT t0.`origin`, count(*) AS `count` FROM airlines t0 LEFT SEMI JOIN ( SELECT `dest`, avg(`arrdelay`) AS `mean` FROM airlines WHERE `dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1 ORDER BY `mean` DESC LIMIT 10 ) t1 ON t0.`dest` = t1.`dest` WHERE t0.`dest` IN ('ORD', 'JFK', 'SFO') GROUP BY 1""" assert result == expected
def test_memoize_filtered_tables_in_join():
    """The repr of a projected self-join mentions ``predicates`` three times."""
    # related: GH #667
    purchases = ibis.table(
        [
            ('region', 'string'),
            ('kind', 'string'),
            ('user', 'int64'),
            ('amount', 'double'),
        ],
        'purchases',
    )

    total = purchases.amount.sum().name('total')
    by_region_kind = purchases.group_by(['region', 'kind']).aggregate(total)

    foo_rows = by_region_kind[by_region_kind.kind == 'foo']
    bar_rows = by_region_kind[by_region_kind.kind == 'bar']

    same_region = foo_rows.region == bar_rows.region
    projected = foo_rows.join(bar_rows, same_region)[
        foo_rows, bar_rows.total.name('right_total')
    ]

    rendered = repr(projected)

    # Join, and one for each aggregation
    assert rendered.count('predicates') == 3
def setUp(self):
    """Create the four-column ``foo_table`` used by the tests."""
    columns = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('key3', 'string'),
        ('value', 'double'),
    ]
    self.table = ibis.table(columns, 'foo_table')
def test_sort_by():
    """Graph a sorted group-by aggregate.

    The aggregate only exposes the grouping column ``b`` and the metric
    ``sum_a``, so the sort key must be one of those.  Sorting by ``'c'``
    (a column of the *source* table only) referenced a column that does not
    exist on the aggregated expression.

    Passes when ``key(expr)`` occurs in the rendered graph source.
    """
    t = ibis.table([('a', 'int64'), ('b', 'string'), ('c', 'int32')])
    aggregated = t.groupby(t.b).aggregate(sum_a=t.a.sum().cast('double'))
    # Sort by the grouping column, which survives the aggregation.
    expr = aggregated.sort_by('b')
    graph = viz.to_graph(expr)
    assert key(expr) in graph.source
def test_join_with_external_table(con, alltypes, df):
    """Join ``alltypes`` to an external in-memory table and compare to pandas.

    The ibis result is executed with the frame passed via
    ``external_tables``; the reference result is a pandas ``merge`` on ``b``.
    Both are sorted by ``id`` before comparison.
    """
    external_df = pd.DataFrame(
        [('alpha', 1, 'first'), ('beta', 2, 'second'), ('gamma', 3, 'third')],
        columns=['a', 'b', 'c'],
    )
    external_df['b'] = external_df['b'].astype('int8')

    external_table = ibis.table(
        [('a', 'string'), ('b', 'int64'), ('c', 'string')], name='external'
    )

    # Expose tinyint_col under the join key name ``b``.
    with_b = alltypes.mutate(b=alltypes.tinyint_col)
    joined = with_b.inner_join(external_table, ['b'])
    expr = joined[external_table.a, external_table.c, with_b.id]

    def by_id(frame):
        return frame.sort_values('id').reset_index(drop=True)

    result = by_id(expr.execute(external_tables={'external': external_df}))
    expected = by_id(
        df.assign(b=df.tinyint_col).merge(external_df, on='b')[
            ['a', 'c', 'id']
        ]
    )

    tm.assert_frame_equal(result, expected, check_column_type=False)
def test_nullable_column_propagated():
    """Column nullability is kept, and a sum is nullable if any operand is."""
    t = ibis.table(
        [
            ('a', dt.Int32(nullable=True)),
            ('b', dt.Int32(nullable=False)),
            ('c', dt.String(nullable=False)),
            ('d', dt.double),  # nullable by default
            ('f', dt.Double(nullable=False)),
        ]
    )

    # Per-column nullability as declared in the schema.
    assert t.a.type().nullable is True
    assert t.b.type().nullable is False
    assert t.c.type().nullable is False
    assert t.d.type().nullable is True
    assert t.f.type().nullable is False

    # nullable + nullable
    both_nullable = t.a + t.d
    assert both_nullable.type().nullable is True

    # non-nullable + nullable
    one_nullable = t.b + t.d
    assert one_nullable.type().nullable is True

    # non-nullable + non-nullable
    none_nullable = t.b + t.f
    assert none_nullable.type().nullable is False
def test_union(table):
    """Union honours ``distinct`` and rejects tables with differing schemas."""
    two_cols = [('key', 'string'), ('value', 'double')]
    three_cols = [('key', 'string'), ('key2', 'string'), ('value', 'double')]

    foo = ibis.table(two_cols, 'foo')
    bar = ibis.table(two_cols, 'bar')
    baz = ibis.table(three_cols, 'baz')

    # Default: non-distinct union.
    plain_op = foo.union(bar).op()
    assert isinstance(plain_op, ops.Union)
    assert not plain_op.distinct

    # Explicitly distinct union.
    distinct_op = foo.union(bar, distinct=True).op()
    assert isinstance(distinct_op, ops.Union)
    assert distinct_op.distinct

    # Mismatched schemas are an error.
    with pytest.raises(RelationError):
        foo.union(baz)
def region():
    """Return an unbound ``tpch_region`` table with its three columns."""
    columns = [
        ('r_regionkey', 'int16'),
        ('r_name', 'string'),
        ('r_comment', 'string'),
    ]
    return ibis.table(columns, name='tpch_region')
def test_compile_toplevel():
    """``ibis.postgres.compile`` renders a scalar ``sum`` as a SELECT."""
    source = ibis.table([('foo', 'double')], name='t0')

    # it works!
    compiled = ibis.postgres.compile(source.foo.sum())

    expected = "SELECT sum(t0.foo) AS sum \nFROM t0 AS t0"  # noqa
    assert str(compiled) == expected
def test_topk_function_late_bind(self):
    """``topk(by=callable)`` aggregates like ``topk(by=expression)``."""
    # GH #520
    airlines = ibis.table(
        [('dest', 'string'), ('origin', 'string'), ('arrdelay', 'int32')],
        'airlines',
    )

    late_bound = airlines.dest.topk(5, by=lambda tbl: tbl.arrdelay.mean())
    eager = airlines.dest.topk(5, by=airlines.arrdelay.mean())

    assert_equal(late_bound.to_aggregation(), eager.to_aggregation())
def setUp(self):
    """Set up the all-types table, column-name groups and mock connection."""
    self.schema = _all_types_schema
    self.schema_dict = dict(self.schema)
    self.table = ibis.table(self.schema, 'schema')

    # Column names grouped by kind, for use by the individual tests.
    self.int_cols = ['a', 'b', 'c', 'd']
    self.bool_cols = ['h']
    self.float_cols = ['e', 'f']

    self.con = MockConnection()
def test_distinct_unnamed_array_expr():
    """Smoke test: ``repr`` of ``distinct()`` over an unnamed string join."""
    foo = ibis.table(
        [("year", "int32"), ("month", "int32"), ("day", "int32")], "foo"
    )

    # it works!
    date_parts = [
        foo.year.cast("string"),
        foo.month.cast("string"),
        foo.day.cast("string"),
    ]
    dashed = ibis.literal("-").join(date_parts)
    repr(dashed.distinct())
def test_between():
    """Both bounds of a ``between`` appear, labelled, in the graph source."""
    t = ibis.table([('a', 'int64'), ('b', 'string'), ('c', 'int32')])
    expr = t.a.between(1, 1)

    # The op's arguments after the first are the two bounds.
    lower, upper = expr.op().args[1:]

    rendered = viz.to_graph(expr).source

    # one for the node itself and one for the edge to between
    assert key(lower, 'lower_bound') in rendered
    assert key(upper, 'upper_bound') in rendered
def test_group_by_key_function():
    """A keyword callable passed to ``groupby`` names the grouping column."""
    t = ibis.table([('a', 'timestamp'), ('b', 'string'), ('c', 'double')])

    grouped = t.groupby(new_key=lambda tbl: tbl.b.length())
    aggregated = grouped.aggregate(foo=t.c.mean())

    assert aggregated.columns == ['new_key', 'foo']
def test_argument_repr_shows_name():
    """The repr of a column expression includes column and table names."""
    named = ibis.table([('fakecolname1', 'int64')], name='fakename2')
    rendered = repr(named.fakecolname1.nullif(2))

    assert 'fakecolname1' in rendered
    assert 'fakename2' in rendered
def t1():
    """Return the unbound ``foo`` table (two string keys and a double)."""
    columns = [
        ('key1', 'string'),
        ('key2', 'string'),
        ('value1', 'double'),
    ]
    return ibis.table(columns, 'foo')
def test_asof_join():
    """The first predicate of an as-of join compares ``time`` with ``time``."""
    lhs = ibis.table([('time', 'int32'), ('value', 'double')])
    rhs = ibis.table([('time', 'int32'), ('value2', 'double')])

    asof = api.asof_join(lhs, rhs, 'time')
    first_predicate = asof.op().predicates[0].op()

    assert (
        first_predicate.left.op().name
        == first_predicate.right.op().name
        == 'time'
    )
def t2():
    """Return the unbound ``bar`` table (two string key columns)."""
    columns = [('key1', 'string'), ('key2', 'string')]
    return ibis.table(columns, 'bar')
def table(self, name: str, schema: Any = None):
    """Return an unbound table built from the stored schema for ``name``.

    The ``schema`` argument is accepted but not used by this method; the
    column definitions always come from ``self._schemas[name]``.
    """
    stored_schema = self._schemas[name]
    return ibis.table(stored_schema, name=name)
def test_table_operations_with_integer_column(position, names, expr_func):
    """``expr_func`` gives equal expressions for positions and for names."""
    t = ibis.table([('foo', 'string'), ('bar', 'double')])

    by_position = expr_func(t, position)
    by_name = expr_func(t, names)

    assert by_position.equals(by_name)
def test_null_column_union():
    """An untyped NA column blocks a union; a cast NA column allows it."""
    wide = ibis.table([('a', 'string'), ('b', 'double')])
    narrow = ibis.table([('a', 'string')])

    with pytest.raises(ibis.common.exceptions.RelationError):
        wide.union(narrow.mutate(b=ibis.NA))  # needs a type

    typed_union = wide.union(narrow.mutate(b=ibis.NA.cast('double')))
    assert typed_union.schema() == wide.schema()
def test_not_without_boolean(typ):
    """Applying ``~`` to a column of type ``typ`` raises ``TypeError``."""
    column = ibis.table([('a', typ)], name='t').a

    with pytest.raises(TypeError):
        ~column
def test_null_column():
    """Mutating with ``ibis.NA`` yields a ``NullColumn`` of type ``dt.null``."""
    base = ibis.table([('a', 'string')], name='t')
    null_col = base.mutate(b=ibis.NA).b

    assert null_col.type() == dt.null
    assert isinstance(null_col, ir.NullColumn)
def test_repr_list_of_lists_in_table():
    """Smoke test: ``repr`` of a projection with a nested-list literal."""
    base = ibis.table([('a', 'int64')], name='t')
    nested = ibis.literal([[1]])

    projection = base[base, nested.name('array_of_array')]
    repr(projection)
def test_column_isin_map_keys():
    """``isin`` against the keys of a map literal gives a boolean column."""
    base = ibis.table([('a', 'string')], name='t')
    lookup = ibis.literal({'a': 1, 'b': 2})

    membership = base.a.isin(lookup.keys())

    assert isinstance(membership, ir.BooleanColumn)
def test_html_escape(with_graphviz):
    """A column name containing ``<``, ``&`` and ``>`` still renders to PNG."""
    # Check that we correctly escape HTML <> characters in the graphviz
    # representation. If an error is thrown, _repr_png_ returns None.
    array_of_strings = ibis.expr.datatypes.Array('string')
    awkward = ibis.table([('<a & b>', array_of_strings)])

    assert awkward._repr_png_() is not None
def test_decimal_modulo_output_type(value, type, expected_type_class):
    """``column % value`` has a result type of ``expected_type_class``."""
    column = ibis.table([('a', type)]).a
    remainder = column % value

    assert isinstance(remainder.type(), expected_type_class)
def test_sort_by():
    """Graph a group-by aggregate sorted by its grouping column ``b``.

    Passes when ``key(expr)`` occurs in the rendered graph source.
    """
    t = ibis.table([('a', 'int64'), ('b', 'string'), ('c', 'int32')])

    aggregated = t.groupby(t.b).aggregate(sum_a=t.a.sum().cast('double'))
    ordered = aggregated.sort_by('b')

    dot = viz.to_graph(ordered)
    assert key(ordered) in dot.source
def test_join_table_choice():
    """``L.sub_for`` leaves a predicate on an aggregate equal to itself."""
    # GH807
    x = ibis.table(ibis.schema([('n', 'int64')]), 'x')
    counted = x.aggregate(cnt=x.n.count())
    has_rows = counted.cnt > 0

    substitutions = [(counted, counted.op().table)]
    assert L.sub_for(has_rows, substitutions).equals(has_rows)
def test_missing_data_sources():
    """Executing an expression on an unbound table raises in pandas."""
    unbound = ibis.table([('a', 'string')])
    length = unbound.a.length()

    with pytest.raises(com.UnboundExpressionError):
        ibis.pandas.execute(length)
def test_nullable_non_nullable_field():
    """``nullable`` returns an empty tuple for a non-nullable column type."""
    column_type = ibis.table([('a', dt.String(nullable=False))]).a.type()

    assert nullable(column_type) == ()
def test_pickle_table_expr():
    """A table expression survives a pickle round trip unchanged."""
    columns = [('time', 'timestamp'), ('key', 'string'), ('value', 'double')]
    original = ibis.table(columns, name='t0')

    restored = pickle.loads(pickle.dumps(original))

    assert restored.equals(original)
def draw(graph, path=None, format='png'):
    """Render ``graph`` to a file and return the file's path.

    Parameters
    ----------
    graph
        Object exposing ``pipe(format=...)`` that returns the rendered bytes.
    path
        Destination file.  When ``None``, the bytes go to a new named
        temporary file (created with ``delete=False``) whose suffix is
        ``'.' + format``.
    format
        Output format passed through to ``graph.pipe``.

    Returns
    -------
    The path that was written: ``path`` itself, or the temporary file's name.
    """
    rendered = graph.pipe(format=format)

    if path is not None:
        with open(path, mode='wb') as out:
            out.write(rendered)
        return path

    suffix = '.{}'.format(format)
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=suffix, mode='wb'
    ) as out:
        out.write(rendered)
    return out.name


if __name__ == '__main__':
    # Demo: build a filtered, aggregated join and visualize it.
    t = ibis.table(
        [('a', 'int64'), ('b', 'double'), ('c', 'string')], name='t'
    )
    left = ibis.table([('a', 'int64'), ('b', 'string')])
    right = ibis.table([('b', 'string'), ('c', 'int64'), ('d', 'string')])

    joined = left.inner_join(right, left.b == right.b)
    df = joined[left.a, right.c.name('b'), right.d.name('c')]

    a = df.a
    b = df.b
    filt = df[(a + b * 2 * b / b**3 > 4) & (b > 5)]

    expr = filt.groupby(filt.c).aggregate(
        amean=filt.a.mean(), bsum=filt.b.sum()
    )
    expr.visualize()
def test_compile_with_unnamed_table():
    """Compiling a column of an unnamed table selects from alias ``t0``.

    The reference statement is built from the table's generated name
    (``t.op().name``) aliased as ``t0``.
    """
    unnamed = ibis.table([('a', 'string')])
    compiled = ibis.sqlite.compile(unnamed.a)

    generated_name = unnamed.op().name
    aliased = sa.table(generated_name, sa.column('a', sa.String)).alias('t0')
    reference = sa.select([aliased.c.a])

    assert str(compiled) == str(reference)
# Excerpt boundary: the leading statements are the tail of a test whose `def`
# is not on this line, and `test_large_timestamp` at the end is cut off after
# its `expected = ...` assignment.
# `test_interval_negate` in the middle is complete: for each parametrized
# `base_expr` it checks that unary `-`, `.negate()` and `ibis.negate()` give
# equal expressions and that the resulting op is an `ops.Negate`.
assert t.f.type().nullable is False s = t.a + t.d assert s.type().nullable is True s = t.b + t.d assert s.type().nullable is True s = t.b + t.f assert s.type().nullable is False @pytest.mark.parametrize( 'base_expr', [ ibis.table([('interval_col', dt.Interval(unit='D'))]).interval_col, ibis.interval(seconds=42), ], ) def test_interval_negate(base_expr): expr = -base_expr expr2 = base_expr.negate() expr3 = ibis.negate(base_expr) assert isinstance(expr.op(), ops.Negate) assert expr.equals(expr2) assert expr.equals(expr3) def test_large_timestamp(): expr = ibis.timestamp('4567-02-03') expected = datetime(year=4567, month=2, day=3)
def test_nullable():
    """``nullable`` returns ``(NoneType,)`` for a default ``int64`` column."""
    column_type = ibis.table([('a', 'int64')]).a.type()

    assert nullable(column_type) == (type(None), )
def test_struct_field_dir():
    """``dir`` lists struct columns on tables and fields on struct columns."""
    with_struct = ibis.table([('struct_col', 'struct<my_field: string>')])

    assert 'struct_col' in dir(with_struct)
    assert 'my_field' in dir(with_struct.struct_col)
def test_unbound_table_name():
    """An unnamed table gets a name of the form ``unbound_table_<digits>``."""
    generated = ibis.table([('a', 'timestamp')]).op().name

    assert re.match(r'^unbound_table_\d+$', generated) is not None
def test_identifier_quoting(self):
    """Columns named ``date`` and ``table`` translate to backticked names."""
    columns = [('date', 'double'), ('table', 'string')]
    reserved = ibis.table(columns)

    self._translate(reserved['date'], '`date`')
    self._translate(reserved['table'], '`table`')
def test_column_ref_quoting(self):
    """A column name containing spaces translates to a backticked name."""
    spaced = ibis.table([('has a space', 'double')])

    self._translate(spaced['has a space'], '`has a space`')
# Module preamble (imports and a shared three-column `table`) followed by
# `test_valid_datatype`, which checks that `rlz.datatype` returns the expected
# dtype for a dtype object and for the type strings in its parametrization.
# NOTE(review): the trailing `@pytest.mark.parametrize(...)` is cut off on this
# line; the function it decorates is not visible here.
import six import enum import ibis import pytest from toolz import identity from ibis.common import IbisTypeError import ibis.expr.types as ir import ibis.expr.rules as rlz import ibis.expr.datatypes as dt table = ibis.table([ ('int_col', 'int64'), ('string_col', 'string'), ('double_col', 'double'), ]) @pytest.mark.parametrize(('value', 'expected'), [ (dt.int32, dt.int32), ('int64', dt.int64), ('array<string>', dt.Array(dt.string)), ]) def test_valid_datatype(value, expected): assert rlz.datatype(value) == expected @pytest.mark.parametrize(('value', 'expected'), [('exception', IbisTypeError), ('array<cat>', IbisTypeError), (int, IbisTypeError),
def test_map_getitem_broadcast():
    """Indexing a map literal by a column yields an integer column."""
    base = ibis.table([('a', 'string')], name='t')
    mapping = ibis.literal({'a': 1, 'b': 2})

    looked_up = mapping[base.a]

    assert isinstance(looked_up, ir.IntegerColumn)
def table(schema):
    """Return an unbound table named ``schema`` built from ``schema``."""
    unbound = ibis.table(schema, name='schema')
    return unbound