def test_subterms():
    """``_subterms`` yields an expression together with each of its children."""
    a = Symbol('a', 'var * {x: int, y: int, z: int}')

    # A bare symbol has only itself as a subterm.
    assert list(a._subterms()) == [a]

    # A field access contributes the field plus its parent symbol.
    assert set(a['x']._subterms()) == {a, a['x']}

    # A mapped field adds one more layer on top of the field and symbol.
    mapped = a['x'].map(inc, 'int')
    assert set(mapped._subterms()) == {a, a['x'], a['x'].map(inc, 'int')}

    # The root symbol is reachable through an arithmetic expression too.
    shifted = a['x'] + 1
    assert a in set(shifted._subterms())
def test_relabel_join():
    """Joining two relabelled copies of a table exposes both new labels."""
    names = Symbol('names', 'var * {first: string, last: string}')

    lhs = names.relabel({'last': 'left'})
    rhs = names.relabel({'last': 'right'})
    siblings = join(lhs, rhs, 'first')

    assert siblings.fields == ['first', 'left', 'right']
def test_outer_join():
    """Check inner, left, right and outer joins computed on lists of tuples."""
    left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]

    L = Symbol('L', 'var * {id: int, name: string, amount: real}')
    R = Symbol('R', 'var * {city: string, id: int}')

    # Rows expected from every join flavour below.
    matched = [(1, 'Alice', 100, 'NYC'),
               (1, 'Alice', 100, 'Boston'),
               (4, 'Dennis', 400, 'Moscow')]
    # Rows that only appear when the unmatched side is kept.
    left_only = (2, 'Bob', 200, None)
    right_only = (3, None, None, 'LA')

    def rows(expr):
        # Order is irrelevant for these checks, so compare as sets.
        return set(compute(expr, {L: left, R: right}))

    assert rows(join(L, R)) == set(matched)
    assert rows(join(L, R, how='left')) == set(matched + [left_only])
    assert rows(join(L, R, how='right')) == set(matched + [right_only])
    assert rows(join(L, R, how='outer')) == set(
        matched + [left_only, right_only])
def test_by_columns():
    """``by`` results have one field per grouper column plus the reduction."""
    t = Symbol('t', 'var * {name: string, amount: int32, id: int32}')

    totals = by(t['id'], total=t['amount'].sum())
    assert len(totals.fields) == 2

    counts = by(t['id'], count=t['id'].count())
    assert len(counts.fields) == 2

    # Grouping by the whole table: three grouper columns plus the count.
    whole = by(t, count=t.count())
    print(whole.fields)
    assert len(whole.fields) == 4
def test_multi_column_join():
    """Join two DataFrames on the column pair ``['x', 'y']``."""
    left = DataFrame([(1, 2, 3), (2, 3, 4), (1, 3, 5)],
                     columns=['x', 'y', 'z'])
    right = DataFrame([(1, 2, 30), (1, 3, 50), (1, 3, 150)],
                      columns=['x', 'y', 'w'])

    L = Symbol('L', 'var * {x: int, y: int, z: int}')
    R = Symbol('R', 'var * {x: int, y: int, w: int}')
    j = join(L, R, ['x', 'y'])

    expected = DataFrame([(1, 2, 3, 30), (1, 3, 5, 50), (1, 3, 5, 150)],
                         columns=['x', 'y', 'z', 'w'])

    result = compute(j, {L: left, R: right})
    print(result)

    # Frames are compared through their printed form.
    assert str(result) == str(expected)
    assert list(result.columns) == list(j.fields)
def test_join_by_arcs():
    """Join arcs to node names, then count arcs per name; compare to pandas."""
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = Symbol('t_idx', 'var * {name: string, node_id: int32}')
    t_arc = Symbol('t_arc', 'var * {node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")
    want = by(joined['name'], joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    # Reference answer built directly with pandas.
    merged = pd.merge(df_arc, df_idx, on='node_id')
    per_name = merged.groupby('name')['node_id'].count()
    expected = per_name.reset_index()

    assert str(result.values) == str(expected.values)
    assert list(result.columns) == ['name', 'node_id_count']
def test_distinct_name():
    """A field called ``name`` is reachable by attribute, also after distinct."""
    t = Symbol('t', 'var * {id: int32, name: string}')

    # Attribute access and item access give the same field expression.
    assert t.name.isidentical(t['name'])

    by_attribute = t.distinct().name
    by_item = t.distinct()['name']
    assert by_attribute.isidentical(by_item)

    # ``_name`` reports the underlying column label.
    assert t.id.distinct()._name == 'id'
    assert t.name._name == 'name'
def test_traverse():
    """``_traverse`` visits the expression itself and its inner nodes."""
    t = Symbol('t', 'var * {name: string, amount: int}')
    assert t in list(t._traverse())

    total = t.amount.sum()
    visited = list(total._traverse())
    assert builtins.any(t.amount.isidentical(node) for node in visited)
def test_keepdims():
    """With ``keepdims=True`` each reduced axis stays with length one."""
    x = Symbol('x', '5 * 3 * float32')

    cases = [
        (0, '1 * 3 * float32'),
        (1, '5 * 1 * float32'),
        ((0, 1), '1 * 1 * float32'),
    ]
    for axis, expected in cases:
        assert x.sum(axis=axis, keepdims=True).dshape == dshape(expected)

    assert x.std(axis=0, keepdims=True).shape == (1, 3)
def test_symbol_name():
    """``.name`` raises AttributeError on symbols without a ``name`` field."""
    t = Symbol('t', '10 * {people: string, amount: int}')
    r = Symbol('r', 'var * int64')

    for sym in (t, r):
        with pytest.raises(AttributeError):
            sym.name
def test_Distinct():
    """``distinct`` on a NumPy record array agrees with ``np.unique``."""
    records = [('Alice', 100), ('Alice', -200), ('Bob', 100), ('Bob', 100)]
    x = np.array(records, dtype=[('name', 'S5'), ('amount', 'i8')])

    t = Symbol('t', 'var * {name: string, amount: int64}')

    unique_names = compute(t['name'].distinct(), x)
    assert eq(unique_names, np.unique(x['name']))

    unique_rows = compute(t.distinct(), x)
    assert eq(unique_rows, np.unique(x))
def test_nelements_array():
    """``nelements`` along given axes of a 5 x 4 x 3 array."""
    t = Symbol('t', '5 * 4 * 3 * float64')
    x = np.random.randn(*t.shape)

    # Reducing axes 0 and 1 counts 5 * 4 entries for each of the 3 remaining.
    over_two_axes = compute(t.nelements(axis=(0, 1)), x)
    np.testing.assert_array_equal(over_two_axes, np.array([20, 20, 20]))

    # Reducing axis 1 alone counts 4 entries per (5, 3) position.
    over_one_axis = compute(t.nelements(axis=1), x)
    np.testing.assert_array_equal(over_one_axis, 4 * np.ones((5, 3)))
def test_symbol_projection_failures():
    """Invalid projections and unknown attributes raise specific errors."""
    table = Symbol('t', '10 * {name: string, amount: int}')

    # 'id' is not one of the declared fields.
    with pytest.raises(ValueError):
        table._project(['name', 'id'])

    # Unknown attribute on the symbol.
    with pytest.raises(AttributeError):
        table.foo

    # A dshape object is not an acceptable projection argument.
    with pytest.raises(TypeError):
        table._project(table.dshape)
def test_Distinct():
    """``distinct`` keeps the dshape and name of the expression it wraps."""
    t = Symbol('t', 'var * {name: string, amount: int32}')

    unique_names = distinct(t['name'])
    print(unique_names.dshape)
    assert unique_names.dshape == dshape('var * string')
    assert unique_names._name == 'name'

    unique_rows = t.distinct()
    assert unique_rows.dshape == t.dshape
def test_sort():
    """``sort`` round-trips through ``str``/``eval`` and keeps the schema."""
    # The local must stay named ``t``: ``eval`` below resolves names from this
    # scope. (presumably the string form refers to the symbol as ``t`` -- confirm)
    t = Symbol('t', 'var * {name: string, amount: int32, id: int32}')

    ordered = t.sort('amount', ascending=True)
    text = str(ordered)
    print(text)

    rebuilt = eval(text)
    assert rebuilt.isidentical(ordered)
    assert ordered.schema == t.schema

    # Sorting a single column with no explicit key uses that column's name.
    assert t['amount'].sort().key == 'amount'
def test_numbers():
    """Arithmetic and math functions on these scalar symbols have dshape ``real``."""
    # Keep the local names ``x`` and ``y``: ``eval(str(expr))`` below resolves
    # names from this function's scope.
    x = Symbol('x', 'real')
    y = Symbol('x', 'int')

    exprs = [
        x + 1, x - 1, x * 1,
        x + y, x - y, x / y,
        x * y + x + y,
        x**y, x**2, 2**x,
        x % 5, -x,
        sin(x), cos(x ** 2), exp(log(y)),
    ]

    for expr in exprs:
        assert expr.dshape == dshape('real')
        # NOTE(review): ``==`` between expressions may build a relational
        # expression instead of returning a bool, in which case this only
        # checks that the string form evaluates -- confirm.
        assert eval(str(expr)) == expr

    # Negating an integer symbol stays integer.
    assert (-y).dshape == dshape('int')
def test_like():
    """``like`` filters rows by glob-style patterns on named columns."""
    t = Symbol('t', 'var * {name: string, city: string}')
    data = [('Alice Smith', 'New York'),
            ('Bob Smith', 'Chicago'),
            ('Alice Walker', 'LA')]
    alice_smith, bob_smith, alice_walker = data

    def matches(**patterns):
        return list(compute(t.like(**patterns), data))

    assert matches(name='Alice*') == [alice_smith, alice_walker]
    # No leading wildcard, so nothing starts with 'lice'.
    assert matches(name='lice*') == []
    assert matches(name='*Smith*') == [alice_smith, bob_smith]
    # Several keyword patterns must all match.
    assert matches(name='*Smith*', city='New York') == [alice_smith]
def test_multi_column_join():
    """A join on two columns records the keys and orders the result fields."""
    a = Symbol('a', 'var * {x: int, y: int, z: int}')
    b = Symbol('b', 'var * {w: int, x: int, y: int}')

    j = join(a, b, ['x', 'y'])

    assert set(j.fields) == {'w', 'x', 'y', 'z'}

    # Both sides are joined on the same pair of columns.
    assert j.on_left == j.on_right
    assert j.on_right == ['x', 'y']

    # The join expression must be hashable.
    assert hash(j)

    assert j.fields == ['x', 'y', 'z', 'w']
def test_length():
    """``len`` works for fixed-length expressions and raises for ``var`` ones."""
    t = Symbol('t', '10 * {name: string, amount: int}')
    s = Symbol('s', 'var * {name:string, amount:int}')

    assert t.dshape == dshape('10 * {name: string, amount: int}')

    # These expressions all keep the ten rows of ``t``.
    for expr in (t, t.name, t[['name']], t.sort('name')):
        assert len(expr) == 10

    # ``head`` gives five rows here, and ten when asked for fifty.
    assert len(t.head(5)) == 5
    assert len(t.head(50)) == 10

    # A variable-length symbol has no known length.
    with pytest.raises(ValueError):
        len(s)
def test_serializable():
    """Symbols and compound expressions survive a pickle round trip."""
    import pickle

    def roundtrip(obj):
        return pickle.loads(pickle.dumps(obj))

    t = Symbol('t', 'var * {id: int, name: string, amount: int}')
    assert t.isidentical(roundtrip(t))

    s = Symbol('t', 'var * {id: int, city: string}')
    negative = t[t.amount < 0]
    expr = join(negative, s).sort('id').city.head()
    assert expr.isidentical(roundtrip(expr))
def test_relabel_join():
    """Self-join on last name pairs up first names that share a surname."""
    names = Symbol('names', 'var * {first: string, last: string}')

    lhs = names.relabel({'first': 'left'})
    rhs = names.relabel({'first': 'right'})
    siblings = join(lhs, rhs, 'last')[['left', 'right']]

    data = [('Alice', 'Smith'), ('Bob', 'Jones'), ('Charlie', 'Smith')]

    pairs = set(compute(siblings, {names: data}))
    print(pairs)

    # Alice and Charlie are both Smiths; Bob is a Jones.
    assert ('Alice', 'Charlie') in pairs
    assert ('Alice', 'Bob') not in pairs
def test_leaves():
    """``_leaves`` lists the root symbols an expression is built from."""
    t = Symbol('t', 'var * {id: int32, name: string}')
    v = Symbol('v', 'var * {id: int32, city: string}')
    x = symbol('x', 'int32')

    # Expressions built from a single table.
    assert t._leaves() == [t]
    assert t.id._leaves() == [t]
    grouped = by(t.name, count=t.id.nunique())
    assert grouped._leaves() == [t]

    # Joins report both tables in argument order.
    assert join(t, v)._leaves() == [t, v]
    assert join(v, t)._leaves() == [v, t]

    # Scalar arithmetic.
    assert (x + 1)._leaves() == [x]
def test_relabel():
    """``relabel`` renames fields and round-trips through ``str``/``eval``."""
    # The local must stay named ``t``: ``eval`` below resolves names from this
    # scope. (presumably the string form refers to the symbol as ``t`` -- confirm)
    t = Symbol('t', 'var * {name: string, amount: int32, id: int32}')

    rl = t.relabel({'name': 'NAME', 'id': 'ID'})
    rlc = t['amount'].relabel({'amount': 'BALANCE'})

    rebuilt = eval(str(rl))
    assert rebuilt.isidentical(rl)

    print(rl.fields)
    assert rl.fields == ['NAME', 'amount', 'ID']

    # Relabelling a table keeps a record measure; a column stays scalar.
    assert not isscalar(rl.dshape.measure)
    assert isscalar(rlc.dshape.measure)
def test_multi_column_join():
    """Join two lists of tuples on the column pair ``['x', 'y']``."""
    left = [(1, 2, 3), (2, 3, 4), (1, 3, 5)]
    right = [(1, 2, 30), (1, 3, 50), (1, 3, 150)]

    L = Symbol('L', 'var * {x: int, y: int, z: int}')
    R = Symbol('R', 'var * {x: int, y: int, w: int}')
    j = join(L, R, ['x', 'y'])

    rows = list(compute(j, {L: left, R: right}))
    print(rows)

    expected = [(1, 2, 3, 30), (1, 3, 5, 50), (1, 3, 5, 150)]
    assert rows == expected
def test_outer_join():
    """Check inner, left, right and outer joins computed on pandas DataFrames."""
    left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    L = Symbol('L', 'var * {id: int, name: string, amount: real}')
    R = Symbol('R', 'var * {city: string, id: int}')

    def convert(df):
        """Return the rows of ``df`` as a set of tuples (order-insensitive)."""
        return set(df.to_records(index=False).tolist())

    def sorted_records(df):
        """Return the string form of ``df``'s records ordered by ``id``."""
        # DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
        # prefer sort_values and fall back to sort only on old pandas.
        if hasattr(df, 'sort_values'):
            ordered = df.sort_values('id')
        else:
            ordered = df.sort('id')
        return str(ordered.to_records(index=False))

    assert convert(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert convert(compute(join(L, R, how='left'),
                           {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, np.nan),
         (4, 'Dennis', 400, 'Moscow')])

    # Right and outer joins are compared through their sorted record strings.
    df = compute(join(L, R, how='right'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])
    assert sorted_records(df) == sorted_records(expected)

    df = compute(join(L, R, how='outer'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (2., 'Bob', 200., np.nan),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])
    assert sorted_records(df) == sorted_records(expected)
def test_union():
    """``union`` of three tables concatenates their rows in argument order."""
    first = [['Alice', 100, 1], ['Bob', 200, 2], ['Alice', 50, 3]]
    second = [['Alice', 100, 4], ['Bob', 200, 5], ['Alice', 50, 6]]
    third = [['Alice', 100, 7], ['Bob', 200, 8], ['Alice', 50, 9]]

    t1 = Symbol('t1', 'var * {name: string, amount: int, id: int}')
    t2 = Symbol('t2', 'var * {name: string, amount: int, id: int}')
    t3 = Symbol('t3', 'var * {name: string, amount: int, id: int}')

    expr = union(t1, t2, t3)
    namespace = {t1: first, t2: second, t3: third}
    result = list(compute(expr, namespace))

    assert result == first + second + third
def test_merge():
    """``merge`` combines a projection with a labelled column; bad input raises."""
    t = Symbol('t', 'int64')
    p = Symbol('p', 'var * {amount:int}')
    accounts = Symbol('accounts',
                      'var * {name: string, balance: int32, id: int32}')

    scaled = accounts.balance * 1.5
    new_amount = scaled.label('new')
    c = merge(accounts[['name', 'balance']], new_amount)

    assert c.fields == ['name', 'balance', 'new']
    assert c.schema == dshape('{name: string, balance: int32, new: float64}')

    # Merging a scalar symbol with itself or with a table is rejected.
    with pytest.raises(ValueError):
        merge(t, t)

    with pytest.raises(ValueError):
        merge(t, p)
def test_by():
    """The schema of a ``by`` result is a record containing the grouper."""
    t = Symbol('t', 'var * {name: string, amount: int32, id: int32}')

    # ``sum`` is whatever the module has in scope under that name.
    grouped = by(t['name'], total=sum(t['amount']))
    print(grouped.schema)

    measure = grouped.schema[0]
    assert isinstance(measure, Record)
    assert str(grouped.schema[0]['name']) == 'string'
def test_relational():
    """Comparing a column with a value gives a named boolean expression."""
    t = Symbol('t', 'var * {name: string, amount: int, id: int}')

    is_alice = t['name'] == 'Alice'

    assert 'bool' in str(is_alice.dshape)
    assert is_alice._name
def test_by_multi_column_grouper():
    """Grouping by two columns counts rows per distinct ``(x, y)`` pair."""
    t = Symbol('t', 'var * {x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], t['z'].count())

    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    groups = set(compute(expr, data))
    print(groups)

    # (1, 2) occurs twice in ``data`` and (1, 1) once.
    assert groups == {(1, 2, 2), (1, 1, 1)}
def test_selection_by_getattr():
    """Selecting with an attribute-based predicate keeps the table schema."""
    t = Symbol('t', 'var * {name: string, amount: int, id: int}')

    predicate = t.name == 'Alice'
    result = t[predicate]

    assert t.schema == result.schema
    # The compared value shows up in the expression's string form.
    assert 'Alice' in str(result)
def test_datetime_comparison():
    """A date column can be compared against an ISO date string."""
    data = [['Alice', date(2000, 1, 1)],
            ['Bob', date(2000, 2, 2)],
            ['Alice', date(2000, 3, 3)]]

    t = Symbol('t', 'var * {name: string, when: date}')

    later = t[t.when > '2000-01-01']
    rows = list(compute(later, data))

    # Only the first row falls on the cut-off date itself.
    assert rows == data[1:]
def test_dir():
    """``dir`` only advertises methods that suit the expression's type."""
    t = Symbol('t', 'var * {name: string, amount: int, dt: datetime}')

    # Datetime column: has ``day`` but not ``mean``.
    datetime_attrs = dir(t.dt)
    assert 'day' in datetime_attrs
    assert 'mean' not in datetime_attrs

    # Integer column.
    assert 'mean' in dir(t.amount)

    # Projection without any string column.
    assert 'like' not in dir(t[['amount', 'dt']])

    # String column.
    assert 'any' not in dir(t.name)
def test_common_subexpression():
    """``common_subexpression`` finds the node shared by its arguments."""
    a = Symbol('a', 'var * {x: int, y: int, z: int}')

    # Cases whose shared node is the root symbol.
    assert common_subexpression(a).isidentical(a)
    assert common_subexpression(a, a['x']).isidentical(a)
    assert common_subexpression(a['y'] + 1, a['x']).isidentical(a)

    # A mapped column and the column itself share that column.
    mapped = a['x'].map(inc, 'int')
    shared = common_subexpression(mapped, a['x'])
    assert shared.isidentical(a['x'])
def test_schema_of_complex_interaction():
    """Dividing integer columns gives ``float64``; a label does not change it."""
    a = Symbol('a', 'var * {x: int, y: int, z: int}')

    ratio = (a['x'] + a['y']) / a['z']
    assert ratio.schema == dshape('float64')

    labelled = ratio.label('foo')
    assert labelled.schema == dshape('float64')
def test_reduction():
    """Reductions report dshapes that fit the reduced column."""
    t = Symbol('t', 'var * {name: string, amount: int32}')

    # ``sum`` is whatever the module has in scope under that name.
    r = sum(t['amount'])
    accepted = (dshape('int64'),
                dshape('{amount: int64}'),
                dshape('{amount_sum: int64}'))
    assert r.dshape in accepted

    # Counting gives an integer that names no column.
    assert 'amount' not in str(t.count().dshape)
    assert t.count().dshape[0] in (int32, int64)
    assert 'int' in str(t.count().dshape)
    assert 'int' in str(t.nunique().dshape)

    # min / max of a string column stay strings.
    assert 'string' in str(t['name'].max().dshape)
    assert 'string' in str(t['name'].min().dshape)
    assert 'string' not in str(t.count().dshape)

    t = Symbol('t', 'var * {name: string, amount: real, id: int}')

    # Summing keeps int for the int column and not for the real one.
    assert 'int' in str(t['id'].sum().dshape)
    assert 'int' not in str(t['amount'].sum().dshape)
import numpy as np
from pandas import DataFrame
# NOTE(review): repeats the numpy import above.
import numpy as np
import bcolz
from datashape.predicates import isscalar, iscollection, isrecord
from blaze.expr import Symbol, by
from blaze.api import Data, into
from blaze.compute import compute
from blaze.expr.functions import sin, exp
from blaze.sql import SQL

# Module-level fixtures: the same five account rows held in several containers.

# NOTE(review): ``sources`` starts empty and nothing visible here fills it;
# presumably later code appends the containers built below -- confirm.
sources = []

# Symbolic table whose fields match the columns of ``L``.
t = Symbol('t', 'var * {amount: int64, id: int64, name: string}')

# Raw rows as [amount, id, name].
L = [[100, 1, 'Alice'],
     [200, 2, 'Bob'],
     [300, 3, 'Charlie'],
     [400, 4, 'Dan'],
     [500, 5, 'Edith']]

# The rows as a pandas DataFrame.
df = DataFrame(L, columns=['amount', 'id', 'name'])

# ``into`` is called with a target type and ``df``; presumably it converts the
# frame into that container type -- confirm.
x = into(np.ndarray, df)
bc = into(bcolz.ctable, df)

# 'accounts' table on an in-memory SQLite URI, using the schema of ``t`` and
# extended with the same rows.
sql = SQL('sqlite:///:memory:', 'accounts', schema=t.schema)
sql.extend(L)
def test_nelements_records(recdata):
    """``nelements`` on ``recdata`` matches counts derived from its shape."""
    s = Symbol('s', discover(recdata))

    # Without an axis: the product of all dimensions.
    total = compute(s.nelements(), recdata)
    assert total == np.prod(recdata.shape)

    # Along axis 0: the leading length, once per entry of the second axis.
    along_first = compute(s.nelements(axis=0), recdata)
    expected = np.zeros(recdata.shape[1]) + recdata.shape[0]
    np.testing.assert_array_equal(along_first, expected)
def test_count_nan():
    """``count`` on an optional real column leaves out the NaN entry."""
    t = Symbol('t', '3 * ?real')
    values = np.array([1.0, np.nan, 2.0])

    counted = compute(t.count(), values)

    assert counted == 2
def test_improper_selection():
    """Building a selection from the mismatched pieces below must raise.

    NOTE(review): the intent appears to be that a predicate on ``t`` cannot
    be applied to ``t.sort()`` -- confirm against the selection code.
    """
    t = Symbol('t', 'var * {x: int, y: int, z: int}')

    # The predicate has to be ``t['y'] > 0``.  The earlier spelling
    # ``t['y' > 0]`` compared the string 'y' with 0 before indexing; on
    # Python 3 that comparison raises TypeError by itself, so the assertion
    # passed without ever reaching the selection being tested.
    assert raises(Exception,
                  lambda: t[t['x'] > 0][t.sort()[t['y'] > 0]])
def test_errors():
    """``compute_up`` on this ``by`` expression with the data ``1`` must raise."""
    table = Symbol('t', 'var * {foo: int}')

    # The expression is built inside the block, as part of the checked call.
    with raises(NotImplementedError):
        compute_up(by(table, table.count()), 1)
def test_summary_keepdims():
    """``summary`` gives a record; ``keepdims`` adds length-one dimensions."""
    x = Symbol('x', '5 * 3 * float32')

    collapsed = summary(a=x.min(), b=x.max())
    assert collapsed.dshape == dshape('{a: float32, b: float32}')

    kept = summary(a=x.min(), b=x.max(), keepdims=True)
    assert kept.dshape == dshape('1 * 1 * {a: float32, b: float32}')
def test_axis_kwarg_is_normalized_to_tuple():
    """Reductions store ``axis`` as a tuple however it was passed in."""
    x = Symbol('x', '5 * 3 * float32')

    reductions = (
        x.sum(),            # no axis given
        x.sum(axis=1),      # bare int
        x.sum(axis=[1]),    # list
        x.std(),
        x.mean(axis=1),
    )

    for reduction in reductions:
        assert isinstance(reduction.axis, tuple)
def test_reduction_dshape():
    """Reducing over an axis removes that dimension from the dshape."""
    x = Symbol('x', '5 * 3 * float32')

    # No axis: the result has the element schema.
    assert x.sum().dshape == x.schema

    per_axis = [
        (0, '3 * float32'),
        (1, '5 * float32'),
        ((0, 1), 'float32'),
    ]
    for axis, expected in per_axis:
        assert x.sum(axis=axis).dshape == dshape(expected)
def test_count():
    """``count`` on a list with one ``None`` gives the other two entries."""
    t = Symbol('t', '3 * int')
    values = [1, None, 2]

    counted = compute(t.count(), values)

    assert counted == 2
def test_head():
    """``head`` round-trips through ``str``/``eval`` and keeps the schema."""
    # The local must stay named ``t``: ``eval`` below resolves names from this
    # scope. (presumably the string form refers to the symbol as ``t`` -- confirm)
    t = Symbol('t', 'var * {name: string, amount: int32, id: int32}')

    first_rows = t.head(10)
    rebuilt = eval(str(first_rows))

    assert rebuilt.isidentical(first_rows)
    assert first_rows.schema == t.schema