def test_dir():
    i = symbol('i', '10 * int')
    d = symbol('d', '10 * datetime')

    assert isinstance(i + 1, Add)  # this works
    with pytest.raises(Exception):  # this doesn't
        d + 1
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    assert set(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='outer'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])
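# A minimal pure-Python sketch of the join semantics asserted above, for
# reference: rows pair on the shared key; unmatched rows survive only in the
# 'left'/'right'/'outer' variants, padded with None. This is illustrative
# only (the names manual_join/lkey/rkey are ours), not blaze's implementation.
def manual_join(left, right, lkey=0, rkey=1, how='inner'):
    index = {}
    for rrow in right:
        index.setdefault(rrow[rkey], []).append(rrow)
    matched = set()
    result = []
    for lrow in left:
        hits = index.get(lrow[lkey], [])
        if hits:
            matched.add(lrow[lkey])
            for rrow in hits:
                # left row followed by the right row minus its key column
                result.append(lrow + tuple(v for i, v in enumerate(rrow)
                                           if i != rkey))
        elif how in ('left', 'outer'):
            result.append(lrow + (None,) * (len(right[0]) - 1))
    if how in ('right', 'outer'):
        for key, rows in index.items():
            if key not in matched:
                for rrow in rows:
                    pad = [None] * len(left[0])
                    pad[lkey] = key  # keep the key in its left-side position
                    result.append(tuple(pad) + tuple(
                        v for i, v in enumerate(rrow) if i != rkey))
    return result

# Reproduces the 'outer' expectation from the test data above:
assert set(manual_join([(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)],
                       [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)],
                       how='outer')) == {
    (1, 'Alice', 100, 'NYC'), (1, 'Alice', 100, 'Boston'),
    (2, 'Bob', 200, None), (3, None, None, 'LA'),
    (4, 'Dennis', 400, 'Moscow')}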
def test_multi_dataset_broadcast_with_Record_types():
    x = symbol('x', '3 * {p: int, q: int}')
    y = symbol('y', '3 * int')

    a = [(1, 1), (2, 2), (3, 3)]
    b = [10, 20, 30]

    assert list(compute(x.p + x.q + y, {x: iter(a), y: iter(b)})) == \
        [12, 24, 36]
def test_compute_signature():
    s = symbol('s', 'int64')
    t = symbol('t', 'float32')
    d = symbol('d', 'datetime')

    assert compute_signature(s + t) == float64(int64, float32)
    assert (compute_signature(d.truncate(days=1)) ==
            datetime64('D')(datetime64('us')))
    assert compute_signature(d.day + 1) == int64(datetime64('us'))
def test_multi_dataset_broadcast():
    x = symbol('x', '3 * int')
    y = symbol('y', '3 * int')

    a = [1, 2, 3]
    b = [10, 20, 30]

    assert list(compute(x + y, {x: a, y: b})) == [11, 22, 33]
    assert list(compute(2 * x + (y + 1), {x: a, y: b})) == [13, 25, 37]
def test_truncate_datetime():
    s = symbol('x', 'datetime')
    assert compute(s.truncate(2, 'days'), datetime(2002, 1, 3, 12, 30)) == \
        date(2002, 1, 2)

    s = symbol('x', 'var * datetime')
    assert list(compute(s.truncate(2, 'days'),
                        [datetime(2002, 1, 3, 12, 30)])) == \
        [date(2002, 1, 2)]
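# A sketch of the day-truncation arithmetic the test expects: round the
# proleptic Gregorian ordinal down to a multiple of the measure (here, 2
# days). This reproduces the asserted value for this input; blaze's own
# implementation may anchor the rounding differently.
from datetime import date, datetime

def truncate_days(dt, measure):
    ordinal = dt.toordinal()  # days since 0001-01-01
    return date.fromordinal(ordinal - ordinal % measure)

assert truncate_days(datetime(2002, 1, 3, 12, 30), 2) == date(2002, 1, 2)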
def test_selection_inner_inputs():
    s_data = pd.DataFrame({'a': np.arange(5)})
    t_data = pd.DataFrame({'a': np.arange(5)})

    s_dd = dd.from_pandas(s_data, npartitions=2)
    t_dd = dd.from_pandas(t_data, npartitions=2)

    s = symbol('s', 'var * {a: int64}')
    t = symbol('t', 'var * {a: int64}')

    eq(compute(s[s.a == t.a], {s: s_dd, t: t_dd}), s_data)
def test_concat(sql_two_tables):
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=['a'])
    u_data = pd.DataFrame(np.arange(5, 10), columns=['a'])
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol('t', discover(t_data))
    u = symbol('u', discover(u_data))

    tm.assert_frame_equal(
        compute(concat(t, u).sort('a'),
                {t: t_table, u: u_table},
                return_type=pd.DataFrame),
        pd.DataFrame(np.arange(10), columns=['a']),
    )
def test_concat(sql_two_tables):
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=['a'])
    u_data = pd.DataFrame(np.arange(5, 10), columns=['a'])
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol('t', discover(t_data))
    u = symbol('u', discover(u_data))

    tm.assert_frame_equal(
        odo(compute(concat(t, u).sort('a'), {t: t_table, u: u_table}),
            pd.DataFrame),
        pd.DataFrame(np.arange(10), columns=['a']),
    )
def test_str_cat_bcast(sql_with_null):
    t = symbol('t', discover(sql_with_null))
    lit_sym = symbol('s', 'string')
    s = t[t.amount <= 200]

    result = compute(s.comment.str_cat(lit_sym, sep=' '),
                     {t: sql_with_null, lit_sym: '!!'},
                     return_type=pd.Series)
    df = compute(s, sql_with_null, return_type=pd.DataFrame)
    expected = df.comment.str.cat(['!!'] * len(df.comment), sep=' ')

    assert all(expected[~expected.isnull()] == result[~result.isnull()])
    assert all(expected[expected.isnull()].index ==
               result[result.isnull()].index)
def test_graph_double_join():
    idx = [['A', 1],
           ['B', 2],
           ['C', 3],
           ['D', 4],
           ['E', 5],
           ['F', 6]]

    arc = [[1, 3], [2, 3], [4, 3], [5, 3], [3, 1],
           [2, 1], [5, 1], [1, 6], [2, 6], [4, 6]]

    wanted = [['A'], ['F']]

    t_idx = symbol('t_idx', 'var * {name: string, b: int32}')
    t_arc = symbol('t_arc', 'var * {a: int32, b: int32}')
    t_wanted = symbol('t_wanted', 'var * {name: string}')

    # >>> compute(join(t_idx, t_arc, 'b'), {t_idx: idx, t_arc: arc})
    # [[1, A, 3],
    #  [1, A, 2],
    #  [1, A, 5],
    #  [3, C, 1],
    #  [3, C, 2],
    #  [3, C, 4],
    #  [3, C, 5],
    #  [6, F, 1],
    #  [6, F, 2],
    #  [6, F, 4]]

    j = join(join(t_idx, t_arc, 'b'), t_wanted, 'name')[['name', 'b', 'a']]

    result = compute(j, {t_idx: idx, t_arc: arc, t_wanted: wanted})
    result = sorted(map(tuple, result))
    expected = sorted([('A', 1, 3),
                       ('A', 1, 2),
                       ('A', 1, 5),
                       ('F', 6, 1),
                       ('F', 6, 2),
                       ('F', 6, 4)])

    assert result == expected
def test_compute_with_variable_in_namespace(iris_server):
    test = iris_server
    t = symbol('t', discover(iris))
    pl = symbol('pl', 'float32')
    expr = t[t.petal_length > pl].species
    tree = to_tree(expr, {pl: 'pl'})

    blob = json.dumps({'expr': tree, 'namespace': {'pl': 5}})
    resp = test.post('/compute.json',
                     data=blob,
                     content_type='application/json')
    assert 'OK' in resp.status

    result = json.loads(resp.data.decode('utf-8'))['data']
    expected = list(compute(expr._subs({pl: 5}), {t: iris}))
    assert result == expected
def strcat_sym():
    """Blaze symbol used to test exceptions raised by cat()."""
    ds = dshape('3 * {name: string, comment: string, num: int32}')
    s = symbol('s', dshape=ds)
    return s
def test_join():
    left = [['Alice', 100], ['Bob', 200]]
    right = [['Alice', 1], ['Bob', 2]]

    L = symbol('L', 'var * {name: string, amount: int}')
    R = symbol('R', 'var * {name: string, id: int}')
    joined = join(L, R, 'name')

    assert dshape(joined.schema) == \
        dshape('{name: string, amount: int, id: int}')

    result = list(compute(joined, {L: left, R: right}))
    expected = [('Alice', 100, 1), ('Bob', 200, 2)]

    assert result == expected
def test_slicing_with_lists():
    nx = np.arange(20).reshape((4, 5))
    dx = from_array(nx, (2, 2))
    sx = symbol('x', discover(dx))

    expr = sx[[2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[::2, [2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[1, [2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[[2, 0, 3], -2]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[:, :]
    assert compute(expr, dx).dask == dx.dask

    expr = sx[0]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[0, [3, 1, 4]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))
def test_coerce_series():
    s = pd.Series(list('1234'), name='a')
    dds = dd.from_pandas(s, npartitions=2)
    t = symbol('t', discover(s))
    result = compute(t.coerce(to='int64'), dds)
    expected = pd.Series([1, 2, 3, 4], name=s.name)
    eq(result, expected)
def test_str_namespace():
    t = symbol('t', 'var * {name: string}')

    assert bzs.upper(t.name).isidentical(t.name.str.upper())
    assert bzs.lower(t.name).isidentical(t.name.str.lower())
    assert (bzs.lower(bzs.upper(t.name))
            .isidentical(t.name.str.upper().str.lower()))
    assert bzs.len(t.name).isidentical(t.name.str.len())
    assert bzs.like(t.name, '*a').isidentical(t.name.str.like('*a'))
    assert (bzs.cat(bzs.cat(t.name, t.name, sep=' ++ '), t.name)
            .isidentical(t.name.str.cat(t.name, sep=' ++ ')
                               .str.cat(t.name)))
    assert bzs.isalnum(t.name).isidentical(t.name.str.isalnum())
    assert bzs.isalpha(t.name).isidentical(t.name.str.isalpha())
    assert bzs.isdecimal(t.name).isidentical(t.name.str.isdecimal())
    assert bzs.isdigit(t.name).isidentical(t.name.str.isdigit())
    assert bzs.islower(t.name).isidentical(t.name.str.islower())
    assert bzs.isnumeric(t.name).isidentical(t.name.str.isnumeric())
    assert bzs.isspace(t.name).isidentical(t.name.str.isspace())
    assert bzs.istitle(t.name).isidentical(t.name.str.istitle())
    assert bzs.isupper(t.name).isidentical(t.name.str.isupper())
    assert (bzs.replace(t.name, 'A', 'a')
            .isidentical(t.name.str.replace('A', 'a')))
    assert bzs.capitalize(t.name).isidentical(t.name.str.capitalize())
    assert bzs.strip(t.name).isidentical(t.name.str.strip())
    assert bzs.lstrip(t.name).isidentical(t.name.str.lstrip())
    assert bzs.rstrip(t.name).isidentical(t.name.str.rstrip())
    assert bzs.pad(t.name, 5).isidentical(t.name.str.pad(5))
    assert (bzs.slice_replace(t.name, 1, 3, 'foo')
            .isidentical(t.name.str.slice_replace(1, 3, 'foo')))
def test_by_multi_column_grouper():
    t = symbol('t', 'var * {x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], total=t['z'].count())
    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    assert set(compute(expr, data)) == set([(1, 2, 2), (1, 1, 1)])
def test_like(ds):
    t = symbol('t', ds)
    expr = getattr(t, 'name', t).like('Alice*')
    assert expr.pattern == 'Alice*'
    assert expr.schema.measure == dshape(
        '%sbool' % ('?' if '?' in ds else '')
    ).measure
def test_upper_schema(ds):
    t = symbol('t', ds)
    expr_upper = getattr(t, 'name', t).str.upper()
    expr_lower = getattr(t, 'name', t).str.lower()
    assert (expr_upper.schema.measure ==
            expr_lower.schema.measure ==
            dshape('%sstring' % ('?' if '?' in ds else '')).measure)
def test_pre_compute():
    s = symbol('s', 'var * {a: int, b: int}')
    assert pre_compute(s, [(1, 2)]) == [(1, 2)]
    assert list(pre_compute(s, iter([(1, 2)]))) == [(1, 2)]
    assert list(pre_compute(s, iter([(1, 2), (3, 4)]))) == [(1, 2), (3, 4)]
    assert list(pre_compute(s, iter([{'a': 1, 'b': 2},
                                     {'a': 3, 'b': 4}]))) == [(1, 2), (3, 4)]
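# The last assertion shows pre_compute canonicalizing streaming dict records
# into tuples. A sketch of that transformation, assuming field order comes
# from the symbol's datashape (the names `fields`/`records` are ours):
fields = ['a', 'b']
records = iter([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
tuples = (tuple(rec[f] for f in fields) for rec in records)
assert list(tuples) == [(1, 2), (3, 4)]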
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = (sin(dlat / 2.0) ** 2 +
             cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2)
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )
def test_from_tree_is_robust_to_unnecessary_namespace():
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.amount + 1
    tree = to_tree(expr)  # don't use namespace
    assert from_tree(tree, {'t': t}).isidentical(expr)
def test_shift_arithmetic(sql, n):
    t = symbol('t', discover(sql))
    expr = t.B - t.B.shift(n)
    result = compute(expr, sql, return_type=pd.Series)
    df = odo(sql, pd.DataFrame)
    expected = df.B - df.B.shift(n)
    tm.assert_series_equal(result, expected)
def test_sample(big_sql):
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    result = compute(nn.sample(n=nrows // 2), big_sql,
                     return_type=pd.DataFrame)
    assert len(result) == nrows // 2
    result2 = compute(nn.sample(frac=0.5), big_sql,
                      return_type=pd.DataFrame)
    assert len(result) == len(result2)
def test_to_tree():
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.amount.sum()
    expected = {
        'op': 'sum',
        'args': [
            {
                'op': 'Field',
                'args': [
                    {
                        'op': 'Symbol',
                        'args': [
                            't',
                            'var * {name: string, amount: int32}',
                            0,
                        ],
                    },
                    'amount',
                ],
            },
            [0],
            False,
        ],
    }
    assert to_tree(expr) == expected
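# Since to_tree produces a plain dict of 'op'/'args' nodes, the tree can be
# serialized and rebuilt. A sketch of the round trip, assuming to_tree and
# from_tree are in scope as in the surrounding tests (the helper name
# `roundtrip` is ours):
import json

def roundtrip(expr):
    tree = json.loads(json.dumps(to_tree(expr)))  # survives serialization
    return from_tree(tree)

# e.g. roundtrip(t.amount.sum()).isidentical(t.amount.sum()) is expected to
# hold for expressions serialized without a namespace.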
def test_coerce_bool_and_sum(sql):
    n = sql.name
    t = symbol(n, discover(sql))
    expr = (t.B > 1.0).coerce(to='int32').sum()
    result = compute(expr, sql).scalar()
    expected = odo(compute(t.B, sql), pd.Series).gt(1).sum()
    assert result == expected
def test_shift_on_column(n, column, sql):
    sql = sql.data
    t = symbol('t', discover(sql))
    expr = t[column].shift(n)
    result = compute(expr, sql, return_type=pd.Series)
    expected = odo(sql, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)
def test_isin_selectable(sql):
    s = symbol('s', discover(sql))

    # wrap the resource in a select
    assert compute(s.B.isin({1, 3}),
                   sa.select(sql._resources()[sql].columns),
                   return_type=list) == [(True,), (False,)]
def test_str_slice(slc, sql_with_null):
    name_series = pd.Series(
        ['Alice', None, 'Drew', 'Bob', 'Drew', 'first', None],
        name='substring_1',
    )
    t = symbol('t', discover(sql_with_null))
    result = compute(t.name.str[slc], sql_with_null,
                     return_type=pd.Series).fillna('zzz')
    # normalize empty strings and nulls to one sentinel before comparing
    result[result == ''] = 'zzz'
    expected = name_series.str[slc].fillna('zzz')
    tm.assert_series_equal(result, expected)
def test_to_from_tree_namespace():
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.name

    tree = to_tree(expr, names={t: 't'})
    assert tree == {'op': 'Field', 'args': ['t', 'name']}

    new = from_tree(tree, namespace={'t': t})
    assert new.isidentical(expr)
def test_foreign_key_isin(fkey):
    t = symbol('fkey', discover(fkey))
    expr = t.sym_id.isin([1, 2])
    result = compute(expr, fkey, return_type='native')
    expected = """SELECT
        fkey.sym_id IN (%(sym_id_1)s, %(sym_id_2)s) AS anon_1
    FROM fkey
    """
    assert normalize(str(result)) == normalize(expected)
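# This and the SQL-generation tests below compare generated SQL through a
# normalize() helper so the comparison is insensitive to layout. The suite's
# real helper is not shown here; a minimal sketch of the idea (our
# assumption: whitespace collapsing plus lowercasing; the actual helper may
# also rename anonymous labels):
import re

def normalize_sketch(s):
    return re.sub(r'\s+', ' ', s).strip().lower()

assert normalize_sketch('SELECT a,\n  b\nFROM t') == 'select a, b from t'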
def test_timedelta_stat_reduction(sql_with_timedeltas, func):
    sym = symbol('s', discover(sql_with_timedeltas))
    expr = getattr(sym.N, func)()

    deltas = pd.Series([timedelta(seconds=n) for n in range(10)])
    expected = timedelta(
        seconds=getattr(deltas.astype('int64') / 1e9,
                        func)(ddof=expr.unbiased),
    )
    assert compute(expr, sql_with_timedeltas, return_type=timedelta) == expected
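# The expected value above is built by hand: casting a timedelta64 series to
# int64 yields nanoseconds, dividing by 1e9 gives float seconds, the
# reduction runs on those floats, and the result is re-wrapped as a
# timedelta. A standalone sketch of that conversion (values are ours):
import pandas as pd
from datetime import timedelta

deltas = pd.Series([timedelta(seconds=n) for n in range(10)])
seconds = deltas.astype('int64') / 1e9  # nanoseconds -> float seconds
assert timedelta(seconds=seconds.mean()) == timedelta(seconds=4.5)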
def test_distinct_on(sql):
    t = symbol('t', discover(sql))
    computation = compute(t[['A', 'B']].sort('A').distinct('A'), sql)
    assert normalize(str(computation)) == normalize("""
    SELECT DISTINCT ON (anon_1."A") anon_1."A", anon_1."B"
    FROM (SELECT {tbl}."A" AS "A", {tbl}."B" AS "B"
          FROM {tbl}) AS anon_1
    ORDER BY anon_1."A" ASC
    """.format(tbl=sql.name))
    assert odo(computation, tuple) == (('a', 1), ('b', 2))
def test_dataset():
    ns = {'t': df, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)
    assert discover(d) == discover(ns)

    s = symbol('s', discover(d))
    assert compute(s.x * 2, d) == 20
    assert cache == {s.x * 2: 20}
def test_foreign_key_chain(fkey):
    t = symbol('t', discover(fkey))
    expr = t.sym_id.main.data
    result = compute(expr, fkey)
    expected = """SELECT main.data
    FROM main, fkey, pkey
    WHERE fkey.sym_id = pkey.id and pkey.main = main.id
    """
    assert normalize(str(result)) == normalize(expected)
def test_auto_join_field(orders):
    t = symbol('t', discover(orders))
    expr = t.product_id.color
    result = compute(expr, orders)
    expected = """SELECT products.color
    FROM products, orders
    WHERE orders.product_id = products.product_id
    """
    assert normalize(str(result)) == normalize(expected)
def test_map_datetime():
    from datetime import datetime
    data = [['A', 0], ['B', 1]]
    t = symbol('t', 'var * {foo: string, datetime: int64}')

    result = list(compute(t['datetime'].map(datetime.utcfromtimestamp,
                                            'datetime'),
                          data))
    expected = [datetime(1970, 1, 1, 0, 0, 0),
                datetime(1970, 1, 1, 0, 0, 1)]

    assert result == expected
def test_timedelta_arith(sql_with_dts):
    delta = timedelta(days=1)
    dates = pd.Series(pd.date_range('2014-01-01', '2014-02-01'))
    sym = symbol('s', discover(dates))
    assert (
        odo(compute(sym + delta, sql_with_dts), pd.Series) == dates + delta
    ).all()
    assert (
        odo(compute(sym - delta, sql_with_dts), pd.Series) == dates - delta
    ).all()
def test_coalesce(sqla):
    t = symbol('t', discover(sqla))
    assert (
        compute(coalesce(t.B, -1), {t: sqla}, return_type=list) ==
        [1, 1, -1]
    )
    assert (
        compute(coalesce(t.A, 'z'), {t: sqla}, return_type=list) ==
        ['a', 'z', 'c']
    )
def test_scalar():
    s = symbol('s', '{name: string, id: int32, '
               'payments: var * {amount: int32, when: datetime}}')
    data = ('Alice', 1, ((100, datetime(2000, 1, 1, 1, 1, 1)),
                         (200, datetime(2000, 2, 2, 2, 2, 2)),
                         (300, datetime(2000, 3, 3, 3, 3, 3))))

    assert compute(s.name, data) == 'Alice'
    assert compute(s.id + 1, data) == 2
    assert tuple(compute(s.payments.amount, data)) == (100, 200, 300)
    assert tuple(compute(s.payments.amount + 1, data)) == (101, 201, 301)
def test_like():
    t = symbol('t', 'var * {name: string, city: string}')
    data = [('Alice Smith', 'New York'),
            ('Bob Smith', 'Chicago'),
            ('Alice Walker', 'LA')]

    assert list(compute(t.like(name='Alice*'), data)) == [data[0], data[2]]
    assert list(compute(t.like(name='lice*'), data)) == []
    assert list(compute(t.like(name='*Smith*'), data)) == [data[0], data[1]]
    assert list(compute(t.like(name='*Smith*', city='New York'), data)) == \
        [data[0]]
def test_coalesce(sqla):
    t = symbol('t', discover(sqla))
    assert (
        compute(coalesce(t.B, -1), {t: sqla}, return_type=list) ==
        [(1,), (1,), (-1,)]
    )
    assert (
        compute(coalesce(t.A, 'z'), {t: sqla}, return_type=list) ==
        [('a',), ('z',), ('c',)]
    )
def test_nested():
    t = symbol('t', payment_dshape)
    assert list(compute(t.name, payments_ordered)) == ['Alice', 'Bob']
    assert list(compute(t.payments, payments_ordered)) == \
        [p[1] for p in payments_ordered]
    assert list(compute(t.payments.amount, payments_ordered)) == \
        [(100, 200), (300, -400, 500)]
    assert list(compute(t.payments.amount + 1, payments_ordered)) == \
        [(101, 201), (301, -399, 501)]
def test_compute_with_variable_in_namespace(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    pl = symbol('pl', 'float32')
    expr = t[t.petal_length > pl].species
    tree = to_tree(expr, {pl: 'pl'})

    blob = serial.dumps({'expr': tree, 'namespace': {'pl': 5}})
    resp = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=blob,
    )
    assert 'OK' in resp.status

    data = serial.loads(resp.data)
    result = data['data']
    expected = list(compute(expr._subs({pl: 5}), {t: iris}))
    assert result == expected
    assert data['names'] == ['species']
def test_datetime_access():
    data = [['Alice', 100, 1, datetime(2000, 1, 1, 1, 1, 1)],
            ['Bob', 200, 2, datetime(2000, 1, 1, 1, 1, 1)],
            ['Alice', 50, 3, datetime(2000, 1, 1, 1, 1, 1)]]

    t = symbol('t', 'var * {amount: float64, id: int64, '
               'name: string, when: datetime}')

    assert list(compute(t.when.year, data)) == [2000, 2000, 2000]
    assert list(compute(t.when.second, data)) == [1, 1, 1]
    assert list(compute(t.when.date, data)) == [date(2000, 1, 1)] * 3
def test_by_groupby_deep():
    data = [(1, 2, 'Alice'),
            (1, 3, 'Bob'),
            (2, 4, 'Alice'),
            (2, 4, '')]

    schema = '{x: int, y: int, name: string}'
    t = symbol('t', datashape.var * schema)

    t2 = t[t['name'] != '']
    t3 = merge(t2.x, t2.name)
    expr = by(t3.name, avg=t3.x.mean())
    result = set(compute(expr, data))

    assert result == set([('Alice', 1.5), ('Bob', 1.0)])
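# A plain-Python sketch of the grouping asserted above (illustrative only,
# not blaze's implementation; the helper name `by_mean` is ours): filter out
# empty names, then average x per name.
from collections import defaultdict

def by_mean(rows):
    groups = defaultdict(list)
    for x, y, name in rows:
        if name != '':
            groups[name].append(x)
    return {(name, sum(xs) / len(xs)) for name, xs in groups.items()}

assert by_mean([(1, 2, 'Alice'), (1, 3, 'Bob'), (2, 4, 'Alice'),
                (2, 4, '')]) == {('Alice', 1.5), ('Bob', 1.0)}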
def test_auto_join_projection(orders):
    t = symbol('t', discover(orders))
    expr = t.product_id[['color', 'price']]
    result = compute(expr, orders)
    expected = """SELECT products.color, products.price
    FROM products, orders
    WHERE orders.product_id = products.product_id
    """
    assert normalize(str(result)) == normalize(expected)
def test_datetime_access(attr, dtype, sql_with_dts):
    s = symbol('s', discover(sql_with_dts))
    expr = getattr(s.A.dt, attr)()
    result = compute(expr, sql_with_dts, return_type=pd.Series)
    assert result.dtype == dtype
    assert_series_equal(
        result,
        getattr(compute(s.A, sql_with_dts, return_type=pd.Series).dt, attr),
        check_names=False,
        check_dtype=False,
    )
def test_group_by_map(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t[grouper], id_count=t.size.count())
    result = compute(expr, fkey)
    expected = """SELECT fkey.sym_id, count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    assert normalize(str(result)) == normalize(expected)
def test_str_cat_with_null(sql_with_null, sep):
    t = symbol('t', discover(sql_with_null))
    res = compute(t.name.str_cat(t.sex, sep=sep), sql_with_null,
                  return_type=list)
    cols = compute(t[['name', 'sex']], sql_with_null, return_type=list)

    for r, (n, s) in zip(res, cols):
        if n is None or s is None:
            assert r is None
        else:
            assert (r == n + s if sep is None
                    else r == n + sep + s)
def test_outer_join(sc):
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = sc.parallelize(left)
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = sc.parallelize(right)

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    assert set(compute(join(L, R), {L: left, R: right}).collect()) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'),
                       {L: left, R: right}).collect()) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'),
                       {L: left, R: right}).collect()) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='outer'),
                       {L: left, R: right}).collect()) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])
def test_multi_column_join(sc):
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]
    rleft = sc.parallelize(left)
    rright = sc.parallelize(right)

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {x: int, y: int, w: int}')

    j = join(L, R, ['x', 'y'])

    result = compute(j, {L: rleft, R: rright})
    expected = [(1, 2, 3, 30),
                (1, 3, 5, 50),
                (1, 3, 5, 150)]

    assert set(result.collect()) == set(expected)
def test_groups():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = resource('hdfstore://%s' % fn)
        assert discover(hdf) == discover({'data': {'fixed': df}})

        s = symbol('s', discover(hdf))
        assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        hdf.close()
def test_hdfstore():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')

        hdf = resource('hdfstore://%s' % fn)
        s = symbol('s', discover(hdf))

        assert isinstance(compute(s.fixed, hdf),
                          (pd.DataFrame, pd.io.pytables.Fixed))
        assert isinstance(compute(s.appendable, hdf),
                          (pd.io.pytables.AppendableFrameTable, Chunks))

        s = symbol('s', discover(df))
        f = resource('hdfstore://%s::/fixed' % fn)
        a = resource('hdfstore://%s::/appendable' % fn)
        assert isinstance(pre_compute(s, a), Chunks)

        hdf.close()
        f.parent.close()
        a.parent.close()
def test_foreign_key_group_by(fkey, grouper):
    t = symbol('fkey', discover(fkey))
    expr = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    result = compute(expr, fkey)
    expected = """SELECT pkey.sym, avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    assert normalize(str(result)) == normalize(expected)
def test_builtin_501_exception(iris_server, serial):
    t = symbol('t', discover(iris))
    for name in ('map', 'apply'):
        func = getattr(t.species, name)
        expr = func(copy, 'int')
        query = {'expr': to_tree(expr)}
        response = iris_server.post('/compute',
                                    data=serial.dumps(query),
                                    headers=mimetype(serial))
        assert '501 not implemented' in response.status.lower()