def test_discovery_numeric_column():
    assert discover(sa.String()) == datashape.string
    metadata = sa.MetaData()
    s = sa.Table('name', metadata,
                 sa.Column('name', sa.types.NUMERIC))
    assert discover(s)


def test_concat_arr():
    s_data = Series(np.arange(15))
    t_data = Series(np.arange(15, 30))
    s = symbol('s', discover(s_data))
    t = symbol('t', discover(t_data))
    assert (
        compute(concat(s, t), {s: s_data, t: t_data}) ==
        Series(np.arange(30))
    ).all()


def test_dataset():
    ns = {'t': df, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)
    assert discover(d) == discover(ns)

    s = symbol('s', discover(d))
    assert compute(s.x * 2, d) == 20
    assert cache == {s.x * 2: 20}


def test_discovery():
    assert discover(sa.String()) == datashape.string
    metadata = sa.MetaData()
    s = sa.Table('accounts', metadata,
                 sa.Column('name', sa.String),
                 sa.Column('amount', sa.Integer),
                 sa.Column('timestamp', sa.DateTime, primary_key=True))
    assert discover(s) == dshape(
        'var * {name: ?string, amount: ?int32, timestamp: datetime}')


def test_concat_mat():
    s_data = DataFrame(np.arange(15).reshape(5, 3), columns=list('abc'))
    t_data = DataFrame(np.arange(15, 30).reshape(5, 3), columns=list('abc'))
    s = symbol('s', discover(s_data))
    t = symbol('t', discover(t_data))
    tm.assert_frame_equal(
        compute(concat(s, t), {s: s_data, t: t_data}),
        pd.DataFrame(np.arange(30).reshape(10, 3), columns=list('abc')),
    )


def test_client():
    c = Client('localhost:6363')
    assert str(discover(c)) == str(discover(tdata))
    t = symbol('t', discover(c))
    expr = t.accounts.amount.sum()
    assert compute(expr, c) == 300
    assert 'name' in t.accounts.fields
    assert isinstance(t.accounts.name, Field)
    assert compute(t.accounts.name, c) == ['Alice', 'Bob']
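

# ``test_client`` above assumes a blaze server already listening on
# localhost:6363 and serving a namespace ``tdata``.  Given the assertions, a
# hypothetical minimal fixture (an assumption -- the real one may carry more
# fields) would look something like:
#
#     tdata = {'accounts': [{'name': 'Alice', 'amount': 100},
#                           {'name': 'Bob', 'amount': 200}]}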


def test_join_promotion():
    a_data = pd.DataFrame([[0.0, 1.5], [1.0, 2.5]], columns=list('ab'))
    b_data = pd.DataFrame([[0, 1], [1, 2]], columns=list('ac'))
    a = symbol('a', discover(a_data))
    b = symbol('b', discover(b_data))
    joined = join(a, b, 'a')
    assert joined.dshape == dshape('var * {a: float64, b: float64, c: int64}')
    expected = pd.merge(a_data, b_data, on='a')
    result = compute(joined, {a: a_data, b: b_data})
    tm.assert_frame_equal(result, expected)


def test_groups():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')
        hdf = resource('hdfstore://%s' % fn)
        assert discover(hdf) == discover({'data': {'fixed': df}})
        s = symbol('s', discover(hdf))
        assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        hdf.close()
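

# ``df`` is assumed to be a module-level DataFrame fixture shared by several
# tests here.  The assertion above pins down its 'a' column, so a minimal
# hypothetical definition would be something like
# ``df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1.0, 2.0, 3.0, 4.0]})``;
# the real fixture may carry different extra columns.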


def test_csv_join():
    d = {"a.csv": "a,b,c\n0,1,2\n3,4,5",
         "b.csv": "c,d,e\n2,3,4\n5,6,7"}
    with filetexts(d):
        resource_a = resource("a.csv")
        resource_b = resource("b.csv")
        a = symbol("a", discover(resource_a))
        b = symbol("b", discover(resource_b))
        tm.assert_frame_equal(
            odo(compute(join(a, b, "c"), {a: resource_a, b: resource_b}),
                pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]],
                                  dtype="int64"),
                         columns=list("cabde")),
        )


def test_join_suffixes():
    df = pd.DataFrame(
        list(dict((k, n) for k in ascii_lowercase[:5]) for n in range(5)),
    )
    a = symbol('a', discover(df))
    b = symbol('b', discover(df))
    suffixes = '_x', '_y'
    joined = join(a, b, 'a', suffixes=suffixes)
    expected = pd.merge(df, df, on='a', suffixes=suffixes)
    result = compute(joined, {a: df, b: df})
    tm.assert_frame_equal(result, expected)


def test_concat():
    d = {"a.csv": "a,b\n1,2\n3,4",
         "b.csv": "a,b\n5,6\n7,8"}
    with filetexts(d):
        a_rsc = resource("a.csv")
        b_rsc = resource("b.csv")
        a = symbol("a", discover(a_rsc))
        b = symbol("b", discover(b_rsc))
        tm.assert_frame_equal(
            odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype="int64").reshape(4, 2),
                         columns=list("ab")),
        )


def test_multi_expression_compute():
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))
    expr = join(a, c)
    resp = test.post('/compute.json',
                     data=json.dumps({'expr': to_tree(expr)}),
                     content_type='application/json')
    assert 'OK' in resp.status
    result = json.loads(resp.data)['data']
    expected = compute(expr, {a: accounts, c: cities})
    assert list(map(tuple, result)) == into(list, expected)


def test_str_predicates(what, expected):
    predicate = 'is' + what
    expr = getattr(t.name.str, predicate)()
    expected = pd.Series([expected, expected, expected], name='name')
    result = compute(expr, df).reset_index(drop=True)
    assert_series_equal(expected, result)
    assert discover(result).measure == expr.dshape.measure


def test_coerce_series_string_datetime(d, tp, ptp):
    s = pd.Series(d, name='a')
    e = symbol('t', discover(s)).coerce(to=tp)
    assert e.schema == dshape(tp)
    result = compute(e, s)
    expected = s.astype(ptp)
    assert_series_equal(result, expected)


def test_coerce_series():
    s = pd.Series(list('1234'), name='a')
    dds = dd.from_pandas(s, npartitions=2)
    t = symbol('t', discover(s))
    result = compute(t.coerce(to='int64'), dds)
    expected = pd.Series([1, 2, 3, 4], name=s.name)
    eq(result, expected)


def test_time_field():
    data = pd.Series(pd.date_range(start='20120101', end='20120102', freq='H'))
    s = symbol('s', discover(data))
    result = compute(s.time, data)
    expected = data.dt.time
    expected.name = 's_time'
    assert_series_equal(result, expected)


def test_add_data_to_empty_server(empty_server, serial):
    # add data
    with temp_server() as test:
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = empty_server.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        response2 = empty_server.get('/datashape')
        expected2 = str(discover({'iris': resource(iris_path)}))
        assert response2.data.decode('utf-8') == expected2

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()
        response3 = empty_server.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial),
        )
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3


def test_odo_kwargs(test, serial):
    expr = t.dumb
    bad_query = {'expr': to_tree(expr)}
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'odo_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query),
    )
    assert result.status_code == 200
    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )
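

# This test and the later ``test_compute_kwargs`` variants assume a
# ``DumbResource`` fixture whose odo/compute hooks refuse to run unless
# ``return_df`` is forwarded to them.  A minimal hypothetical sketch (the
# real fixture also registers the discover/odo/compute hooks):
#
#     class DumbResource(object):
#         df = DataFrame({'a': [1, 2], 'b': [3, 4]})
#
#         class NoResource(Exception):
#             pass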


def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3


def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path, 'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path, **csv_kwargs)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3


def test_compute_on_db(bank, points):
    assert bank.database == points.database
    db = bank.database
    d = symbol(db.name, discover(db))
    assert (compute(d.points.x.sum(), db) ==
            sum(x['x'] for x in db.points.find()))


def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3


def test_pre_compute_with_projection_projects_on_data_frames():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s[['sepal_length', 'sepal_width']].distinct(),
                         csv,
                         comfortable_memory=10)
    assert set(first(result).columns) == set(['sepal_length', 'sepal_width'])


def test_create_from_datashape():
    engine = sa.create_engine('sqlite:///:memory:')
    ds = dshape('''{bank: var * {name: string, amount: int},
                    points: var * {x: int, y: int}}''')
    engine = create_from_datashape(engine, ds)
    assert discover(engine) == ds


def test_compute_kwargs(test, serial):
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query),
    )
    assert result.status_code == 200
    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )


def test_pre_compute_calls_lean_projection():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s.sort('sepal_length').species,
                         csv,
                         comfortable_memory=10)
    assert set(first(result).columns) == set(['sepal_length', 'species'])


def test_unused_datetime_columns():
    ds = dshape('2 * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        s = symbol('s', discover(csv))
        assert into(list, compute(s.val, csv)) == ['a', 'b']


def test_datasets():
    response = test.get('/datasets.json')
    assert json.loads(response.data) == {'accounts': str(accounts.dshape),
                                         'accounts_df': str(discover(df)),
                                         'cities': str(cities.dshape),
                                         'pairs': str(pairs.dshape),
                                         'times': str(times.dshape)}


def test_compute_kwargs(test, serial):
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}
    result = test.post('/compute',
                       headers=mimetype(serial),
                       data=serial.dumps(bad_query))
    assert result.status_code == RC.INTERNAL_SERVER_ERROR
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list)
        }
    }
    result = test.post('/compute',
                       headers=mimetype(serial),
                       data=serial.dumps(good_query))
    assert result.status_code == RC.OK
    tdata = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(datashape.dshape(tdata['datashape']), dshape)
    assert_frame_equal(
        odo(serial.data_loads(tdata['data']), DataFrame, dshape=dshape),
        DumbResource.df)


def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path, 'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3


def test_timedelta_arith():
    series = Series(pd.date_range('2014-01-01', '2014-02-01'))
    sym = symbol('s', discover(series))
    delta = timedelta(days=1)
    assert (compute(sym + delta, series) == series + delta).all()
    assert (compute(sym - delta, series) == series - delta).all()
    assert (compute(sym - (sym - delta), series) ==
            series - (series - delta)).all()


def test_strings(self):
    schema = '{x: int32, y: string}'
    dd = HDF5(self.filename, 'data', schema=schema)
    dd.extend([(1, 'Hello'), (2, 'World!')])
    with h5py.File(dd.path) as f:
        d = f.get(dd.datapath)
        self.assertEqual(discover(d), dshape('2 * ' + schema))
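

# The next two tests override ``compute``'s ``map`` argument.  They rely on a
# module-level ``flag`` cell and a ``mymap`` wrapper that are not shown in
# this section; a minimal sketch under that assumption (the real helpers may
# differ):
#
#     flag = [False]
#
#     def mymap(func, *args):
#         # record that the custom map was actually used, then defer to map
#         flag[0] = True
#         return list(map(func, *args))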


def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]


def test_map_called_on_resource_star():
    r = resource(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]


def test_join_diff_contexts(db, ctx, cities):
    expr = join(db.t, db.s, 'name')
    people = ctx.table('t')
    cities = into(SchemaRDD, cities, dshape=discover(ctx.table('s')))
    scope = {db: {'t': people, 's': cities}}
    result = compute(expr, scope)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert into(set, result) == into(set, expected)


def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert isinstance(result, SchemaRDD)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape


def test_discovery(self):
    dd = HDF5(self.filename, 'data', schema='2 * int32')
    dd.extend([(1, 2), (2, 3), (4, 5)])
    with h5py.File(dd.path) as f:
        d = f.get(dd.datapath)
        self.assertEqual(discover(d), dshape('3 * 2 * int32'))


def test_csv_with_trailing_commas():
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert expr_repr(data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', '']
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # NO trailing space in the header
            f.write('a,b,c,\n1, 2, 3, ')
        csv = CSV(fn)
        assert expr_repr(data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']


def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    tm.assert_frame_equal(lhs, rhs)


def test_csv_with_trailing_commas():
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert repr(Data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', '']
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # NO trailing space in the header
            f.write('a,b,c,\n1, 2, 3, ')
        csv = CSV(fn)
        assert repr(Data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']


def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    assert str(lhs) == str(rhs)


def test_spider(data):
    result = spider(str(data))
    ss = """{
    %r: {
        'foo.csv': var * {a: int64, b: int64},
        'foo.hdf5': {fooh5: 10 * 2 * float64},
        sub: {'foo.json': 2 * {a: int64, b: float64, c: ?datetime, d: ?string}}
    }
}""" % os.path.basename(str(data))
    assert dshape(discover(result)) == dshape(ss)


def test_spider_cycle(data_with_cycle):
    result = spider(str(data_with_cycle), followlinks=True)
    ss = """{
    %r: {
        'foo.csv': var * {a: int64, b: int64},
        'foo.hdf5': {fooh5: 10 * 2 * float64},
        sub: {'foo.json': 2 * {a: int64, b: float64, c: ?datetime, d: ?string}}
    }
}""" % os.path.basename(str(data_with_cycle))
    assert dshape(discover(result)) != dshape(ss)
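

# Both spider tests above assume pytest fixtures that build a temporary
# directory tree.  Inferred from the expected dshapes (the real fixtures may
# differ), the layout is roughly:
#
#     <tmpdir>/
#         foo.csv        # two int64 columns: a, b
#         foo.hdf5       # dataset 'fooh5', 10 x 2 float64
#         sub/foo.json   # 2 * {a: int64, b: float64, c: ?datetime, d: ?string}
#
# and, for the cycle test, a symlink inside the tree pointing back at its
# root, so that following links changes what spider discovers.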


def test_merge_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = merge((t.a - t.a % 3).label('a'), (t.a % 3).label('b'))
    result = compute(expr, {t: df})
    expected = pd.concat(
        [pd.Series(df.a - df.a % 3, name='a'),
         pd.Series(df.a % 3, name='b')],
        axis=1,
    )
    tm.assert_frame_equal(result, expected)


def test_concat():
    d = {'a.csv': 'a,b\n1,2\n3,4',
         'b.csv': 'a,b\n5,6\n7,8'}
    with filetexts(d):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')
        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))
        tm.assert_frame_equal(
            odo(
                compute(concat(a, b), {a: a_rsc, b: b_rsc}),
                pd.DataFrame,
            ),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                         columns=list('ab')),
        )


def test_csv_join():
    d = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
         'b.csv': 'c,d,e\n2,3,4\n5,6,7'}
    with filetexts(d):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))
        tm.assert_frame_equal(
            odo(
                compute(join(a, b, 'c'), {a: data_a, b: data_b}),
                pd.DataFrame,
            ),
            # windows needs explicit int64 construction b/c default is int32
            pd.DataFrame(np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]],
                                  dtype='int64'),
                         columns=list('cabde')),
        )


def test_transform_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)
    result = compute(expr, df)
    expected = pd.concat(
        [df[c] for c in df.columns] +
        [pd.Series(df.a - df.a % 3, name='c'),
         pd.Series(df.a % 3, name='d')],
        axis=1,
    )
    tm.assert_frame_equal(result, expected)


def test_datetime_access():
    df = DataFrame({'name': ['Alice', 'Bob', 'Joe'],
                    'when': [datetime(2010, 1, 1, 1, 1, 1)] * 3,
                    'amount': [100, 200, 300],
                    'id': [1, 2, 3]})
    t = symbol('t', discover(df))
    for attr in ['day', 'month', 'minute', 'second']:
        assert (compute(getattr(t.when, attr), df) ==
                Series([1, 1, 1])).all()


def test_hdfstore():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')
        hdf = resource('hdfstore://%s' % fn)
        s = symbol('s', discover(hdf))
        assert isinstance(compute(s.fixed, hdf),
                          (pd.DataFrame, pd.io.pytables.Fixed))
        assert isinstance(compute(s.appendable, hdf),
                          (pd.io.pytables.AppendableFrameTable, Chunks))
        s = symbol('s', discover(df))
        f = resource('hdfstore://%s::/fixed' % fn)
        a = resource('hdfstore://%s::/appendable' % fn)
        assert isinstance(pre_compute(s, a), Chunks)
        hdf.close()
        f.parent.close()
        a.parent.close()


def test_builtin_501_exception(iris_server, serial):
    t = symbol('t', discover(iris))
    for name in ('map', 'apply'):
        func = getattr(t.species, name)
        expr = func(copy, 'int')
        query = {'expr': to_tree(expr)}
        response = iris_server.post('/compute',
                                    data=serial.dumps(query),
                                    headers=mimetype(serial))
        assert '501 Not Implemented'.lower() in response.status.lower()


def test_streaming():
    seq = [{'name': 'Alice', 'x': 1},
           {'name': 'Bob', 'x': 1}]
    ns = {'t': seq, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)
    s = symbol('s', discover(d))
    expr = s.t.x * 2
    result = compute(expr, d)
    assert not isinstance(d.cache[expr], Iterator)
    assert into(list, d.cache[expr]) == [2, 2]
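

# The assertion above documents the caching contract: CachedDataset
# materializes lazy/streaming results before storing them, so the cached
# value is not a one-shot Iterator and can be consumed more than once
# (here, first by the isinstance check and then by ``into(list, ...)``).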