def test_sample(big_sql):
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    result = compute(nn.sample(n=nrows // 2), big_sql,
                     return_type=pd.DataFrame)
    assert len(result) == nrows // 2
    result2 = compute(nn.sample(frac=0.5), big_sql,
                      return_type=pd.DataFrame)
    assert len(result) == len(result2)
def test_compute():
    exprs = [2 * sx + 1, sx.sum(axis=0), sx.mean(axis=0), sx + sx, sx.T,
             sx.T + sy, sx.dot(sy), sy.dot(sx), sx.sum(), sx - sx.sum(),
             sx.dot(sx.T), sx.sum(axis=1), sy + sa, sy + sb, sx[3:17],
             sx[3:10, 10:25:2] + 1, sx[:5, 10], sx[0, 0]]
    for expr in exprs:
        result = compute(expr, dask_ns)
        expected = compute(expr, numpy_ns)
        assert isinstance(result, Array)
        if expr.dshape.shape:
            result2 = np.array(result)
        else:
            result2 = float(result)
        assert eq(result2, expected)
def test_literals(db, ctx):
    expr = db.t[db.t.amount >= 100]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_field_access(db, ctx):
    for field in db.t.fields:
        expr = getattr(db.t, field)
        result = into(pd.Series, compute(expr, ctx))
        expected = compute(expr, {db: {"t": df}})
        assert result.name == expected.name
        np.testing.assert_array_equal(result.values, expected.values)
def test_sort(ctx, db, field, ascending):
    expr = db.t.sort(field, ascending=ascending)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_selection(ctx, db):
    expr = db.t[db.t.amount > 50]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_str_len(ctx, db):
    expr = db.t.name.str.len()
    result = odo(compute(expr, ctx, return_type='native'), pd.Series)
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def test_url_csv_data(iris_local):
    iris_remote = data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)
def plot(self, output_file="termite.html"): t = blz.Data(self.input_file) df = pd.read_csv(self.input_file) MAX = blz.compute(t.weight.max()) MIN = blz.compute(t.weight.min()) # Create a size variable to define the size of the the circle for the plot. t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50) WORDS = t['word'].distinct() WORDS = into(list, WORDS) topics = t['topic'].distinct() topics = into(list, topics) # Convert topics to strings TOPICS = [str(i) for i in topics] source = into(pd.DataFrame, t) plt.output_file(output_file) data_source = ColumnDataSource(source) p = plt.figure(x_range=TOPICS, y_range=WORDS, plot_width=1000, plot_height=1700, title=self.title) p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source) #p.xaxis().major_label_orientation = np.pi/3 logging.info("generating termite plot for file %s" % self.input_file) plt.show(p)
def test_math(ctx, db, func):
    expr = func(db.t.amount)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    np.testing.assert_allclose(
        np.sort(odo(result, np.ndarray, dshape=expr.dshape)),
        np.sort(odo(expected, np.ndarray)),
    )
def test_reductions(data):
    assert eq(compute(s.sum(), data), x.sum())
    assert eq(compute(s.sum(axis=1), data), x.sum(axis=1))
    assert eq(compute(s.sum(axis=0), data), x.sum(axis=0))
def test_by(ctx, db, grouper, reducer, reduction):
    t = db.t
    expr = by(t[grouper], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def test_strlen(ctx, db):
    expr = db.t.name.strlen()
    result = odo(compute(expr, ctx), pd.Series)
    expected = compute(expr, {db: {'t': df}})
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
def test_coerce_bool_and_sum(sql):
    n = sql.name
    t = symbol(n, discover(sql))
    expr = (t.B > 1.0).coerce(to='int32').sum()
    result = compute(expr, sql).scalar()
    expected = odo(compute(t.B, sql), pd.Series).gt(1).sum()
    assert result == expected
def plot(self, output_file="termite.html"): import blaze as blz from odo import into import pandas as pd import bokeh.plotting as plt from bokeh.models.sources import ColumnDataSource t = blz.Data(self.input_file) MAX = blz.compute(t.weight.max()) MIN = blz.compute(t.weight.min()) # Create a size variable to define the size of the the circle for the plot. t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50) WORDS = t['word'].distinct() WORDS = into(list, WORDS) topics = t['topic'].distinct() topics = into(list, topics) # Convert topics to strings TOPICS = [str(i) for i in topics] source = into(pd.DataFrame, t) plt.output_file(output_file) data_source = ColumnDataSource(source) p = plt.figure(x_range=TOPICS, y_range=WORDS, plot_width=1000, plot_height=1700, title=self.title) p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source) plt.show(p)
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))
    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )
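# For reference, a minimal scalar sketch of the same haversine arithmetic
# using the stdlib math module. In the test above, radians/sin/cos/atan2/sqrt
# are blaze expression functions, so distance() builds a symbolic expression
# that the backend evaluates rather than computing numbers eagerly.
import math

def haversine_scalar(lat1, lon1, lat2, lon2, R=3959):
    dlon = math.radians(lon2 - lon1)
    dlat = math.radians(lat2 - lat1)
    a = (math.sin(dlat / 2.0) ** 2 +
         math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2.0) ** 2)
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))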
def test_compute_on_file(file):
    s = symbol('s', discover(file))
    assert eq(compute(s.x.sum(axis=1), file), x.sum(axis=1))
    assert eq(compute(s.x.sum(), file, chunksize=(4, 6)), x.sum())
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    #       DataFrame
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx),
                 pd.DataFrame).sort("mean").reset_index(drop=True)
    expected = compute(expr, {db: {"dates": date_df}}).sort(
        "mean").reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
def test_expr_client_interactive():
    c = Client('localhost:6363')
    t = bz_data(c)

    assert compute(t.accounts.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.accounts.name,
                                 min=t.accounts.amount.min(),
                                 max=t.accounts.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))
def test_expr_client_interactive():
    ec = ExprClient('localhost:5000', 'accounts_df')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name,
                                 min=t.amount.min(),
                                 max=t.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))
def test_column(sql):
    t = Data(sql)
    r = compute(t['x'])
    assert r == [1, 10, 100]
    assert compute(t[['x']]) == [(1,), (10,), (100,)]
    assert compute(t.count()) == 3
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {"t": df, "s": cities_df}})
    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
def test_join_diff_contexts(db, ctx, cities):
    expr = join(db.t, db.s, "name")
    people = ctx.table("t")
    cities = into(ctx, cities, dshape=discover(ctx.table("s")))
    scope = {db: {"t": people, "s": cities}}
    result = compute(expr, scope)
    expected = compute(expr, {db: {"t": df, "s": cities_df}})
    assert (set(map(frozenset, odo(result, set))) ==
            set(map(frozenset, odo(expected, set))))
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df, 's': cities_df}},
                       return_type='native')
    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert isinstance(result, (SparkDataFrame, SchemaRDD))
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
def test_sample_frac(nyc):
    t = symbol('t', discover(nyc))
    result = compute(t.sample(frac=0.5), nyc, return_type=pd.DataFrame)
    num_rows = compute(t.nrows, nyc, return_type=int)
    # Round half up manually: Python's round() builtin rounds halves to
    # even (banker's rounding), which is not what we want here.
    fractional, integral = math.modf(num_rows * 0.5)
    assert int(integral + (0 if fractional < 0.5 else 1)) == len(result)
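# A minimal sketch of the round-half-up logic the assertion above encodes
# with math.modf; in Python 3, round(2.5) == 2 because halves round to even,
# which is why the test rounds manually.
import math

def round_half_up(x):
    fractional, integral = math.modf(x)
    return int(integral + (0 if fractional < 0.5 else 1))

assert round_half_up(2.5) == 3
assert round_half_up(2.4) == 2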
def test_sample(sql):
    t = symbol('t', discover(sql))
    result = compute(t.sample(n=1), sql)
    s = odo(result, pd.DataFrame)
    assert len(s) == 1
    result2 = compute(t.sample(frac=0.5), sql)
    s2 = odo(result2, pd.DataFrame)
    assert len(s) == len(s2)
def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]
def test_summary_on_series():
    ser = dd.from_pandas(pd.Series([1, 2, 3]), npartitions=2)
    s = symbol('s', '3 * int')
    expr = summary(max=s.max(), min=s.min())
    assert compute(expr, ser) == (3, 1)

    expr = summary(max=s.max(), min=s.min(), keepdims=True)
    assert compute(expr, ser) == [(3, 1)]
def test_scope_gets_updated_after_optimize_call():
    a = symbol('a', 'int')
    result = compute(a + 1, Foo('foo'), optimize=optimize)
    assert result.data == 'foo'
def test_boolean(ctx, db):
    expr = db.t.amount > 50
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
def test_nunique_spark_dataframe(ctx, db):
    result = odo(compute(db.t.nunique(), ctx, return_type='native'), int)
    expected = ctx.table('t').distinct().count()
    assert result == expected
def test_nyc_csv(nyc_csv):
    t = symbol('t', discover(nyc_csv))
    assert compute(t.nrows, nyc_csv, return_type='core') > 0
def test_projection(db, ctx):
    expr = db.t[['id', 'name']]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result) == into(set, expected)
def test_head(db, ctx):
    expr = db.t[['name', 'amount']].head(2)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(list, result) == into(list, expected)
def test_field_distinct(ctx, db):
    expr = db.t.name.distinct()
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
def test_summary_complex_arith(bank):
    expr = by(t.name, arith=(100 - t.amount * 2 / 30.0).sum())
    result = compute(expr, bank)
    reducer = lambda acc, x: (100 - x['amount'] * 2 / 30.0) + acc
    expected = reduceby('name', reducer, bank.find(), 0)
    assert set(result) == set(expected.items())
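# A minimal standalone sketch of how toolz.reduceby folds grouped records,
# assuming documents shaped like the mongo collection above; the rows and
# resulting values here are illustrative, not taken from the source.
from toolz import reduceby

rows = [{'name': 'Alice', 'amount': 30},
        {'name': 'Bob', 'amount': 60},
        {'name': 'Alice', 'amount': 90}]
reducer = lambda acc, x: (100 - x['amount'] * 2 / 30.0) + acc
# Groups rows by 'name', then folds each group with reducer from init 0:
# Alice: (100 - 2) + (100 - 6) = 192.0; Bob: 100 - 4 = 96.0
assert reduceby('name', reducer, rows, 0) == {'Alice': 192.0, 'Bob': 96.0}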
def test_sample_n(nyc):
    t = symbol('t', discover(nyc))
    result = compute(t.sample(n=14), nyc, return_type=pd.DataFrame)
    assert len(result) == 14
def test_join_type_promotion(sqla, sqlb):
    t, s = symbol(sqla.name, discover(sqla)), symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')
    result = set(map(tuple,
                     compute(expr, {t: sqla, s: sqlb},
                             return_type='native').execute().fetchall()))
    expected = set([(1, 'a', 'a'), (1, None, 'a')])
    assert result == expected
def test_relabel_columns_over_selection(big_sql):
    t = symbol('t', discover(big_sql))
    result = compute(t[t['B'] == 2].relabel(B=u'b'),
                     big_sql, return_type=pd.DataFrame)
    expected = pd.DataFrame([['a', 2]], columns=[u'A', u'b'])
    tm.assert_frame_equal(result, expected)
def test_postgres_isnan(sql_with_float):
    dta = (1.0,), (float('nan'),)
    table = odo(dta, sql_with_float)
    sym = symbol('s', discover(dta))
    assert compute(sym.isnan(), table, return_type=list) == [(False,), (True,)]
def test_by_summary(db, ctx):
    t = db.t
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result) == into(set, expected)
def test_reductions(ctx, db, field, reduction):
    expr = getattr(db.t[field], reduction)()
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(list, result)[0][0] == expected
def test_symbol_compute(db, ctx):
    assert isinstance(compute(db.t, ctx, return_type='native'), SparkDataFrame)
def test_selection_field(ctx, db):
    expr = db.t[db.t.amount > 50].name
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
def test_compute_chunks_on_single_csv():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    expr = s.sepal_length.max()
    assert compute(expr, {s: csv}, comfortable_memory=10, chunksize=50) == 7.9
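# A minimal sketch of the chunked-reduction strategy this test exercises:
# reduce each chunk, then reduce the per-chunk results. The pandas chunked
# CSV reader stands in for blaze's internal chunking here, and the path is
# illustrative.
import pandas as pd

chunks = pd.read_csv('iris.csv', chunksize=50)
assert max(chunk['sepal_length'].max() for chunk in chunks) == 7.9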
def test_like_multiple_no_match(big_bank):
    # make sure we aren't OR-ing the matches
    expr = bigt.like(name='*York*', city='*Bob*')
    result = compute(expr, big_bank)
    assert not set(result)
def test_floor_ceil(bank):
    t = symbol('t', discover(bank))
    assert set(compute(200 * floor(t.amount / 200), bank)) == set([0, 200])
    assert set(compute(200 * ceil(t.amount / 200), bank)) == set([200, 400])
def test_core_compute(ctx, db):
    assert isinstance(compute(db.t, ctx, return_type='core'), pd.DataFrame)
    assert isinstance(compute(db.t.amount, ctx, return_type='core'), pd.Series)
    assert iscorescalar(compute(db.t.amount.mean(), ctx, return_type='core'))
    assert isinstance(compute(db.t, ctx, return_type=list), list)
def test_missing_values(missing_vals):
    assert discover(missing_vals).subshape[0] == \
        dshape('{x: int64, y: ?int64, z: ?int64}')
    assert set(compute(p.y, missing_vals)) == set([None, 20, None, 40])
def test_like(bank):
    bank.create_index([('name', pymongo.TEXT)])
    expr = t.like(name='*Alice*')
    result = compute(expr, bank)
    assert set(result) == set((('Alice', 100), ('Alice', 200)))
def test_like_multiple(big_bank):
    expr = bigt.like(name='*Bob*', city='*York*')
    result = compute(expr, big_bank)
    assert set(result) == set((('Bob', 100, 'New York City'),
                               ('Bob', 200, 'New York City')))
def test_by_non_native_ops(ctx, db):
    expr = by(db.t.id, total=db.t.id.nunique())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_map(ctx, db):
    expr = db.t.id.map(lambda x: x + 1, 'int')
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
def test_isin(ctx, db, keys):
    expr = db.t[db.t.id.isin(keys)]
    result = odo(compute(expr, ctx, return_type='native'), set)
    expected = odo(compute(expr, {db: {'t': df}}, return_type='native'), set)
    assert (set(map(frozenset, odo(result, list))) ==
            set(map(frozenset, odo(expected, list))))
def test_column_arithmetic(ctx, db):
    expr = db.t.amount + 1
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
def test_grouper_with_arith(ctx, db):
    expr = by(db.t[['id', 'amount']], total=(db.t.amount + 1).sum())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (list(map(set, into(list, result))) ==
            list(map(set, into(list, expected))))
def test_sample_bounded(nyc):
    t = symbol('t', discover(nyc))
    nrows = compute(t.nrows, nyc, return_type=int)
    result = compute(t.sample(n=2 * nrows), nyc, return_type=pd.DataFrame)
    assert len(result) == nrows
def test_core_compute(nyc):
    t = symbol('t', discover(nyc))
    assert isinstance(compute(t, nyc, return_type='core'), pd.DataFrame)
    assert isinstance(compute(t.passenger_count, nyc, return_type='core'),
                      pd.Series)
    assert iscorescalar(compute(t.passenger_count.mean(), nyc,
                                return_type='core'))
    assert isinstance(compute(t, nyc, return_type=list), list)