def test_summary_on_ndarray():
    """Summarise the module-level array ``ax`` with and without ``keepdims``."""
    smallest, total = ax.min(), ax.sum()

    # Without keepdims the result is compared against a plain (min, total) tuple.
    flat_expr = summary(total=a.sum(), min=a.min())
    assert compute(flat_expr, ax) == (smallest, total)

    # With keepdims the result must keep the dimensionality of the input.
    kept_expr = summary(total=a.sum(), min=a.min(), keepdims=True)
    result = compute(kept_expr, ax)
    record_dtype = [("min", "float32"), ("total", "float64")]
    expected = np.array([(smallest, total)], dtype=record_dtype)

    assert result.ndim == ax.ndim
    assert eq(expected, result)
def test_summary_axis():
    """Check the dshape of ``summary`` for several ``axis``/``keepdims`` options."""
    x = symbol('x', '5 * 3 * float32')

    # (extra keyword arguments, expected datashape string)
    cases = [
        ({'axis': 0}, '3 * {a: float32, b: float32}'),
        ({'axis': 1}, '5 * {a: float32, b: float32}'),
        ({'axis': 1, 'keepdims': True}, '5 * 1 * {a: float32, b: float32}'),
    ]
    for options, expected in cases:
        reduced = summary(a=x.min(), b=x.max(), **options)
        assert reduced.dshape == dshape(expected)
def test_summary_on_series():
    """Compute a max/min summary over a pandas ``Series``."""
    values = Series([1, 2, 3])
    s = symbol('s', '3 * int')

    flat = summary(max=s.max(), min=s.min())
    assert compute(flat, values) == (3, 1)

    # keepdims=True is expected to wrap the record in a one-element list.
    kept = summary(max=s.max(), min=s.min(), keepdims=True)
    assert compute(kept, values) == [(3, 1)]
def test_summary_on_series():
    """Compute a max/min summary over a two-partition dask series."""
    source = pd.Series([1, 2, 3])
    partitioned = dd.from_pandas(source, npartitions=2)
    s = symbol('s', '3 * int')

    flat = summary(max=s.max(), min=s.min())
    assert compute(flat, partitioned) == (3, 1)

    # keepdims=True is expected to wrap the record in a one-element list.
    kept = summary(max=s.max(), min=s.min(), keepdims=True)
    assert compute(kept, partitioned) == [(3, 1)]
def test_summary_on_ndarray():
    """Summarise the module-level array ``ax`` with and without ``keepdims``."""
    expected_pair = (ax.min(), ax.sum())

    plain = compute(summary(total=a.sum(), min=a.min()), ax)
    assert plain == expected_pair

    result = compute(
        summary(total=a.sum(), min=a.min(), keepdims=True),
        ax,
    )
    expected = np.array(
        [expected_pair],
        dtype=[('min', 'float32'), ('total', 'float64')],
    )
    # The keepdims result must have as many dimensions as the input array.
    assert result.ndim == ax.ndim
    assert eq(expected, result)
def test_summary_by():
    """Group by name and compare the printed result with an expected frame."""
    columns = ['name', 'count', 'sum']

    # Plain sum of the amount column.
    plain = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    plain_expected = DataFrame(
        [['Alice', 2, 150], ['Bob', 1, 200]],
        columns=columns,
    )
    assert str(compute(plain, df)) == str(plain_expected)

    # Element-wise arithmetic applied before the reduction.
    shifted = by(t.name,
                 summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    shifted_expected = DataFrame(
        [['Alice', 2, 152], ['Bob', 1, 201]],
        columns=columns,
    )
    assert str(compute(shifted, df)) == str(shifted_expected)
def test_summary_by():
    """Group by name and check both a plain and a pre-shifted sum summary.

    Each scenario is asserted separately with ``tm.assert_frame_equal``.
    """
    # Scenario 1: plain sum of the amount column.
    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    result = compute(expr, df)
    expected = DataFrame([['Alice', 2, 150],
                          ['Bob', 1, 200]], columns=['name', 'count', 'sum'])
    # This check was previously missing: the values above were overwritten
    # by the second scenario before ever being compared.
    tm.assert_frame_equal(result, expected)

    # Scenario 2: element-wise arithmetic applied before the reduction.
    expr = by(t.name, summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    result = compute(expr, df)
    expected = DataFrame([['Alice', 2, 152],
                          ['Bob', 1, 201]], columns=['name', 'count', 'sum'])
    tm.assert_frame_equal(result, expected)
def test_summary_by():
    """Group by name with three sum variants and compare results as sets."""
    # Plain sum.
    plain = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    assert set(compute(plain, data)) == {('Alice', 2, 150), ('Bob', 1, 200)}

    # Arithmetic applied to each element before the reduction.
    before = by(t.name,
                summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    assert set(compute(before, data)) == {('Alice', 2, 152), ('Bob', 1, 201)}

    # Arithmetic applied to the reduction result.
    after = by(t.name, summary(count=t.id.count(), sum=t.amount.sum() + 1))
    assert set(compute(after, data)) == {('Alice', 2, 151), ('Bob', 1, 201)}
def test_summary():
    """Check dshape, hashing, string round-trip and child of a ``summary``."""
    # NOTE: the local must stay named ``t``; eval(str(...)) below resolves
    # names from this function's scope.
    t = TableSymbol("t", "{id: int32, name: string, amount: int32}")
    total = t.amount.sum()
    expr = summary(total=total, num=t.id.count())

    assert expr.dshape == dshape("{num: int32, total: int64}")
    assert hash(expr)
    assert eval(str(expr)).isidentical(expr)

    # The printed form must mention the call, both field names and the
    # printed form of the inner reduction.
    for fragment in ("summary(", "total=", "num=", str(t.amount.sum())):
        assert fragment in str(expr)

    single = summary(total=t.amount.sum())
    assert not single._child.isidentical(t.amount.sum())

    shifted = summary(total=t.amount.sum() + 1)
    assert iscollection(shifted._child.dshape)
def test_summary():
    """Check dshape, hashing, string round-trip and child of a ``summary``."""
    # NOTE: the local must stay named ``t``; eval(str(...)) below resolves
    # names from this function's scope.
    t = symbol('t', 'var * {id: int32, name: string, amount: int32}')
    smry = summary(total=t.amount.sum(), num=t.id.count())

    assert smry.dshape == dshape('{num: int32, total: int64}')
    assert hash(smry)
    assert eval(str(smry)).isidentical(smry)

    printed = str(smry)
    assert 'summary(' in printed
    assert 'total=' in printed
    assert 'num=' in printed
    assert str(t.amount.sum()) in printed

    # The child of a summary is not the bare reduction itself ...
    only_total = summary(total=t.amount.sum())
    assert not only_total._child.isidentical(t.amount.sum())
    # ... and it has a collection dshape.
    shifted_total = summary(total=t.amount.sum() + 1)
    assert iscollection(shifted_total._child.dshape)
def test_summary_with_mean():
    """Split a summary that contains a mean into chunk and aggregate parts."""
    whole = summary(a=t.amount.count(), b=t.id.mean() + 1)
    (chunk, chunk_expr), (agg, agg_expr) = split(t, whole)

    assert chunk.schema == t.schema

    # Per chunk, the mean is expected as a separate total and count.
    per_chunk = summary(a=chunk.amount.count(),
                        b_total=chunk.id.sum(),
                        b_count=chunk.id.count(),
                        keepdims=True)
    assert chunk_expr.isidentical(per_chunk)

    # Disabled check:
    # assert not agg.schema == dshape('{a: int32, b: int32}')

    mean_of_id = agg.b_total.sum() / agg.b_count.sum()
    expected = summary(a=agg.a.sum(), b=mean_of_id + 1)
    assert agg_expr.isidentical(expected)
def test_summary():
    """Check dshape, hashing, string round-trip and child of a ``summary``."""
    # NOTE: the local must stay named ``t``; eval(str(...)) below resolves
    # names from this function's scope.
    t = TableSymbol('t', '{id: int32, name: string, amount: int32}')
    smry = summary(total=t.amount.sum(), num=t.id.count())

    assert smry.dshape == dshape('{num: int32, total: int32}')
    assert hash(smry)
    assert eval(str(smry)).isidentical(smry)

    for fragment in ('summary(', 'total=', 'num=', str(t.amount.sum())):
        assert fragment in str(smry)

    # The child of a summary is not the bare reduction itself ...
    only_total = summary(total=t.amount.sum())
    assert not only_total.child.isidentical(t.amount.sum())
    # ... and it is a table expression.
    shifted_total = summary(total=t.amount.sum() + 1)
    assert isinstance(shifted_total.child, TableExpr)
def test_summary_with_mean():
    """Split a summary that contains a mean into chunk and aggregate parts."""
    chunk_pair, agg_pair = split(
        t, summary(a=t.amount.count(), b=t.id.mean() + 1))
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    assert chunk.schema == t.schema

    # Per chunk, the mean is expected as a separate total and count.
    expected_chunk = summary(
        a=chunk.amount.count(),
        b_total=chunk.id.sum(),
        b_count=chunk.id.count(),
        keepdims=True,
    )
    assert chunk_expr.isidentical(expected_chunk)

    # Disabled check:
    # assert not agg.schema == dshape('{a: int32, b: int32}')

    expected_agg = summary(
        a=agg.a.sum(),
        b=(agg.b_total.sum() / agg.b_count.sum()) + 1,
    )
    assert agg_expr.isidentical(expected_agg)
def test_elemwise_with_multiple_paths():
    """Split a ratio of two reductions taken over different columns."""
    s = symbol('s', 'var * {x: int, y: int, z: int}')
    ratio = s.x.sum() / s.y.sum()

    (chunk, chunk_expr), (agg, agg_expr) = split(s, ratio)

    # Each chunk is expected to carry both partial sums ...
    per_chunk = summary(x=chunk.x.sum(), y=chunk.y.sum())
    assert chunk_expr.isidentical(per_chunk)
    # ... and the aggregate is expected to divide them.
    assert agg_expr.isidentical(agg.x / agg.y)
def test_lean_by_with_summary():
    """``lean_projection`` on a ``by`` should only keep the columns it uses."""
    tt = t[["x", "y"]]

    grouped = lean_projection(by(t.x, total=t.y.sum()))
    assert grouped._child.isidentical(t[["x", "y"]])

    # Only ``a`` is selected afterwards, so ``b`` (and column z) is
    # expected to be dropped from the leaned expression.
    selected = by(t.x, a=t.y.sum(), b=t.z.sum())[["x", "a"]]
    result = lean_projection(selected)

    grouping = By(Field(tt, "x"), summary(a=sum(Field(tt, "y"))))
    expected = Projection(grouping, ("x", "a"))
    assert result.isidentical(expected)
def test_summary_clean():
    """Compile a summary over a filtered table and compare the SQL text."""
    positive = t[t.amount > 0]
    expr = summary(a=positive.amount.sum(), b=positive.id.count())

    generated = str(compute(expr, s))
    expected_sql = """ SELECT sum(accounts.amount) as a, count(accounts.id) as b FROM accounts WHERE accounts.amount > :amount_1"""

    assert normalize(generated) == normalize(expected_sql)
def test_summary_by():
    """Compile a grouped summary and look for the expected SQL fragments."""
    grouped = by(t.name, summary(a=t.amount.sum(), b=t.id.count()))
    # Compare case-insensitively by lower-casing the generated SQL once.
    sql = str(compute(grouped, s)).lower()

    for fragment in ('sum(accounts.amount) as a',
                     'count(accounts.id) as b',
                     'group by accounts.name'):
        assert fragment in sql
def test_mean():
    """Split a mean into a per-chunk total/count and their aggregate ratio."""
    chunk_pair, agg_pair = split(t, t.amount.mean())
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    assert chunk.schema == t.schema

    per_chunk = summary(total=chunk.amount.sum(),
                        count=chunk.amount.count(),
                        keepdims=True)
    assert chunk_expr.isidentical(per_chunk)

    assert isrecord(agg.dshape.measure)
    # The ``count`` field is reached by item access.
    grand_total = agg.total.sum()
    grand_count = agg['count'].sum()
    assert agg_expr.isidentical(grand_total / grand_count)
def test_complex_summaries():
    """Split a summary mixing mean, std and sum into chunk/aggregate parts."""
    t = symbol('t', '100 * {a: int, b: int}')
    whole = summary(q=t.a.mean(), w=t.a.std(), e=t.a.sum())
    (chunk, chunk_expr), (agg, agg_expr) = split(t, whole)

    # Each chunk is expected to hold the pieces needed to rebuild every
    # reduction, with names prefixed by the field they belong to.
    per_chunk = summary(e=chunk.a.sum(),
                        q_count=chunk.a.count(),
                        q_total=chunk.a.sum(),
                        w_n=chunk.a.count(),
                        w_x=chunk.a.sum(),
                        w_x2=(chunk.a ** 2).sum(),
                        keepdims=True)
    assert chunk_expr.isidentical(per_chunk)

    mean_of_squares = agg.w_x2.sum() / agg.w_n.sum()
    square_of_mean = (agg.w_x.sum() / agg.w_n.sum()) ** 2
    expected = summary(e=agg.e.sum(),
                       q=agg.q_total.sum() / agg.q_count.sum(),
                       w=sqrt(mean_of_squares - square_of_mean))
    assert agg_expr.isidentical(expected)
def test_complex_summaries():
    """Split a summary mixing mean, std and sum into chunk/aggregate parts."""
    t = symbol('t', '100 * {a: int, b: int}')
    chunk_pair, agg_pair = split(
        t, summary(q=t.a.mean(), w=t.a.std(), e=t.a.sum()))
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    expected_chunk = summary(
        e=chunk.a.sum(),
        q_count=chunk.a.count(),
        q_total=chunk.a.sum(),
        w_n=chunk.a.count(),
        w_x=chunk.a.sum(),
        w_x2=(chunk.a**2).sum(),
        keepdims=True,
    )
    assert chunk_expr.isidentical(expected_chunk)

    # The std is rebuilt as sqrt(mean of squares - square of mean).
    rebuilt_mean = agg.q_total.sum() / agg.q_count.sum()
    rebuilt_std = sqrt((agg.w_x2.sum() / agg.w_n.sum())
                       - (agg.w_x.sum() / agg.w_n.sum())**2)
    expected_agg = summary(e=agg.e.sum(), q=rebuilt_mean, w=rebuilt_std)
    assert agg_expr.isidentical(expected_agg)
def test_mean():
    """Split a mean into a per-chunk total/count and their aggregate ratio."""
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.mean())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(summary(total=chunk.amount.sum(),
                                          count=chunk.amount.count(),
                                          keepdims=True))

    assert isrecord(agg.dshape.measure)
    # Use item access for the ``count`` field: attribute access
    # (``agg.count``) is ambiguous because ``count`` is also the name of a
    # reduction on expressions.
    assert agg_expr.isidentical(agg.total.sum() / agg['count'].sum())
def test_lean_by_with_summary():
    """``lean_projection`` on a ``by`` should only keep the columns it uses."""
    leaned = lean_projection(by(t.x, total=t.y.sum()))
    assert leaned._child.isidentical(t[['x', 'y']])

    tt = t[['x', 'y']]
    full_grouping = by(t.x, a=t.y.sum(), b=t.z.sum())
    result = lean_projection(full_grouping[['x', 'a']])

    # ``b`` is not selected, so only the x/y projection and the ``a``
    # reduction are expected to remain.
    lean_grouping = By(Field(tt, 'x'), summary(a=sum(Field(tt, 'y'))))
    expected = Projection(lean_grouping, ('x', 'a'))
    assert result.isidentical(expected)
def test_summary_on_ndarray_with_axis():
    """Summarise ``ax`` along several axes and compare with numpy directly."""
    for axis in (0, 1, (1, 0)):
        expr = summary(total=a.sum(), min=a.min(), axis=axis)
        result = compute(expr, ax)

        # Build the expected record array from the expression's dshape and
        # fill each field with the matching numpy reduction.
        out_shape, out_dtype = to_numpy(expr.dshape)
        expected = np.empty(shape=out_shape, dtype=out_dtype)
        expected['total'] = ax.sum(axis=axis)
        expected['min'] = ax.min(axis=axis)

        assert eq(result, expected)
def test_std():
    """Split a standard deviation into chunk sums and an aggregate formula."""
    chunk_pair, agg_pair = split(t, t.amount.std())
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    assert chunk.schema == t.schema

    per_chunk = summary(x=chunk.amount.sum(),
                        x2=(chunk.amount ** 2).sum(),
                        n=chunk.amount.count(),
                        keepdims=True)
    assert chunk_expr.isidentical(per_chunk)

    assert isrecord(agg.dshape.measure)

    # sqrt(mean of squares - square of mean)
    mean_of_squares = agg.x2.sum() / agg.n.sum()
    square_of_mean = (agg.x.sum() / agg.n.sum()) ** 2
    assert agg_expr.isidentical(sqrt(mean_of_squares - square_of_mean))
def test_var():
    """Split a variance into chunk sums and an aggregate formula."""
    chunk_pair, agg_pair = split(t, t.amount.var())
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    assert chunk.schema == t.schema

    per_chunk = summary(
        x=chunk.amount.sum(),
        x2=(chunk.amount**2).sum(),
        n=chunk.amount.count(),
        keepdims=True,
    )
    assert chunk_expr.isidentical(per_chunk)

    assert isrecord(agg.dshape.measure)

    # mean of squares - square of mean
    mean_of_squares = agg.x2.sum() / agg.n.sum()
    square_of_mean = (agg.x.sum() / agg.n.sum())**2
    assert agg_expr.isidentical(mean_of_squares - square_of_mean)
def test_path_split():
    """``path_split`` should return the expected sub-expression in each case."""
    t2 = transform(t, id=t.id * 2)
    grouped = by(t2.id, amount=t2.amount.sum())
    only_total = summary(total=t.amount.sum())

    # (expression handed to path_split, expression expected back)
    cases = [
        (t.amount.sum() + 1, t.amount.sum()),
        (t.amount.distinct().sort(), t.amount.distinct()),
        (by(t2.id, amount=t2.amount.sum()).amount + 1, grouped),
        (count(t.amount.distinct()), t.amount.distinct()),
        # A summary is expected to come back unchanged.
        (only_total, only_total),
    ]
    for expr, expected in cases:
        assert path_split(t, expr).isidentical(expected)
def test_join_on_same_table():
    """Self-join a SQLAlchemy table and compare the SQL of three queries."""
    metadata = sa.MetaData()
    T = sa.Table(
        'tab',
        metadata,
        sa.Column('a', sa.Integer),
        sa.Column('b', sa.Integer),
    )
    t = symbol('tab', discover(T))
    scope = {t: T}

    # 1. The bare self-join.
    joined = join(t, t, 'a')
    generated = str(compute(joined, scope))
    assert normalize(generated) == normalize(""" SELECT tab_left.a, tab_left.b, tab_right.b FROM tab AS tab_left JOIN tab AS tab_right ON tab_left.a = tab_right.a """)

    # 2. A reduction over one side of the join.
    left_sum = join(t, t, 'a').b_left.sum()
    generated = str(compute(left_sum, scope))
    assert normalize(generated) == normalize(""" with alias as (select tab_left.b as b from tab as tab_left join tab as tab_right on tab_left.a = tab_right.a) select sum(alias.b) as b_left_sum from alias""")

    # 3. A summary drawing on both sides of the join.
    joined = join(t, t, 'a')
    both_sides = summary(total=joined.a.sum(), smallest=joined.b_right.min())
    generated = str(compute(both_sides, scope))
    assert normalize(generated) == normalize(""" SELECT min(tab_right.b) as smallest, sum(tab_left.a) as total FROM tab AS tab_left JOIN tab AS tab_right ON tab_left.a = tab_right.a """)
def test_join_on_same_table():
    """Self-join a SQLAlchemy table and compare the SQL of three queries."""
    metadata = sa.MetaData()
    columns = (sa.Column('a', sa.Integer), sa.Column('b', sa.Integer))
    T = sa.Table('tab', metadata, *columns)
    t = symbol('tab', discover(T))

    # The bare self-join.
    plain_sql = """ SELECT tab_left.a, tab_left.b, tab_right.b FROM tab AS tab_left JOIN tab AS tab_right ON tab_left.a = tab_right.a """
    result = compute(join(t, t, 'a'), {t: T})
    assert normalize(str(result)) == normalize(plain_sql)

    # A reduction over one side of the join.
    reduced_sql = """ with alias as (select tab_left.b as b from tab as tab_left join tab as tab_right on tab_left.a = tab_right.a) select sum(alias.b) as b_left_sum from alias"""
    result = compute(join(t, t, 'a').b_left.sum(), {t: T})
    assert normalize(str(result)) == normalize(reduced_sql)

    # A summary drawing on both sides of the join.
    summary_sql = """ SELECT min(tab_right.b) as smallest, sum(tab_left.a) as total FROM tab AS tab_left JOIN tab AS tab_right ON tab_left.a = tab_right.a """
    joined = join(t, t, 'a')
    expr = summary(total=joined.a.sum(), smallest=joined.b_right.min())
    result = compute(expr, {t: T})
    assert normalize(str(result)) == normalize(summary_sql)
def test_summary():
    """Split two summaries into their chunk and aggregate expressions."""
    # Case 1: two fields, one of them with arithmetic on the reduction.
    two_fields = summary(a=t.amount.count(), b=t.id.sum() + 1)
    (chunk, chunk_expr), (agg, agg_expr) = split(t, two_fields)

    assert chunk.schema == t.schema
    expected_chunk = summary(a=chunk.amount.count(),
                             b=chunk.id.sum(),
                             keepdims=True)
    assert chunk_expr.isidentical(expected_chunk)

    # Disabled check:
    # assert not agg.schema == dshape('{a: int32, b: int32}')

    # The ``+ 1`` is expected only in the aggregate step.
    expected_agg = summary(a=agg.a.sum(), b=agg.b.sum() + 1)
    assert agg_expr.isidentical(expected_agg)

    # Case 2: a single-field summary.
    one_field = summary(total=t.amount.sum())
    (chunk, chunk_expr), (agg, agg_expr) = split(t, one_field)

    assert chunk_expr.isidentical(
        summary(total=chunk.amount.sum(), keepdims=True))
    assert agg_expr.isidentical(summary(total=agg.total.sum()))
def test_summary():
    """Split two summaries into their chunk and aggregate expressions."""
    chunk_pair, agg_pair = split(
        t, summary(a=t.amount.count(), b=t.id.sum() + 1))
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(
        summary(a=chunk.amount.count(), b=chunk.id.sum(), keepdims=True))

    # Disabled check:
    # assert not agg.schema == dshape('{a: int32, b: int32}')

    # The ``+ 1`` is expected only in the aggregate step.
    assert agg_expr.isidentical(summary(a=agg.a.sum(), b=agg.b.sum() + 1))

    # Second case: a single-field summary.
    chunk_pair, agg_pair = split(t, summary(total=t.amount.sum()))
    chunk, chunk_expr = chunk_pair
    agg, agg_expr = agg_pair

    per_chunk = summary(total=chunk.amount.sum(), keepdims=True)
    assert chunk_expr.isidentical(per_chunk)
    combined = summary(total=agg.total.sum())
    assert agg_expr.isidentical(combined)
def test_summary_keepdims():
    """Check the dshape of a summary with and without ``keepdims``."""
    x = symbol('x', '5 * 3 * float32')

    collapsed = summary(a=x.min(), b=x.max())
    assert collapsed.dshape == dshape('{a: float32, b: float32}')

    # keepdims=True keeps both dimensions, each with length one.
    kept = summary(a=x.min(), b=x.max(), keepdims=True)
    assert kept.dshape == dshape('1 * 1 * {a: float32, b: float32}')
def test_summary_with_multiple_children():
    """A summary combining two columns should have the table as its child."""
    t = symbol('t', 'var * {x: int, y: int, z: int}')
    combined = summary(a=t.x.sum() + t.y.sum())
    assert combined._child.isidentical(t)
def test_summary():
    """Compare the printed summary of ``df`` with the printed expected Series."""
    expr = summary(count=t.id.count(), sum=t.amount.sum())
    expected = Series({'count': 3, 'sum': 350})
    assert str(compute(expr, df)) == str(expected)
def test_by_summary():
    """Keyword reductions passed to ``by`` should match an explicit summary."""
    t = symbol('t', 'var * {name: string, amount: int32, id: int32}')

    shorthand = by(t['name'], sum=sum(t['amount']))
    explicit = by(t['name'], summary(sum=sum(t['amount'])))

    assert shorthand.isidentical(explicit)
def test_summary():
    """Compute a count/sum summary of ``df`` and compare it with a Series."""
    expr = summary(count=t.id.count(), sum=t.amount.sum())
    result = compute(expr, df)
    expected = Series({'count': 3, 'sum': 350})
    assert_series_equal(result, expected)
def test_summary_keepdims():
    """With ``keepdims=True`` the summary of ``df`` should be a one-row frame."""
    expr = summary(count=t.id.count(), sum=t.amount.sum(), keepdims=True)
    expected = DataFrame([[3, 350]], columns=['count', 'sum'])
    result = compute(expr, df)
    tm.assert_frame_equal(result, expected)
def test_summary():
    """Compute a count/sum summary of ``data`` and compare it with a tuple."""
    expr = summary(count=t.id.count(), sum=t.amount.sum())
    result = compute(expr, data)
    assert result == (3, 350)
def test_summary_by_reduction_arithmetic():
    """Group by name with arithmetic applied to the reduction result.

    The ``sum`` field is ``t.amount.sum() + 1``, so every expected total is
    the group's plain sum plus one.
    """
    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum() + 1))
    # Alice: 150 + 1 = 151.  Bob's expected value was previously 202, which
    # does not follow the same +1 rule; 201 assumes Bob's plain sum in
    # ``df`` is 200 -- TODO confirm against the fixture.
    expected = DataFrame([['Alice', 2, 151],
                          ['Bob', 1, 201]], columns=['name', 'count', 'sum'])
    assert str(compute(expr, df)) == str(expected)
def test_summary_by_reduction_arithmetic():
    """Group by name with arithmetic applied to the reduction result."""
    shifted_sum = t.amount.sum() + 1
    grouped = by(t.name, summary(count=t.id.count(), sum=shifted_sum))

    expected = DataFrame(
        [['Alice', 2, 151], ['Bob', 1, 201]],
        columns=['name', 'count', 'sum'],
    )
    tm.assert_frame_equal(compute(grouped, df), expected)
def test_summary():
    """Compute a count/sum summary of ``ddf`` and compare it with a Series."""
    expr = summary(count=t.id.count(), sum=t.amount.sum())
    result = compute(expr, ddf)
    expected = pd.Series({'count': 3, 'sum': 350})
    # NOTE(review): the return value of ``eq`` is not asserted; presumably
    # ``eq`` raises on mismatch -- confirm against its definition.
    eq(result, expected)
def test_by_summary():
    """Keyword reductions passed to ``by`` should match an explicit summary."""
    t = TableSymbol('t', '{name: string, amount: int32, id: int32}')

    with_keyword = by(t['name'], sum=sum(t['amount']))
    with_summary = by(t['name'], summary(sum=sum(t['amount'])))

    assert with_keyword.isidentical(with_summary)
def test_summary_str():
    """The printed summary should not mention ``keepdims`` when it is not passed."""
    x = symbol('x', '5 * 3 * float32')
    printed = str(summary(a=x.min(), b=x.max()))
    assert 'keepdims' not in printed