Example #1
def test_sample(big_sql):
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    result = compute(nn.sample(n=nrows // 2), big_sql, return_type=pd.DataFrame)
    assert len(result) == nrows // 2
    result2 = compute(nn.sample(frac=0.5), big_sql, return_type=pd.DataFrame)
    assert len(result) == len(result2)
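
Nearly every example below follows the same pattern: discover the datashape of a backend, bind it to a symbol, build a backend-agnostic expression, and hand both to compute. A minimal sketch against in-memory pandas data (the column names here are illustrative assumptions, not tied to any fixture):

import pandas as pd
from blaze import symbol, discover, compute

df = pd.DataFrame({'name': ['Alice', 'Bob'], 'amount': [100, 200]})
t = symbol('t', discover(df))   # symbolic table carrying df's datashape
expr = t.amount * 2             # expression built without touching the data
print(compute(expr, df))        # evaluated against the pandas backend
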
Example #2
def test_compute():
    exprs = [2 * sx + 1,
             sx.sum(axis=0),
             sx.mean(axis=0),
             sx + sx,
             sx.T,
             sx.T + sy,
             sx.dot(sy),
             sy.dot(sx),
             sx.sum(),
             sx - sx.sum(),
             sx.dot(sx.T),
             sx.sum(axis=1),
             sy + sa,
             sy + sb,
             sx[3:17],
             sx[3:10, 10:25:2] + 1,
             sx[:5, 10],
             sx[0, 0]]
    for expr in exprs:
        result = compute(expr, dask_ns)
        expected = compute(expr, numpy_ns)
        assert isinstance(result, Array)
        if expr.dshape.shape:
            result2 = np.array(result)
        else:
            result2 = float(result)
        assert eq(result2, expected)
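
The sx, sy, sa and sb symbols and the dask_ns/numpy_ns scopes above come from test fixtures; a sketch of how such paired namespaces are typically built (the shape, dtype and chunk size are assumptions):

import numpy as np
import dask.array as da
from blaze import symbol

x = np.arange(600, dtype='int64').reshape((20, 30))
sx = symbol('x', '20 * 30 * int64')
numpy_ns = {sx: x}                                # evaluated eagerly by NumPy
dask_ns = {sx: da.from_array(x, chunks=(4, 5))}   # same data, chunked for dask
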
Example #3
def test_literals(db, ctx):
    expr = db.t[db.t.amount >= 100]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
Example #4
def test_field_access(db, ctx):
    for field in db.t.fields:
        expr = getattr(db.t, field)
        result = into(pd.Series, compute(expr, ctx))
        expected = compute(expr, {db: {"t": df}})
        assert result.name == expected.name
        np.testing.assert_array_equal(result.values, expected.values)
Example #5
def test_sort(ctx, db, field, ascending):
    expr = db.t.sort(field, ascending=ascending)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
Example #6
def test_selection(ctx, db):
    expr = db.t[db.t.amount > 50]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
Example #7
def test_str_len(ctx, db):
    expr = db.t.name.str.len()
    result = odo(compute(expr, ctx, return_type='native'), pd.Series)
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
Example #8
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
Example #9
def test_url_csv_data(iris_local):
    iris_remote = data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)
Example #10
    def plot(self, output_file="termite.html"):
        import logging

        import blaze as blz
        import pandas as pd
        import bokeh.plotting as plt
        from bokeh.models.sources import ColumnDataSource
        from odo import into

        t = blz.Data(self.input_file)
        df = pd.read_csv(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = t['word'].distinct()
        WORDS = into(list, WORDS)
        topics = t['topic'].distinct()
        topics = into(list, topics)
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        plt.output_file(output_file)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
               plot_width=1000, plot_height=1700,
               title=self.title)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        #p.xaxis().major_label_orientation = np.pi/3
        logging.info("generating termite plot for file %s", self.input_file)
        plt.show(p)
Example #11
def test_math(ctx, db, func):
    expr = func(db.t.amount)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    np.testing.assert_allclose(np.sort(odo(result, np.ndarray,
                                           dshape=expr.dshape)),
                               np.sort(odo(expected, np.ndarray)))
Example #12
def test_reductions(data):
    assert eq(compute(s.sum(), data),
              x.sum())
    assert eq(compute(s.sum(axis=1), data),
              x.sum(axis=1))
    assert eq(compute(s.sum(axis=0), data),
              x.sum(axis=0))
Example #13
def test_by(ctx, db, grouper, reducer, reduction):
    t = db.t
    expr = by(t[grouper], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
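
by is blaze's grouping construct; on the pandas backend it amounts to a groupby-aggregate. A minimal sketch with assumed column names:

import pandas as pd
from blaze import by, symbol, discover, compute

df = pd.DataFrame({'name': ['Alice', 'Bob', 'Alice'],
                   'amount': [100, 200, 300]})
t = symbol('t', discover(df))
expr = by(t.name, total=t.amount.sum())
print(compute(expr, df))   # one row per name: Alice 400, Bob 200
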
Example #14
def test_strlen(ctx, db):
    expr = db.t.name.strlen()
    result = odo(compute(expr, ctx), pd.Series)
    expected = compute(expr, {db: {'t': df}})
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
Example #15
def test_coerce_bool_and_sum(sql):
    n = sql.name
    t = symbol(n, discover(sql))
    expr = (t.B > 1.0).coerce(to='int32').sum()
    result = compute(expr, sql).scalar()
    expected = odo(compute(t.B, sql), pd.Series).gt(1).sum()
    assert result == expected
Example #16
    def plot(self, output_file="termite.html"):
        import blaze as blz
        from odo import into
        import pandas as pd
        import bokeh.plotting as plt
        from bokeh.models.sources import ColumnDataSource

        t = blz.Data(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = t['word'].distinct()
        WORDS = into(list, WORDS)
        topics = t['topic'].distinct()
        topics = into(list, topics)
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        plt.output_file(output_file)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
                       plot_width=1000, plot_height=1700,
                       title=self.title)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        plt.show(p)
Example #17
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )
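
Here radians, sin, cos, atan2 and sqrt are blaze's symbolic math functions, so distance builds one column expression that the backend evaluates. Note the formula takes cos of the raw degree-valued latitudes, which is harmless for the test's self-consistency assertion; a standalone float version with the usual conversion might look like this (the coordinates, roughly midtown Manhattan to JFK, are made up for illustration):

from math import radians, sin, cos, atan2, sqrt

def haversine_miles(lat1, lon1, lat2, lon2, R=3959):
    # http://andrew.hedges.name/experiments/haversine/
    dlon = radians(lon2 - lon1)
    dlat = radians(lat2 - lat1)
    a = (sin(dlat / 2.0) ** 2 +
         cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2.0) ** 2)
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

print(haversine_miles(40.7549, -73.9840, 40.6413, -73.7781))  # ~13 miles
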
Example #18
def test_compute_on_file(file):
    s = symbol('s', discover(file))

    assert eq(compute(s.x.sum(axis=1), file),
              x.sum(axis=1))

    assert eq(compute(s.x.sum(), file, chunksize=(4, 6)),
              x.sum())
Example #19
def test_by_with_date(ctx, db, attr):
    # TODO: investigate CSV writing precision between pandas 0.16.0 and 0.16.1
    # TODO: see if we can use odo to convert the dshape of an existing
    #       DataFrame
    expr = by(getattr(db.dates.ds, attr), mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), pd.DataFrame).sort("mean").reset_index(drop=True)
    expected = compute(expr, {db: {"dates": date_df}}).sort("mean").reset_index(drop=True)
    tm.assert_frame_equal(result, expected, check_dtype=False)
Example #20
def test_expr_client_interactive():
    c = Client('localhost:6363')
    t = bz_data(c)

    assert compute(t.accounts.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.accounts.name, min=t.accounts.amount.min(),
                                                  max=t.accounts.amount.max())))
            == set([('Alice', 100, 100), ('Bob', 200, 200)]))
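
This test presupposes a blaze server already listening on localhost:6363 (blaze's default port); a sketch of the matching server side, with the accounts data assumed from the assertions above:

import pandas as pd
from blaze import Server

accounts = pd.DataFrame({'name': ['Alice', 'Bob'], 'amount': [100, 200]})
Server({'accounts': accounts}).run()   # serves on the default port, 6363
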
Example #21
def test_expr_client_interactive():
    ec = ExprClient('localhost:5000', 'accounts_df')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name, min=t.amount.min(),
                                 max=t.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))
Example #22
def test_column(sql):
    t = Data(sql)

    r = compute(t['x'])
    assert r == [1, 10, 100]
    assert compute(t[['x']]) == [(1,), (10,), (100,)]

    assert compute(t.count()) == 3
Example #23
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {"t": df, "s": cities_df}})

    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
Example #24
def test_join_diff_contexts(db, ctx, cities):
    expr = join(db.t, db.s, "name")
    people = ctx.table("t")
    cities = into(ctx, cities, dshape=discover(ctx.table("s")))
    scope = {db: {"t": people, "s": cities}}
    result = compute(expr, scope)
    expected = compute(expr, {db: {"t": df, "s": cities_df}})
    assert set(map(frozenset, odo(result, set))) == set(map(frozenset, odo(expected, set)))
Example #25
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df, 's': cities_df}}, return_type='native')

    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
Example #26
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})

    assert isinstance(result, (SparkDataFrame, SchemaRDD))
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
Example #27
def test_sample_frac(nyc):
    t = symbol('t', discover(nyc))
    result = compute(t.sample(frac=0.5), nyc, return_type=pd.DataFrame)
    num_rows = compute(t.nrows, nyc, return_type=int)
    # Round half up manually: Python 3's round() builtin uses banker's
    # rounding, so it can't be used directly here.
    fractional, integral = math.modf(num_rows * 0.5)
    assert int(integral + (0 if fractional < 0.5 else 1)) == len(result)
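
The manual modf-based rounding exists because Python 3's round() rounds ties to the nearest even number; a short demonstration of the difference:

import math

def round_half_up(x):
    # Round-half-up for non-negative x, mirroring the test's logic.
    fractional, integral = math.modf(x)
    return int(integral + (0 if fractional < 0.5 else 1))

assert round(2.5) == 2           # banker's rounding: ties go to even
assert round_half_up(2.5) == 3   # the behavior the test expects
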
Example #28
def test_sample(sql):
    t = symbol('t', discover(sql))
    result = compute(t.sample(n=1), sql)
    s = odo(result, pd.DataFrame)
    assert len(s) == 1
    result2 = compute(t.sample(frac=0.5), sql)
    s2 = odo(result2, pd.DataFrame)
    assert len(s) == len(s2)
Example #29
def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]
Example #30
def test_summary_on_series():
    ser = dd.from_pandas(pd.Series([1, 2, 3]), npartitions=2)
    s = symbol('s', '3 * int')
    expr = summary(max=s.max(), min=s.min())
    assert compute(expr, ser) == (3, 1)

    expr = summary(max=s.max(), min=s.min(), keepdims=True)
    assert compute(expr, ser) == [(3, 1)]
Example #31
def test_scope_gets_updated_after_optimize_call():
    a = symbol('a', 'int')
    result = compute(a + 1, Foo('foo'), optimize=optimize)
    assert result.data == 'foo'
Example #32
def test_boolean(ctx, db):
    expr = db.t.amount > 50
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
Example #33
def test_nunique_spark_dataframe(ctx, db):
    result = odo(compute(db.t.nunique(), ctx, return_type='native'), int)
    expected = ctx.table('t').distinct().count()
    assert result == expected
Example #34
def test_nyc_csv(nyc_csv):
    t = symbol('t', discover(nyc_csv))
    assert compute(t.nrows, nyc_csv, return_type='core') > 0
Example #35
def test_projection(db, ctx):
    expr = db.t[['id', 'name']]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result) == into(set, expected)
Example #36
def test_head(db, ctx):
    expr = db.t[['name', 'amount']].head(2)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(list, result) == into(list, expected)
Example #37
def test_field_distinct(ctx, db):
    expr = db.t.name.distinct()
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
Example #38
def test_summary_complex_arith(bank):
    expr = by(t.name, arith=(100 - t.amount * 2 / 30.0).sum())
    result = compute(expr, bank)
    reducer = lambda acc, x: (100 - x['amount'] * 2 / 30.0) + acc
    expected = reduceby('name', reducer, bank.find(), 0)
    assert set(result) == set(expected.items())
Example #39
def test_sample_n(nyc):
    t = symbol('t', discover(nyc))
    result = compute(t.sample(n=14), nyc, return_type=pd.DataFrame)
    assert len(result) == 14
Example #40
def test_join_type_promotion(sqla, sqlb):
    t, s = symbol(sqla.name, discover(sqla)), symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')
    result = set(map(tuple, compute(expr, {t: sqla, s: sqlb}, return_type='native').execute().fetchall()))
    expected = set([(1, 'a', 'a'), (1, None, 'a')])
    assert result == expected
Example #41
def test_relabel_columns_over_selection(big_sql):
    t = symbol('t', discover(big_sql))
    result = compute(t[t['B'] == 2].relabel(B=u'b'),
                     big_sql, return_type=pd.DataFrame)
    expected = pd.DataFrame([['a', 2]], columns=[u'A', u'b'])
    tm.assert_frame_equal(result, expected)
Example #42
def test_postgres_isnan(sql_with_float):
    dta = (1.0,), (float('nan'),)
    table = odo(dta, sql_with_float)
    sym = symbol('s', discover(dta))
    assert compute(sym.isnan(), table, return_type=list) == [(False,), (True,)]
Example #43
def test_by_summary(db, ctx):
    t = db.t
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result) == into(set, expected)
Example #44
def test_reductions(ctx, db, field, reduction):
    expr = getattr(db.t[field], reduction)()
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(list, result)[0][0] == expected
Example #45
def test_symbol_compute(db, ctx):
    assert isinstance(compute(db.t, ctx, return_type='native'), SparkDataFrame)
Example #46
def test_selection_field(ctx, db):
    expr = db.t[db.t.amount > 50].name
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
Example #47
def test_compute_chunks_on_single_csv():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    expr = s.sepal_length.max()
    assert compute(expr, {s: csv}, comfortable_memory=10, chunksize=50) == 7.9
Example #48
def test_like_mulitple_no_match(big_bank):
    # make sure we aren't OR-ing the matches
    expr = bigt.like(name='*York*', city='*Bob*')
    result = compute(expr, big_bank)
    assert not set(result)
Example #49
def test_floor_ceil(bank):
    t = symbol('t', discover(bank))
    assert set(compute(200 * floor(t.amount / 200), bank)) == set([0, 200])
    assert set(compute(200 * ceil(t.amount / 200), bank)) == set([200, 400])
Example #50
def test_core_compute(ctx, db):
    assert isinstance(compute(db.t, ctx, return_type='core'), pd.DataFrame)
    assert isinstance(compute(db.t.amount, ctx, return_type='core'), pd.Series)
    assert iscorescalar(compute(db.t.amount.mean(), ctx, return_type='core'))
    assert isinstance(compute(db.t, ctx, return_type=list), list)
Example #51
def test_missing_values(missing_vals):
    assert discover(missing_vals).subshape[0] == \
            dshape('{x: int64, y: ?int64, z: ?int64}')

    assert set(compute(p.y, missing_vals)) == set([None, 20, None, 40])
Example #52
def test_like(bank):
    bank.create_index([('name', pymongo.TEXT)])
    expr = t.like(name='*Alice*')
    result = compute(expr, bank)
    assert set(result) == set((('Alice', 100), ('Alice', 200)))
Example #53
def test_like_multiple(big_bank):
    expr = bigt.like(name='*Bob*', city='*York*')
    result = compute(expr, big_bank)
    assert set(result) == set(
        (('Bob', 100, 'New York City'), ('Bob', 200, 'New York City')))
Example #54
def test_by_non_native_ops(ctx, db):
    expr = by(db.t.id, total=db.t.id.nunique())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(map(set, into(list, expected)))
Example #55
def test_map(ctx, db):
    expr = db.t.id.map(lambda x: x + 1, 'int')
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
Example #56
def test_isin(ctx, db, keys):
    expr = db.t[db.t.id.isin(keys)]
    result = odo(compute(expr, ctx, return_type='native'), set)
    expected = odo(compute(expr, {db: {'t': df}}, return_type='native'), set)
    assert (set(map(frozenset, odo(result, list))) ==
            set(map(frozenset, odo(expected, list))))
Example #57
def test_column_arithmetic(ctx, db):
    expr = db.t.amount + 1
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert into(set, result, dshape=expr.dshape) == into(set, expected)
Example #58
def test_grouper_with_arith(ctx, db):
    expr = by(db.t[['id', 'amount']], total=(db.t.amount + 1).sum())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(map(set, into(list, expected)))
Example #59
def test_sample_bounded(nyc):
    t = symbol('t', discover(nyc))
    nrows = compute(t.nrows, nyc, return_type=int)
    result = compute(t.sample(n=2*nrows), nyc, return_type=pd.DataFrame)
    assert len(result) == nrows
Example #60
def test_core_compute(nyc):
    t = symbol('t', discover(nyc))
    assert isinstance(compute(t, nyc, return_type='core'), pd.DataFrame)
    assert isinstance(compute(t.passenger_count, nyc, return_type='core'), pd.Series)
    assert iscorescalar(compute(t.passenger_count.mean(), nyc, return_type='core'))
    assert isinstance(compute(t, nyc, return_type=list), list)
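
Taken together, the examples show the three flavors of the return_type keyword: 'native' keeps the backend's own result object, 'core' coerces it into core pandas/NumPy/scalar types, and a concrete type is produced by coercing through odo. A sketch against pandas data (the column name is an assumption):

import pandas as pd
from blaze import symbol, discover, compute

df = pd.DataFrame({'amount': [100, 200, 300]})
t = symbol('t', discover(df))

compute(t.amount, df, return_type='native')   # backend-native: a pd.Series
compute(t.amount, df, return_type='core')     # forced into core pandas/NumPy
compute(t.amount, df, return_type=list)       # coerced via odo to a list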