Пример #1
0
def test_csv_into_mongodb_columns(empty_collec, file_name):
    """A CSV with an explicit schema round-trips through a Mongo collection."""
    source = CSV(file_name, schema='{x: int, y: int}')
    expected = into(list, source)
    loaded = into(empty_collec, source)
    assert expected == into(list, loaded)
Пример #2
0
def test_literals(db, ctx):
    """Filtering with a literal comparison matches the pandas backend."""
    query = db.t[db.t.amount >= 100]
    got = into(list, compute(query, ctx, return_type='native'))
    want = into(list, compute(query, {db: {'t': df}}, return_type='native'))
    assert [set(row) for row in got] == [set(row) for row in want]
Пример #3
0
def test_sort(ctx, db, field, ascending):
    """Sorting on the distributed backend matches the pandas reference."""
    query = db.t.sort(field, ascending=ascending)
    got = into(list, compute(query, ctx, return_type='native'))
    want = into(list, compute(query, {db: {'t': df}}, return_type='native'))
    assert [set(row) for row in got] == [set(row) for row in want]
Пример #4
0
def test_multikey_by(ctx, db, reducer, reduction):
    """Grouping on two keys agrees with the in-memory pandas result."""
    tbl = db.t
    agg = getattr(tbl[reducer], reduction)()
    query = by(tbl[['id', 'amount']], total=agg)
    got = into(list, compute(query, ctx, return_type='native'))
    want = into(list, compute(query, {db: {'t': df}}, return_type='native'))
    assert {frozenset(r) for r in got} == {frozenset(r) for r in want}
Пример #5
0
def test_selection(ctx, db):
    """Row filtering agrees with the in-memory pandas computation."""
    query = db.t[db.t.amount > 50]
    got = into(list, compute(query, ctx, return_type='native'))
    want = into(list, compute(query, {db: {'t': df}}, return_type='native'))
    assert [set(row) for row in got] == [set(row) for row in want]
Пример #6
0
def test_by(ctx, db, grouper, reducer, reduction):
    """Single-key grouping matches the pandas backend, order-insensitively."""
    tbl = db.t
    query = by(tbl[grouper], total=getattr(tbl[reducer], reduction)())
    got = compute(query, ctx)
    want = compute(query, {db: {'t': df}})
    assert ({frozenset(r) for r in into(list, got)} ==
            {frozenset(r) for r in into(list, want)})
Пример #7
0
def test_multikey_by(ctx, db, reducer, reduction):
    """A two-column grouping key yields the same groups as pandas."""
    frame = db.t
    reduced = getattr(frame[reducer], reduction)()
    expr_by = by(frame[['id', 'amount']], total=reduced)
    lhs = into(list, compute(expr_by, ctx, return_type='native'))
    rhs = into(list, compute(expr_by, {db: {'t': df}}, return_type='native'))
    assert set(map(frozenset, lhs)) == set(map(frozenset, rhs))
Пример #8
0
def test_sort(ctx, db, field, ascending):
    """Sorted output from the compute backend equals the pandas baseline."""
    sort_expr = db.t.sort(field, ascending=ascending)
    lhs = into(list, compute(sort_expr, ctx, return_type='native'))
    rhs = into(list, compute(sort_expr, {db: {'t': df}},
                             return_type='native'))
    assert list(map(set, lhs)) == list(map(set, rhs))
Пример #9
0
def test_csv_into_mongodb_colon_del(empty_collec, file_name_colon):
    """Colon-delimited CSV data survives loading into MongoDB."""
    source = CSV(file_name_colon)
    filled = into(empty_collec, source)
    assert into(list, source) == into(list, filled)
Пример #10
0
def test_selection(ctx, db):
    """A predicate over ``amount`` selects the same rows as pandas."""
    selected = db.t[db.t.amount > 50]
    lhs = into(list, compute(selected, ctx, return_type='native'))
    rhs = into(list, compute(selected, {db: {'t': df}},
                             return_type='native'))
    assert list(map(set, lhs)) == list(map(set, rhs))
Пример #11
0
def test_csv_into_mongodb_colon_del(empty_collec, file_name_colon):
    """Loading a colon-delimited CSV into Mongo preserves its rows."""
    src = CSV(file_name_colon)
    before = into(list, src)
    after = into(list, into(empty_collec, src))
    assert before == after
Пример #12
0
def test_csv_into_mongodb_columns(empty_collec, file_name):
    """Schema-typed CSV columns round-trip through a Mongo collection."""
    src = CSV(file_name, schema='{x: int, y: int}')
    reference = into(list, src)
    stored = into(empty_collec, src)
    assert reference == into(list, stored)
Пример #13
0
def test_literals(db, ctx):
    """A >= literal filter computes identically on both backends."""
    filtered = db.t[db.t.amount >= 100]
    lhs = into(list, compute(filtered, ctx, return_type='native'))
    rhs = into(list, compute(filtered, {db: {'t': df}},
                             return_type='native'))
    assert list(map(set, lhs)) == list(map(set, rhs))
Пример #14
0
def test_failing_argument():
    """``into`` tolerates a bad ``skipinitialspace`` value without raising."""
    table = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table, dshape=source.dshape)
    # Deliberately invalid keyword value; the call is expected to fail over
    # gracefully rather than propagate an exception.
    into(target, source, if_exists="replace", skipinitialspace="alpha")
Пример #15
0
def test_join(db, ctx):
    """Joining two tables yields a Spark frame equal to the pandas join."""
    query = join(db.t, db.s)
    got = compute(query, ctx)
    want = compute(query, {db: {'t': df, 's': cities_df}})
    assert isinstance(got, (SparkDataFrame, SchemaRDD))
    assert into(set, got) == into(set, want)
    assert discover(got) == query.dshape
Пример #16
0
def test_repr_hdma():
    """head() and column selection both compute against the HMDA sample."""
    source = CSV(example('hmda-small.csv'))
    tbl = TableSymbol('hmda', source.schema)
    assert len(into(list, compute(tbl.head(), source))) == 10
    cols = ['action_taken_name', 'agency_abbr', 'applicant_ethnicity_name']
    assert into(list, compute(tbl[cols].head(), source))
Пример #17
0
def test_csv_into_mongodb(empty_collec):
    """The CSV's first column matches '_0' in the loaded Mongo documents."""
    source = CSV(file_name)
    collection = empty_collec
    into(collection, source)
    docs = list(collection.find({}, {'_0': 1, '_id': 0}))
    assert list(source[:, '_0']) == [doc['_0'] for doc in docs]
Пример #18
0
def test_join(db, ctx):
    """A join computed on Spark equals the pandas reference join."""
    query = join(db.t, db.s)
    got = compute(query, ctx, return_type='native')
    want = compute(query, {db: {'t': df, 's': cities_df}},
                   return_type='native')
    assert isinstance(got, SparkDataFrame)
    assert into(set, got) == into(set, want)
    assert discover(got) == query.dshape
Пример #19
0
def test_join(db, ctx):
    """Native-return join produces a SparkDataFrame matching pandas."""
    joined = join(db.t, db.s)
    lhs = compute(joined, ctx, return_type='native')
    rhs = compute(joined, {db: {'t': df, 's': cities_df}},
                  return_type='native')
    assert isinstance(lhs, SparkDataFrame)
    assert into(set, lhs) == into(set, rhs)
    assert discover(lhs) == joined.dshape
Пример #20
0
def test_join_diff_contexts(db, ctx, cities):
    """A join whose sides come from different contexts still matches pandas."""
    query = join(db.t, db.s, 'name')
    people = ctx.table('t')
    city_rdd = into(SchemaRDD, cities, dshape=discover(ctx.table('s')))
    got = compute(query, {db: {'t': people, 's': city_rdd}})
    want = compute(query, {db: {'t': df, 's': cities_df}})
    assert into(set, got) == into(set, want)
Пример #21
0
def test_join(db, ctx):
    """Join result type and contents agree with the pandas computation."""
    joined = join(db.t, db.s)
    lhs = compute(joined, ctx)
    rhs = compute(joined, {db: {'t': df, 's': cities_df}})
    assert isinstance(lhs, (SparkDataFrame, SchemaRDD))
    assert into(set, lhs) == into(set, rhs)
    assert discover(lhs) == joined.dshape
Пример #22
0
def test_sql_new_schema():
    """Loading data creates the table inside a freshly created schema."""
    with non_existing_schema('myschema2'):
        target = SQL(url, 'accounts', schema_name='myschema2',
                     schema='{name: string, value: int}')
        into(target, data)
        assert engine.has_table('accounts', schema='myschema2')
        # Re-open the table through a new handle and compare contents.
        reopened = SQL(url, 'accounts', schema_name='myschema2')
        assert list(reopened) == data
Пример #23
0
def test_join(db, ctx):
    """Spark join result type and contents match the pandas computation."""
    query = join(db.t, db.s)
    got = compute(query, ctx)
    want = compute(query, {db: {'t': df, 's': cities_df}})
    assert isinstance(got, SparkDataFrame)
    assert into(set, got) == into(set, want)
    assert discover(got) == query.dshape
Пример #24
0
def test_chunks_compute():
    """Chunked and unchunked backends agree across several expressions."""
    for e in (s, s + 1, s.max(), s.mean() + 1, s.head()):
        chunked = compute(e, {s: cL})
        plain = compute(e, {s: L})
        if iscollection(e.dshape):
            chunked, plain = into(list, chunked), into(list, plain)
        assert chunked == plain
Пример #25
0
def test_chunks_compute():
    """Computing over a chunked source matches the in-memory source."""
    expressions = [s, s + 1, s.max(), s.mean() + 1, s.head()]
    for expr in expressions:
        lhs = compute(expr, {s: cL})
        rhs = compute(expr, {s: L})
        if iscollection(expr.dshape):
            lhs = into(list, lhs)
            rhs = into(list, rhs)
        assert lhs == rhs
Пример #26
0
def test_get_datetimes():
    """The JSON compute endpoint returns datetime data matching ``events``."""
    response = test.post('/compute.json',
                         data=json.dumps({'expr': 'events'}),
                         content_type='application/json')
    assert 'OK' in response.status
    payload = json.loads(response.data)
    arr = nd.array(payload['data'], type=payload['datashape'])
    assert into(list, arr) == into(list, events)
Пример #27
0
def test_no_header_no_columns():
    """A headerless CSV loads into SQL with all values intact."""
    table = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url + '::' + table, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
Пример #28
0
    def test_csv_json_chunked(self):
        """Streaming a two-row CSV into JSON preserves both records."""
        with filetext('1,1\n2,2\n') as csv_fn, filetext('') as json_fn:
            schema = '{a: int32, b: int32}'
            source = CSV(csv_fn, schema=schema)
            sink = JSON_Streaming(json_fn, mode='r+', schema=schema)
            into(sink, source)
            self.assertEquals(tuplify(tuple(sink)), ((1, 1), (2, 2)))
Пример #29
0
def test_simple_into():
    """A plain CSV-to-SQL load lands all three rows."""
    table = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
Пример #30
0
def test_simple_float_into():
    """Float CSV values survive loading into a SQL table."""
    table = 'testtable_into_float'
    source = CSV(file_name_floats, columns=['a', 'b'])
    target = resource(url + '::' + table, dshape=source.dshape)
    into(target, source, if_exists="replace")
    expected = [(1.02, 2.02), (102.02, 202.02), (1002.02, 2002.02)]
    assert into(list, target) == expected
Пример #31
0
def test_tryexcept_into():
    """Bad copy options force the fallback path and the data still loads."""
    table = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table, dshape=source.dshape)
    # The multi-byte QUOTE value makes the fast bulk-copy path fail, so the
    # loader falls over to sql.extend().
    into(target, source, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
Пример #32
0
    def test_csv_hdf5(self):
        """A two-row CSV lands intact in an HDF5 dataset."""
        from dynd import nd
        with tmpfile('hdf5') as hdf5_fn, filetext('1,1\n2,2\n') as csv_fn:
            source = CSV(csv_fn, schema='{a: int32, b: int32}')
            sink = resource(hdf5_fn + '::/data',
                            dshape='var * {a: int32, b: int32}')
            into(sink, source)
            self.assertEquals(sink[:].tolist(), [(1, 1), (2, 2)])
Пример #33
0
def test_sql_schema_behavior():
    """Tables created under an existing schema are reachable both by
    ``schema_name`` and by a schema-qualified table name.
    """
    with existing_schema('myschema'):
        sql = SQL(url, 'accounts', schema_name='myschema', schema='{name: string, value: int}')
        into(sql, data)
        assert engine.has_table('accounts', schema='myschema')

        sql2 = SQL(url, 'accounts', schema_name='myschema')
        assert list(sql2) == data

        # BUG FIX: sql3 was created but never checked — the original
        # asserted ``list(sql2)`` twice, leaving the qualified-name
        # access path untested.
        sql3 = SQL(url, 'myschema.accounts')
        assert list(sql3) == data
Пример #34
0
def test_leaf_symbol(test, serial):
    """Fetching the 'cities' field via ':leaf' matches local data and names."""
    query = {'expr': {'op': 'Field', 'args': [':leaf', 'cities']}}
    resp = test.post('/compute',
                     data=serial.dumps(query),
                     headers=mimetype(serial))
    payload = serial.loads(resp.data)
    remote = serial.data_loads(payload['data'])
    local = into(list, cities)
    assert [tuple(row) for row in into(list, remote)] == local
    assert list(payload['names']) == cities.columns.tolist()
Пример #35
0
def test_hour():
    """Truncating to 1-hour granularity floors minutes and seconds."""
    raw = [datetime(2000, 6, 20, 1, 0, 0),
           datetime(2000, 6, 20, 12, 59, 59),
           datetime(2000, 6, 20, 12, 0, 0),
           datetime(2000, 6, 20, 11, 59, 59)]
    arr = into(np.ndarray, raw)
    floored = [datetime(2000, 6, 20, 1, 0),
               datetime(2000, 6, 20, 12, 0),
               datetime(2000, 6, 20, 12, 0),
               datetime(2000, 6, 20, 11, 0)]
    assert eq(compute(s.truncate(1, 'hour'), arr),
              into(np.ndarray, floored))
Пример #36
0
def test_month():
    """Truncating to 1-month granularity floors to the first of the month."""
    raw = [datetime(2000, 7, 1),
           datetime(2000, 6, 30),
           datetime(2000, 6, 1),
           datetime(2000, 5, 31)]
    arr = into(np.ndarray, raw)
    firsts = [date(2000, 7, 1),
              date(2000, 6, 1),
              date(2000, 6, 1),
              date(2000, 5, 1)]
    assert eq(compute(s.truncate(1, 'month'), arr),
              into(np.ndarray, firsts))
Пример #37
0
def test_leaf_symbol(test, serial):
    """A ':leaf' Field query returns the 'cities' rows and column names."""
    field_query = {'expr': {'op': 'Field', 'args': [':leaf', 'cities']}}
    response = test.post('/compute',
                         data=serial.dumps(field_query),
                         headers=mimetype(serial))
    body = serial.loads(response.data)
    fetched = serial.data_loads(body['data'])
    expected = into(list, cities)
    assert list(map(tuple, into(list, fetched))) == expected
    assert list(body['names']) == cities.columns.tolist()
Пример #38
0
 def test_ndarray_into_table(self, dt_tb, dt_data):
     """Round-trip a numpy record array through a PyTables table.

     Each field is compared individually; datetime64 fields are normalised
     to microsecond precision on both sides before comparison.
     """
     dtype = ds.from_numpy(dt_data.shape, dt_data.dtype)
     t = PyTables(dt_tb, '/out', dtype)
     try:
         res = into(np.ndarray, into(t, dt_data, filename=dt_tb, datapath='/out'))
         for k in res.dtype.fields:
             lhs, rhs = res[k], dt_data[k]
             # issubclass args read reversed; this is True exactly when
             # dtype.type is datetime64 itself (issubclass is reflexive),
             # i.e. it detects datetime64 columns on both sides.
             if (issubclass(np.datetime64, lhs.dtype.type) and
                 issubclass(np.datetime64, rhs.dtype.type)):
                 # Unify units so M8[ns] vs M8[us] storage differences
                 # don't fail the equality check.
                 lhs, rhs = lhs.astype('M8[us]'), rhs.astype('M8[us]')
             assert np.array_equal(lhs, rhs)
     finally:
         # Always release the underlying HDF5 file handle.
         t._v_file.close()
Пример #39
0
def test_get_datetimes(app_context):
    """Datetime columns round-trip through the JSON compute endpoint."""
    query = {'expr': to_tree(t.events)}
    response = test.post('/compute.json',
                         data=json.dumps(query),
                         content_type='application/json')
    assert 'OK' in response.status
    payload = json.loads(response.data.decode('utf-8'))
    shape = datashape.dshape(payload['datashape'])
    arr = into(np.ndarray, payload['data'], dshape=shape)
    assert into(list, arr) == into(list, events)
Пример #40
0
def test_get_datetimes():
    """Datetime data returned by /compute.json matches the local events."""
    body = {'expr': to_tree(t.events)}
    resp = test.post('/compute.json',
                     data=json.dumps(body),
                     content_type='application/json')
    assert 'OK' in resp.status
    decoded = json.loads(resp.data.decode('utf-8'))
    shape = datashape.dshape(decoded['datashape'])
    reconstructed = into(np.ndarray, decoded['data'], dshape=shape)
    assert into(list, reconstructed) == into(list, events)
Пример #41
0
def test_get_datetimes(serial):
    """Datetime data survives a round trip through each serialization."""
    query = {'expr': to_tree(t.events)}
    response = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=serial.dumps(query),
    )
    assert 'OK' in response.status
    payload = serial.loads(response.data)
    shape = datashape.dshape(payload['datashape'])
    arr = into(np.ndarray, payload['data'], dshape=shape)
    assert into(list, arr) == into(list, events)
Пример #42
0
def test_get_datetimes(test, serial):
    """The /compute endpoint returns datetimes and column names faithfully."""
    query = {'expr': to_tree(t.events)}
    response = test.post('/compute',
                         data=serial.dumps(query),
                         headers=mimetype(serial))
    assert 'OK' in response.status
    payload = serial.loads(response.data)
    shape = datashape.dshape(payload['datashape'])
    arr = into(np.ndarray, serial.data_loads(payload['data']), dshape=shape)
    assert into(list, arr) == into(list, events)
    assert list(payload['names']) == events.columns.tolist()
Пример #43
0
def test_multiple_csv_files():
    """Expressions over a glob of CSV files match in-memory computation."""
    files = {"mult1.csv": "name,val\nAlice,1\nBob,2",
             "mult2.csv": "name,val\nAlice,3\nCharlie,4"}
    rows = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)]
    with filetexts(files) as fns:
        r = resource("mult*.csv")
        sym = symbol("s", discover(r))
        exprs = [sym, sym.name, sym.name.nunique(),
                 sym.name.count_values(), sym.val.mean()]
        for e in exprs:
            a = compute(e, {sym: r})
            b = compute(e, {sym: rows})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
Пример #44
0
def test_get_datetimes(test, serial):
    """Each serializer round-trips datetime data and the column names."""
    query = {'expr': to_tree(t.events)}
    response = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=serial.dumps(query),
    )
    assert 'OK' in response.status
    payload = serial.loads(response.data)
    shape = datashape.dshape(payload['datashape'])
    arr = into(np.ndarray, payload['data'], dshape=shape)
    assert into(list, arr) == into(list, events)
    assert payload['names'] == events.columns.tolist()
Пример #45
0
def test_compute_column_wise(iris_server, serial):
    """A compound column-wise predicate computes correctly over HTTP."""
    client = iris_server
    t = symbol('t', discover(iris))
    predicate = (t.petal_width / 2 > 0.5) & (t.petal_length / 2 > 0.5)
    expr = t[predicate]
    blob = serial.dumps({'expr': to_tree(expr)})
    resp = client.post('/compute', data=blob, headers=mimetype(serial))
    assert 'OK' in resp.status
    payload = serial.loads(resp.data)
    remote = serial.data_loads(payload['data'])
    local = compute(expr, iris)
    assert [tuple(row) for row in into(list, remote)] == into(list, local)
    assert list(payload['names']) == t.fields
Пример #46
0
def test_unused_datetime_columns():
    """Computing one column works even when a datetime column goes unused."""
    ds = dshape('2 * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        source = CSV(fn, has_header=True)
        sym = symbol('s', discover(source))
        assert into(list, compute(sym.val, source)) == ['a', 'b']
Пример #47
0
def test_multiple_csv_files():
    """A glob-backed data resource agrees with a plain list of tuples."""
    files = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
             'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}
    rows = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
    with filetexts(files) as fns:
        r = data('mult*.csv')
        sym = symbol('s', discover(r))
        exprs = [sym, sym.name, sym.name.nunique(),
                 sym.name.count_values(), sym.val.mean()]
        for e in exprs:
            a = compute(e, {sym: r})
            b = compute(e, {sym: rows})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
Пример #48
0
def test_sparksql_with_literals():
    """A literal filter over a SchemaRDD matches the in-memory result."""
    srdd = into(sqlContext, data, schema=t.schema)
    query = t[t.amount >= 100]
    result = compute(query, srdd)
    assert isinstance(result, SchemaRDD)
    got = set(map(tuple, result.collect()))
    want = set(map(tuple, compute(query, data)))
    assert got == want
Пример #49
0
    def update_source(self):
        """Build a ColumnDataSource of the top-25 domains.

        Combines per-domain counts from the relevant, crawled, and frontier
        dumps, sorted by ``self.sort`` descending, with ``*_half`` helper
        columns (half of each count) added for plotting.

        Returns:
            A bokeh ``ColumnDataSource`` built from the merged frame.
        """

        def domain_counts(path, names):
            # Count rows per registered domain for one tab-separated dump.
            frame = pd.read_csv(path, delimiter='\t', header=None,
                                names=names)
            frame['domain'] = frame['url'].apply(
                partial(get_tld, fail_silently=True))
            return frame.groupby(['domain']).size()

        df1 = domain_counts(self.relevant_data, ['url', 'timestamp'])
        df2 = domain_counts(self.crawled_data, ['url', 'timestamp'])
        df3 = domain_counts(self.frontier_data, ['url'])

        df = pd.concat((df1, df2, df3), axis=1)
        df.columns = ['relevant', 'crawled', 'frontier']

        # BUG FIX: DataFrame.sort() was removed in pandas 0.20;
        # sort_values is the supported equivalent.
        df = df.sort_values(by=self.sort,
                            ascending=False).head(25).fillna(value=0)

        for col in df.columns:
            df['%s_half' % col] = df[col] / 2

        df.reset_index(inplace=True)

        return into(ColumnDataSource, df)
Пример #50
0
def bank(db):
    """Yield the 'bank' collection pre-loaded with raw data; drop on exit."""
    collection = into(db.bank, bank_raw)
    try:
        yield collection
    finally:
        collection.drop()
Пример #51
0
def test_field_access(db, ctx):
    """Every field of db.t computes to a Series matching pandas by
    name and values."""
    for field in db.t.fields:
        query = getattr(db.t, field)
        got = into(pd.Series, compute(query, ctx))
        want = compute(query, {db: {'t': df}})
        assert got.name == want.name
        np.testing.assert_array_equal(got.values, want.values)
Пример #52
0
def comp(datasets, name):
    """Compute a serialized expression against the named dataset.

    Returns a JSON response with the result's datashape and data, or a
    404 tuple for bad content type, bad JSON, or an unknown dataset name.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        payload = json.loads(request.data)
    except ValueError:
        return ("Bad JSON.  Got %s " % request.data, 404)

    try:
        dset = datasets[name]
    except KeyError:
        return ("Dataset %s not found" % name, 404)

    # Bind the dataset's symbol into the caller-supplied namespace.
    leaf = Symbol(name, discover(dset))
    namespace = payload.get('namespace', dict())
    namespace[name] = leaf

    expr = from_tree(payload['expr'], namespace=namespace)
    result = compute(expr, dset)
    if iscollection(expr.dshape):
        result = into(list, result)

    return jsonify({'name': name,
                    'datashape': str(expr.dshape),
                    'data': result})
Пример #53
0
def big_bank(db):
    """Yield a 'bigbank' collection seeded with five sample documents;
    drop the collection on exit."""
    sample = [
        {'name': 'Alice', 'amount': 100, 'city': 'New York City'},
        {'name': 'Alice', 'amount': 200, 'city': 'Austin'},
        {'name': 'Bob', 'amount': 100, 'city': 'New York City'},
        {'name': 'Bob', 'amount': 200, 'city': 'New York City'},
        {'name': 'Bob', 'amount': 300, 'city': 'San Francisco'},
    ]
    collection = into(db.bigbank, sample)
    try:
        yield collection
    finally:
        collection.drop()
Пример #54
0
def test_csv_into_mongodb(empty_collec, file_name):
    """The first CSV column equals '_0' across the inserted documents."""
    source = CSV(file_name)
    loaded = into(empty_collec, source)
    docs = list(loaded.find({}, {'_0': 1, '_id': 0}))
    assert list(source[:, '_0']) == [doc['_0'] for doc in docs]
Пример #55
0
def test_jsonarray_into_mongodb(empty_collec):
    """A JSON-array file loads into MongoDB with documents preserved.

    Uses ``tempfile.mkstemp`` instead of the insecure, race-prone
    ``tempfile.mktemp``, and removes the temp file afterwards so the
    test leaves no residue on disk.
    """
    import os

    fd, filename = tempfile.mkstemp(suffix=".json")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(data, f)

        dd = JSON(filename,
                  schema="3 * { id : string, name : string, "
                  "posts : var * { content : string, title : string },"
                  " tv_show : string }")
        coll = empty_collec
        into(coll, dd, json_array=True)

        mongo_data = list(coll.find({}, {'_id': 0}))

        assert mongo_data[0] == data[0]
    finally:
        os.remove(filename)
Пример #56
0
def compserver(dataset):
    """Evaluate a serialized Blaze expression against *dataset*.

    Expects a JSON body with 'expr' (and an optional 'namespace');
    returns the datashape and computed data as JSON, or an error tuple
    on bad input or a failed computation.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        body = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return ("Bad JSON.  Got %s " % request.data, 404)

    # The dataset is always exposed under the reserved ':leaf' name.
    ns = body.get('namespace', dict())
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(body['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset})
    except Exception as e:
        return ("Computation failed with message:\n%s" % e, 500)

    if iscollection(expr.dshape):
        result = into(list, result)

    return json.dumps({'datashape': str(expr.dshape), 'data': result},
                      default=json_dumps)
Пример #57
0
def test_string_dataset(tmpcsv):
    """Numeric-looking values stay strings when the column is string-typed."""
    raw = 'a,b,2.0\nc,1999,3.0\nd,3.0,4.0'
    with open(tmpcsv, mode='w') as f:
        f.write(raw)
    source = CSV(tmpcsv, columns=list('xyz'))
    rows = into(list, Table(source))
    assert rows == [('a', 'b', 2.0), ('c', '1999', 3.0), ('d', '3.0', 4.0)]
Пример #58
0
def test_expr_client_interactive():
    """An interactive Table over the HTTP client computes names and a
    grouped min/max aggregation."""
    client = Client('localhost:6363', 'accounts')
    t = Table(client)
    assert compute(t.name) == ['Alice', 'Bob']
    grouped = compute(by(t.name, min=t.amount.min(), max=t.amount.max()))
    assert into(set, grouped) == {('Alice', 100, 100), ('Bob', 200, 200)}