def test_resource():
    with tmpfile('json') as fn:
        assert isinstance(resource('jsonlines://' + fn), JSONLines)
        assert isinstance(resource('json://' + fn), JSON)
        assert isinstance(
            resource(fn, expected_dshape=dshape('var * {a: int}')),
            JSONLines,
        )

def tmp_db_uri():
    """Create a temporary postgres database to run the tests against."""
    db_name = '_warp_prism_test_' + uuid4().hex
    root = 'postgresql://localhost/'
    uri = root + db_name
    with disposable_engine(root + 'postgres') as e, e.connect() as root_conn:
        root_conn.execute('COMMIT')
        root_conn.execute('CREATE DATABASE %s' % db_name)
        try:
            yield uri
        finally:
            resource(uri).dispose()
            try:
                _dropdb(root_conn, db_name)
            except sa.exc.OperationalError:
                # We couldn't drop the db. The most likely cause is that there
                # are active queries. Even more likely is that these are
                # rollbacks because there was an exception somewhere inside
                # the tests. We will cancel all the running queries and try to
                # drop the database again.
                pid = _pg_stat_activity.c.pid
                root_conn.execute(
                    sa.select(
                        (sa.func.pg_terminate_backend(pid),),
                    ).where(
                        pid != sa.func.pg_backend_pid(),
                    )
                )
                try:
                    _dropdb(root_conn, db_name)
                except sa.exc.OperationalError:  # pragma: no cover
                    # The database STILL wasn't cleaned up. Just tell the user
                    # to deal with this manually.
                    warnings.warn(
                        "leaking database '%s', please manually delete this"
                        % db_name,
                    )

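# A minimal usage sketch, not part of the original suite: tmp_db_uri is a
# generator, so it is presumably registered as a pytest fixture before use.
# The fixture name, the ``example`` table, and the imports below are
# assumptions for illustration.
import pytest
from odo import odo, resource


@pytest.fixture
def db_uri():
    # Delegate to the generator; teardown (DROP DATABASE) runs when the
    # generator is finalized after the test.
    yield from tmp_db_uri()


def test_round_trip(db_uri):
    t = resource(db_uri + '::example', dshape='var * {a: int64}')
    assert odo(odo([(1,), (2,)], t), list) == [(1,), (2,)]
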
def test_foreign_keys_as_compound_primary_key():
    with tmpfile('db') as fn:
        suppliers = resource(
            'sqlite:///%s::suppliers' % fn,
            dshape='var * {id: int64, name: string}',
            primary_key=['id'],
        )
        parts = resource(
            'sqlite:///%s::parts' % fn,
            dshape='var * {id: int64, name: string, region: string}',
            primary_key=['id'],
        )
        suppart = resource(
            'sqlite:///%s::suppart' % fn,
            dshape='var * {supp_id: map[int64, T], part_id: map[int64, U]}',
            foreign_keys={
                'supp_id': suppliers.c.id,
                'part_id': parts.c.id,
            },
            primary_key=['supp_id', 'part_id'],
        )
        expected = dshape("""
            var * {
                supp_id: map[int64, {id: int64, name: string}],
                part_id: map[int64, {id: int64, name: string, region: string}]
            }
        """)
        result = discover(suppart)
        assert result == expected

def test_compound_primary_key_with_single_reference():
    with tmpfile('db') as fn:
        products = resource(
            'sqlite:///%s::products' % fn,
            dshape="""
                var * {
                    product_no: int32,
                    product_sku: string,
                    name: ?string,
                    price: ?float64
                }
            """,
            primary_key=['product_no', 'product_sku'],
        )
        # TODO: should this fail everywhere? E.g., this fails in postgres but
        # not in sqlite, because postgres doesn't allow partial foreign keys.
        # It might be best to let the backend handle this.
        ds = dshape("""var * {
            order_id: int32,
            product_no: map[int32, T],
            quantity: ?int32
        }""")
        orders = resource(
            'sqlite:///%s::orders' % fn,
            dshape=ds,
            foreign_keys=dict(product_no=products.c.product_no),
            primary_key=['order_id'],
        )
        assert discover(orders) == dshape("""
            var * {
                order_id: int32,
                product_no: map[int32, {product_no: int32,
                                        product_sku: string,
                                        name: ?string,
                                        price: ?float64}],
                quantity: ?int32
            }
        """)

def test_discover_foreign_keys():
    with tmpfile('db') as fn:
        products = resource(
            'sqlite:///%s::products' % fn,
            dshape="""
                var * {
                    product_no: int32,
                    name: ?string,
                    price: ?float64
                }
            """,
            primary_key=['product_no'],
        )
        expected = dshape("""var * {
            order_id: int32,
            product_no: map[int32, {
                product_no: int32,
                name: ?string,
                price: ?float64
            }],
            quantity: ?int32
        }""")
        orders = resource(
            'sqlite:///%s::orders' % fn,
            dshape=expected,
            foreign_keys=dict(product_no=products.c.product_no),
        )
        result = discover(orders)
        assert result == expected

def test_compound_primary_key_with_fkey():
    with tmpfile('db') as fn:
        products = resource(
            'sqlite:///%s::products' % fn,
            dshape="""
                var * {
                    product_no: int32,
                    product_sku: string,
                    name: ?string,
                    price: ?float64
                }
            """,
            primary_key=['product_no', 'product_sku'],
        )
        ds = dshape("""var * {
            order_id: int32,
            product_no: map[int32, T],
            product_sku: map[int32, U],
            quantity: ?int32
        }""")
        orders = resource(
            'sqlite:///%s::orders' % fn,
            dshape=ds,
            primary_key=['order_id'],
            foreign_keys={
                'product_no': products.c.product_no,
                'product_sku': products.c.product_sku,
            },
        )
        assert discover(orders) == dshape("""var * {
            order_id: int32,
            product_no: map[int32, {product_no: int32, product_sku: string,
                                    name: ?string, price: ?float64}],
            product_sku: map[int32, {product_no: int32, product_sku: string,
                                     name: ?string, price: ?float64}],
            quantity: ?int32
        }""")

def test_foreign_keys_auto_construct():
    with tmpfile('db') as fn:
        products = resource(
            'sqlite:///%s::products' % fn,
            dshape="""
                var * {
                    product_no: int32,
                    name: ?string,
                    price: ?float64
                }
            """,
            primary_key=['product_no'],
        )
        ds = dshape("""var * {
            order_id: int32,
            product_no: map[int32, T],
            quantity: ?int32
        }""")
        orders = resource(
            'sqlite:///%s::orders' % fn,
            dshape=ds,
            foreign_keys=dict(product_no=products.c.product_no),
            primary_key=['order_id'],
        )
        assert discover(orders) == dshape("""
            var * {
                order_id: int32,
                product_no: map[int32, {
                    product_no: int32,
                    name: ?string,
                    price: ?float64
                }],
                quantity: ?int32
            }
        """)

def test_resource_gzip():
    with tmpfile('json.gz') as fn:
        assert isinstance(resource(fn), (JSON, JSONLines))
        assert isinstance(resource('json://' + fn), (JSON, JSONLines))
        assert isinstance(resource('jsonlines://' + fn), (JSON, JSONLines))
    with tmpfile('jsonlines.gz') as fn:
        assert isinstance(resource('jsonlines://' + fn), (JSON, JSONLines))

def test_small_chunk_size():
    normal = convert(Temp(CSV), resource(iris_url))
    small_chunk = convert(Temp(CSV), resource(iris_url, chunk_size=1))
    with open(normal.path, 'rb') as f:
        normal_data = f.read()
    with open(small_chunk.path, 'rb') as f:
        small_chunk_data = f.read()
    assert normal_data == small_chunk_data

def test_engine_metadata_caching():
    with tmpfile('db') as fn:
        engine = resource('sqlite:///' + fn)
        a = resource('sqlite:///' + fn + '::a',
                     dshape=dshape('var * {x: int}'))
        b = resource('sqlite:///' + fn + '::b',
                     dshape=dshape('var * {y: int}'))
        assert a.metadata is b.metadata
        assert engine is a.bind is b.bind

def test_sample_different_line_counts():
    with sample(resource(iris_url), lines=10) as fn:
        with open(fn, 'r') as f:
            assert len(list(f)) == 10
    with sample(resource(iris_url), lines=5) as fn:
        with open(fn, 'r') as f:
            assert len(list(f)) == 5

def test_foreign_keys_bad_field():
    with tmpfile('db') as fn:
        expected = dshape("""var * {
            order_id: int32,
            product_no: int64,
            quantity: ?int32
        }""")
        with pytest.raises(TypeError):
            resource('sqlite:///%s::orders' % fn,
                     dshape=expected,
                     foreign_keys=dict(foo='products.product_no'))

def test_drop_reflects_database_state(url):
    data = list(zip(range(5), range(1, 6)))
    t = odo(data, url, dshape='var * {A: int64, B: int64}')
    assert t.exists()
    assert resource(url).exists()
    drop(url)
    with pytest.raises(ValueError):
        resource(url)  # table doesn't exist and no dshape was given

def test_resource_on_file():
    with tmpfile('.db') as fn:
        uri = 'sqlite:///' + fn
        sql = resource(uri, 'foo', dshape='var * {x: int, y: int}')
        assert isinstance(sql, sa.Table)
    with tmpfile('.db') as fn:
        uri = 'sqlite:///' + fn
        sql = resource(uri + '::' + 'foo', dshape='var * {x: int, y: int}')
        assert isinstance(sql, sa.Table)

def test_invalid_foreign_keys():
    with tmpfile('db') as fn:
        expected = dshape("""var * {
            order_id: int32,
            product_no: map[int32, {
                product_no: int32,
                name: ?string,
                price: ?float64
            }],
            quantity: ?int32
        }""")
        with pytest.raises(TypeError):
            resource('sqlite:///%s::orders' % fn, dshape=expected)

def sql_two_tables(url):
    dshape = 'var * {a: int32}'
    try:
        t = resource(url % next(names), dshape=dshape)
        u = resource(url % next(names), dshape=dshape)
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield u, t
        finally:
            drop(t)
            drop(u)

def test_resource_no_info():
    with tmpfile('.hdf5') as fn:
        r = resource('hdfstore://' + fn)
        try:
            assert isinstance(r, pd.HDFStore)
        finally:
            r.close()

def test_fixed_shape():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, 'foo')
        r = resource('hdfstore://' + fn + '::/foo')
        assert isinstance(r.shape, list)
        assert discover(r).shape == (len(df),)
        r.parent.close()

def test_resource():
    res = resource('cql://test-host:9042/keyspace::table')
    assert isinstance(res, Cassandra)
    assert res.host == 'test-host'
    assert res.port == '9042'
    assert res.keyspace == 'keyspace'
    assert res.table == 'table'

def test_simple_into(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, dshape=ds)
    assert into(list, sql) == data

def from_uri(
    cls,
    uri: str,
    index_col: bool = False,
    source: Optional[DataObject] = None,
    **kwargs
) -> "JsonFileDataset":
    odo.resource(uri)  # resolve the URI eagerly; an unresolvable URI raises here
    methods = [
        lambda: JsonFileDataset._read_normalized(uri, **kwargs),
        lambda: JsonFileDataset._read_normalized_lines(uri, **kwargs),
        lambda: pd.read_json(uri, lines=True, **kwargs),
        lambda: odo.odo(uri, pd.DataFrame, index_col=index_col, **kwargs),
    ]
    result = None
    for method in methods:
        try:
            data = method()
            result = cls(inner_data=data, uri=uri, source=source, **kwargs)
            break
        except Exception:
            # This reader didn't understand the file; try the next one.
            pass
    if result:
        if not result.name:
            result.inner_data.name = os.path.splitext(
                os.path.basename(uri))[0]
        return result
    raise RuntimeError("No JSON reading method understands the file.")

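# A hypothetical call site for the reader chain above; ``records.json`` is an
# invented path and ``JsonFileDataset`` is assumed to be importable from the
# surrounding package.
ds = JsonFileDataset.from_uri('records.json')
print(ds.inner_data.head())  # the parsed payload, typically a DataFrame
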
def test_resource():
    sql = resource('sqlite:///:memory:::mytable',
                   dshape='var * {x: int, y: int}')
    assert isinstance(sql, sa.Table)
    assert sql.name == 'mytable'
    assert isinstance(sql.bind, sa.engine.base.Engine)
    assert set(c.name for c in sql.c) == set(['x', 'y'])

def correct_commodities():
    src_dir = path.join(data_dir, 'agmarknet/by_commodity')
    init_dir = os.getcwd()
    os.chdir(src_dir)
    folders = glob.glob('*')
    csv_dir = os.getcwd()
    for folder in folders:
        os.chdir(path.join(csv_dir, folder))
        files = glob.glob('*_all.csv')
        for file in files:
            # Have to use resource to discover URIs.
            csvr = odo.resource(path.join(csv_dir, folder, file))
            num_col = len(odo.discover(csvr)[1].types)
            if num_col == 9:
                ds = bz.dshape(
                    "var * {date: datetime, state: ?string, market: ?string, "
                    "commodity: ?string, variety: ?string, arrival: ?string, "
                    "min: ?string, max: ?string, modal: ?string}")
            elif num_col == 10:
                ds = bz.dshape(
                    "var * {date: datetime, state: ?string, market: ?string, "
                    "commodity: ?string, variety: ?string, arrival: ?string, "
                    "grade: ?string, min: ?string, max: ?string, "
                    "modal: ?string}")
            else:
                ds = odo.discover(csvr)
            d = bz.Data(path.join(csv_dir, folder, file), dshape=ds)
            if num_col == 10:
                d = bz.transform(
                    d, grade=d.grade.map(lambda x: x.strip(), 'string'))
            d = bz.transform(
                d, commodity=d.commodity.map(lambda x: x.strip(), 'string'))
            d = bz.transform(
                d, commodity=d.commodity.map(
                    lambda x: commodity_corrections.get(x, x), 'string'))
            d = bz.transform(
                d, state=d.state.map(lambda x: x.strip(), 'string'))
            d = bz.transform(
                d, state=d.state.map(
                    lambda x: state_corrections.get(x, x), 'string'))
            d = bz.transform(
                d, market=d.market.map(lambda x: x.strip(), 'string'))
    os.chdir(init_dir)  # restore the original working directory

def test_transaction():
    with tmpfile('.db') as fn:
        rsc = resource('sqlite:///%s::table' % fn, dshape='var * {a: int}')
        data = [(1,), (2,), (3,)]
        conn_1 = rsc.bind.connect()
        conn_2 = rsc.bind.connect()
        trans_1 = conn_1.begin()
        conn_2.begin()
        odo(data, rsc, bind=conn_1)
        # Inside the transaction the write should be visible.
        assert odo(rsc, list, bind=conn_1) == data
        # Outside of a transaction, or in a different transaction, the write
        # is not visible.
        assert odo(rsc, list) == odo(rsc, list, bind=conn_2) == []
        trans_1.commit()
        # Now the data should appear outside the transaction.
        assert odo(rsc, list) == odo(rsc, list, bind=conn_2) == data

def test_resource_collection(mongo_host_port):
    host, port = mongo_host_port
    coll = resource('mongodb://{}:{}/db::mycoll'.format(host, port))
    assert coll.name == 'mycoll'
    assert coll.database.name == 'db'
    assert coll.database.connection.host == host
    assert coll.database.connection.port == port

def s3_bucket(extension):
    with conn():
        b = 's3://%s/%s%s' % (test_bucket_name, next(_tmps), extension)
        try:
            yield b
        finally:
            drop(resource(b))

def test_numeric_create():
    tbl = resource('sqlite:///:memory:::test',
                   dshape='var * {a: ?decimal[11, 2], b: decimal[10, 6]}')
    assert tbl.c.a.nullable
    assert not tbl.c.b.nullable
    assert isinstance(tbl.c.a.type, sa.NUMERIC)
    assert isinstance(tbl.c.b.type, sa.NUMERIC)

def from_uri(
    cls,
    uri: str,
    index_col=False,
    source: Optional[DataObject] = None,
    **kwargs
) -> "CSVFile":
    resource = odo.resource(uri)
    if hasattr(resource, "dialect"):
        # Reuse any CSV dialect (delimiter, quoting, ...) that odo sniffed.
        kwargs.update(resource.dialect)
    methods = [
        lambda: pd.read_csv(uri, index_col=index_col, **kwargs),
        lambda: pd.read_csv(uri, index_col=index_col, engine="python",
                            sep=None, **kwargs),
        lambda: odo.odo(uri, pd.DataFrame, index_col=index_col, **kwargs),
        lambda: cls._fallback_read(uri, **kwargs),
    ]
    result = None
    for method in methods:
        try:
            data = method()
            result = cls(inner_data=data, uri=uri, source=source, **kwargs)
            break
        except Exception:
            # This reader failed; fall through to the next, more permissive one.
            pass
    if result:
        if not result.name:
            result.inner_data.name = os.path.splitext(
                os.path.basename(uri))[0]
        return result
    raise RuntimeError(
        "No CSV reading method understands the file: {0}".format(uri)
    )

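# A hypothetical call site for the CSV reader chain; ``measurements.csv`` is
# an invented file name. Any dialect odo sniffs from the file is merged into
# the pandas keyword arguments before the readers are tried in order.
csv = CSVFile.from_uri('measurements.csv')
print(csv.inner_data.shape)
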
def test_s3_to_sqlite():
    with tmpfile('.db') as fn:
        tb = into('sqlite:///%s::tips' % fn, tips_uri,
                  dshape=discover(resource(tips_uri)))
        lhs = into(list, tb)
        assert lhs == into(list, tips_uri)

def test_no_header_no_columns(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, dshape=ds)
    assert into(list, sql) == data

def test_join_count():
    ds = datashape.dshape(
        '{t1: var * {x: int, y: int}, t2: var * {a: int, b: int}}')
    engine = resource('sqlite:///:memory:', dshape=ds)
    db = symbol('db', ds)
    expr = join(db.t1[db.t1.x > -1], db.t2, 'x', 'a').count()
    result = compute(expr, {db: engine}, post_compute=False)
    expected1 = """
        SELECT count(alias.x) as count
        FROM (SELECT t1.x AS x, t1.y AS y, t2.b AS b
              FROM t1
              JOIN t2 ON t1.x = t2.a
              WHERE t1.x > ?) as alias
    """
    expected2 = """
        SELECT count(alias2.x) AS count
        FROM (SELECT alias1.x AS x, alias1.y AS y, t2.b AS b
              FROM (SELECT t1.x AS x, t1.y AS y
                    FROM t1
                    WHERE t1.x > ?) AS alias1
              JOIN t2 ON alias1.x = t2.a) AS alias2
    """
    assert (normalize(str(result)) == normalize(expected1) or
            normalize(str(result)) == normalize(expected2))

def test_s3_jsonlines_discover():
    json_dshape = discover(resource('s3://nyqpug/tips.json'))
    names = list(map(str, sorted(json_dshape.measure.names)))
    assert names == ['day', 'sex', 'size', 'smoker', 'time', 'tip',
                     'total_bill']
    types = [json_dshape.measure[name] for name in names]
    assert types == [string, string, int64, string, string, float64, float64]

def sql(url):
    try:
        t = resource(url, dshape='var * {a: int32, b: int32}')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        yield t
        drop(t)

def csv_to_db(csv_file, table_name, db_con='subway.db'):
    dshape = discover(resource(csv_file))
    uri = 'sqlite:///' + db_con + '::' + table_name
    try:
        odo(csv_file, uri, dshape=dshape)
        print("Data loaded")
    except Exception as e:
        print("Problems loading data: %s" % e)

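# A hypothetical invocation; ``turnstile_weather.csv`` and the table name are
# invented for illustration. The dshape is discovered from the CSV before the
# rows are loaded into the sqlite database.
csv_to_db('turnstile_weather.csv', 'turnstile', db_con='subway.db')
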
def fsql(engine, fcsv, name):
    try:
        t = resource('%s::%s' % (url, name), dshape=discover(fcsv))
    except sqlalchemy.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        yield t
        drop(t)

def test_numeric_append():
    tbl = resource('sqlite:///:memory:::test',
                   dshape='var * {a: decimal[11, 2], b: ?decimal[10, 6]}')
    data = [(1.0, 2.0), (2.0, 3.0)]
    tbl = odo(data, tbl)
    assert odo(tbl, list) == list(
        map(lambda row: tuple(map(Decimal, row)),
            tbl.select().execute().fetchall()))