def append_iterator_to_table(t, rows, dshape=None, **kwargs):
    assert not isinstance(t, type)
    rows = iter(rows)

    # We see if the sequence is of tuples or dicts
    # If tuples then we coerce them to dicts
    try:
        row = next(rows)
    except StopIteration:
        return
    rows = chain([row], rows)
    if isinstance(row, (tuple, list)):
        if dshape and isinstance(dshape.measure, datashape.Record):
            names = dshape.measure.names
            if set(names) != set(discover(t).measure.names):
                raise ValueError("Column names of incoming data don't match "
                                 "column names of existing SQL table\n"
                                 "Names in SQL table: %s\n"
                                 "Names from incoming data: %s\n" %
                                 (discover(t).measure.names, names))
        else:
            names = discover(t).measure.names
        rows = (dict(zip(names, row)) for row in rows)

    engine = t.bind
    with engine.connect() as conn:
        for chunk in partition_all(1000, rows):  # TODO: 1000 is hardcoded
            conn.execute(t.insert(), chunk)

    return t
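# Hedged usage sketch for ``append_iterator_to_table`` above. It assumes the
# surrounding module's imports (``chain``, ``partition_all``, ``datashape``)
# and that ``discover`` is registered for SQLAlchemy tables, as elsewhere in
# this codebase; the engine URI and table are hypothetical.
import sqlalchemy as sa

engine = sa.create_engine('sqlite:///:memory:')
metadata = sa.MetaData(bind=engine)
accounts = sa.Table('accounts', metadata,
                    sa.Column('name', sa.String),
                    sa.Column('amount', sa.Integer))
metadata.create_all()

# Tuples are coerced to dicts using the table's column names before insert.
append_iterator_to_table(accounts, iter([('Alice', 100), ('Bob', 200)]))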
def test_discover_numeric_column():
    assert discover(sa.String()) == datashape.string
    metadata = sa.MetaData()
    s = sa.Table('name', metadata,
                 sa.Column('name', sa.types.NUMERIC),)
    assert discover(s)
def _dtype(self):
    # we can't simply use .schema or .datashape because we may have a bare
    # integer, for example
    lhs, rhs = discover(self.lhs).measure, discover(self.rhs).measure
    if isinstance(lhs, Option) or isinstance(rhs, Option):
        return Option(ct.bool_)
    return ct.bool_
def _dtype(self):
    lmeasure = discover(self.lhs).measure
    rmeasure = discover(self.rhs).measure
    if not isinstance(getattr(lmeasure, 'ty', lmeasure), String):
        raise TypeError('can only interp strings, got: %s' % lmeasure)
    return optionify(lmeasure, rmeasure, lmeasure)
def discover_sqlalchemy_selectable(t):
    ordering = dict((c, i) for i, c in enumerate(t.columns.keys()))
    records = list(sum([discover(c).parameters[0] for c in t.columns], ()))
    fkeys = [discover(fkey, t, parent_measure=Record(records))
             for fkey in t.foreign_keys]
    for name, column in merge(*fkeys).items():
        records[ordering[name]] = (name, column)
    return var * Record(records)
def test_categorical_pandas():
    df = pd.DataFrame({'x': list('a' * 5 + 'b' * 5 + 'c' * 5),
                       'y': range(15)},
                      columns=['x', 'y'])
    df.x = df.x.astype('category')
    assert_dshape_equal(discover(df),
                        15 * Record([('x', Categorical(['a', 'b', 'c'])),
                                     ('y', int64)]))
    assert_dshape_equal(discover(df.x), 15 * Categorical(['a', 'b', 'c']))
def test_datetimetz_pandas():
    df = pd.DataFrame(
        OrderedDict([
            ('naive', pd.date_range('2014', periods=5)),
            ('Europe/Moscow', pd.date_range('2014', periods=5,
                                            tz='Europe/Moscow')),
            ('UTC', pd.date_range('2014', periods=5, tz='UTC')),
            ('US/Eastern', pd.date_range('2014', periods=5,
                                         tz='US/Eastern')),
        ])
    )
    assert_dshape_equal(
        discover(df),
        5 * Record[
            'naive': Option(DateTime(tz=None)),
            'Europe/Moscow': Option(DateTime(tz='Europe/Moscow')),
            'UTC': Option(DateTime(tz='UTC')),
            'US/Eastern': Option(DateTime(tz='US/Eastern')),
        ]
    )
    assert_dshape_equal(discover(df.naive), 5 * Option(DateTime(tz=None)))
    for tz in ('Europe/Moscow', 'UTC', 'US/Eastern'):
        assert_dshape_equal(
            discover(df[tz]),
            5 * Option(DateTime(tz=tz))
        )
def _dtype(self):
    lhs, rhs = discover(self.lhs).measure, discover(self.rhs).measure
    is_unsigned = lhs in unsigned and rhs in unsigned
    max_width = max(lhs.itemsize, rhs.itemsize)
    prefix = 'u' if is_unsigned else ''
    measure = getattr(ct, '%sint%d' % (prefix, max_width * 8))
    return optionify(lhs, rhs, measure)
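# Worked examples of the width promotion above (hedged; assumes the
# datashape scalar types referenced as ``ct.*`` and the ``unsigned`` set
# from the surrounding module):
#   uint8 and uint16 -> both unsigned, max width 2 bytes -> ct.uint16
#   int8  and uint16 -> mixed signedness, max width 2    -> ct.int16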
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)

    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename fields'
                             % (names, fields,
                                ', '.join('%s=%r' % (k, v)
                                          for k, v in zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
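# Hedged usage sketch for ``Data`` above; the file names are hypothetical.
# An in-memory sequence takes an explicit dshape, a string URI goes through
# ``resource``, and a '::' suffix is walked field by field as shown in the
# sub_uri handling at the end of the function:
# >>> t = Data([(1, 'Alice'), (2, 'Bob')],
# ...          dshape='var * {id: int64, name: string}')
# >>> Data('accounts.csv')                  # resource + discover
# >>> Data('myfile.hdf5::/data/fixed')      # indexes result['data']['fixed']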
def test_compute_up_on_dict():
    d = {'a': [1, 2, 3], 'b': [4, 5, 6]}
    assert str(discover(d)) == str(dshape('{a: 3 * int64, b: 3 * int64}'))
    s = symbol('s', discover(d))
    assert compute(s.a, {s: d}) == [1, 2, 3]
def test_concat_arr():
    s_data = np.arange(15)
    t_data = np.arange(15, 30)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))
    assert (compute(concat(s, t), {s: s_data, t: t_data}) ==
            np.arange(30)).all()
def test_binary_math(funcname):
    s_data = np.arange(15).reshape(5, 3)
    t_data = np.arange(15, 30).reshape(5, 3)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))
    scope = {s: s_data, t: t_data}
    result = compute(getattr(blaze, funcname)(s, t), scope)
    expected = getattr(np, binary_name_map.get(funcname, funcname))(s_data,
                                                                    t_data)
    assert np.all(result == expected)
def test_floating_binary_math(func, kwargs):
    s_data = np.arange(15).reshape(5, 3)
    t_data = np.arange(15, 30).reshape(5, 3)
    s = symbol('s', discover(s_data))
    t = symbol('t', discover(t_data))
    scope = {s: s_data, t: t_data}
    result = compute(getattr(blaze, func)(s, t), scope, **kwargs)
    expected = getattr(np, binary_name_map.get(func, func))(s_data, t_data)
    np.testing.assert_allclose(result, expected)
def test_concat_mat():
    s_data = np.arange(15).reshape(5, 3)
    t_data = np.arange(15, 30).reshape(5, 3)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))
    assert (compute(concat(s, t), {s: s_data, t: t_data}) ==
            np.arange(30).reshape(10, 3)).all()
    assert (compute(concat(s, t, axis=1), {s: s_data, t: t_data}) ==
            np.concatenate((s_data, t_data), axis=1)).all()
def test_least_mixed(dtype):
    s_data = np.array([2, 1], dtype=dtype)
    t_data = np.array([1, 2], dtype=dtype)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))
    expr = least(s, t)
    result = compute(expr, {s: s_data, t: t_data})
    expected = np.minimum(s_data, t_data)
    assert np.all(result == expected)
def test_least(dtype):
    s_data = np.arange(15, dtype=dtype).reshape(5, 3)
    t_data = np.arange(15, 30, dtype=dtype).reshape(5, 3)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))
    expr = least(s, t)
    result = compute(expr, {s: s_data, t: t_data})
    expected = np.minimum(s_data, t_data)
    assert np.all(result == expected)
def test_datetimes_persist():
    typs = [list, tuple, np.ndarray, tuple]
    L = [datetime.datetime.now()] * 3
    ds = discover(L)

    x = L
    for cls in typs:
        x = convert(cls, x)
        assert discover(x) == ds
def discover_jsonlines(j, n=10, encoding='utf-8', **kwargs):
    with json_lines(j.path, encoding=encoding) as lines:
        data = pipe(lines, filter(nonempty), map(json.loads), take(n), list)

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return date_to_datetime_dshape(ds)
def test_greatest(dtype):
    s_data = np.arange(15, dtype=dtype).reshape(5, 3)
    t_data = np.arange(15, 30, dtype=dtype).reshape(5, 3)
    s = symbol('s', discover(s_data))
    t = symbol('t', discover(t_data))
    expr = greatest(s, t)
    result = compute(expr, {s: s_data, t: t_data})
    expected = np.maximum(s_data, t_data)
    assert np.all(result == expected)
def test_greatest_mixed(dtype):
    s_data = np.array([2, 1], dtype=dtype)
    t_data = np.array([1, 2], dtype=dtype)
    s = symbol('s', discover(s_data))
    t = symbol('t', discover(t_data))
    expr = greatest(s, t)
    result = compute(expr, {s: s_data, t: t_data})
    expected = np.maximum(s_data, t_data)
    assert np.all(result == expected)
def test_discover(): assert discover(sa.String()) == datashape.string metadata = sa.MetaData() s = sa.Table('accounts', metadata, sa.Column('name', sa.String), sa.Column('amount', sa.Integer), sa.Column('timestamp', sa.DateTime, primary_key=True)) assert discover(s) == \ dshape('var * {name: ?string, amount: ?int32, timestamp: datetime}')
def test_discover_views():
    engine, t = single_table_engine()
    metadata = t.metadata
    with engine.connect() as conn:
        conn.execute('''CREATE VIEW myview AS
                        SELECT name, amount
                        FROM accounts
                        WHERE amount > 0''')
    assert str(discover(metadata)) == str(discover({'accounts': t,
                                                    'myview': t}))
def test_discover():
    assert discover(sa.String()) == datashape.string
    metadata = sa.MetaData()
    s = sa.Table('accounts', metadata,
                 sa.Column('name', sa.String),
                 sa.Column('amount', sa.Integer),
                 sa.Column('timestamp', sa.DateTime, primary_key=True))

    ds = dshape('var * {name: ?string, amount: ?int32, timestamp: datetime}')
    assert_dshape_equal(discover(s), ds)
    for name in ds.measure.names:
        assert isinstance(name, string_types)
def test_compute_up_on_dict():
    d = {'a': [1, 2, 3], 'b': [4, 5, 6]}

    assert_dshape_equal(
        discover(d),
        dshape('{a: 3 * int64, b: 3 * int64}').measure,
        check_record_order=False,  # dict order undefined
    )

    s = symbol('s', discover(d))
    assert compute(s.a, {s: d}) == [1, 2, 3]
def _dtype(self):
    lmeasure = discover(self.lhs).measure
    rmeasure = discover(self.rhs).measure
    if not (isinstance(getattr(lmeasure, 'ty', lmeasure), String) and
            getattr(rmeasure, 'ty', rmeasure) in integral):
        raise TypeError(
            'can only repeat strings by an integer amount, got: %s * %s' %
            (lmeasure, rmeasure),
        )
    return optionify(lmeasure, rmeasure, lmeasure)
def test_discover():
    assert discover(sa.String()) == datashape.string
    metadata = sa.MetaData()
    s = sa.Table(
        "accounts",
        metadata,
        sa.Column("name", sa.String),
        sa.Column("amount", sa.Integer),
        sa.Column("timestamp", sa.DateTime, primary_key=True),
    )
    assert discover(s) == dshape(
        "var * {name: ?string, amount: ?int32, timestamp: datetime}")
def test_groups():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = data('hdfstore://%s' % fn)
        assert dshape(discover(hdf)) == dshape(discover({'data':
                                                         {'fixed': df}}))
        s = symbol('s', discover(hdf))
        assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        hdf.data.close()
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema,
                    **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)

    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename '
                             'fields' % (names, fields,
                                         ', '.join('%s=%r' % (k, v)
                                                   for k, v in
                                                   zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
def test_foreign_keys_as_compound_primary_key():
    with tmpfile('db') as fn:
        suppliers = resource(
            'sqlite:///%s::suppliers' % fn,
            dshape='var * {id: int64, name: string}',
            primary_key=['id']
        )
        parts = resource(
            'sqlite:///%s::parts' % fn,
            dshape='var * {id: int64, name: string, region: string}',
            primary_key=['id']
        )
        suppart = resource(
            'sqlite:///%s::suppart' % fn,
            dshape='var * {supp_id: map[int64, T], part_id: map[int64, U]}',
            foreign_keys={
                'supp_id': suppliers.c.id,
                'part_id': parts.c.id
            },
            primary_key=['supp_id', 'part_id']
        )
        expected = dshape("""
            var * {
                supp_id: map[int64, {id: int64, name: string}],
                part_id: map[int64, {id: int64, name: string,
                                     region: string}]
            }
        """)
        result = discover(suppart)
        assert result == expected
def test_compound_primary_key_with_single_reference():
    with tmpfile('db') as fn:
        products = resource('sqlite:///%s::products' % fn,
                            dshape="""
                            var * {
                                product_no: int32,
                                product_sku: string,
                                name: ?string,
                                price: ?float64
                            }
                            """,
                            primary_key=['product_no', 'product_sku'])

        # TODO: should this fail everywhere? e.g., this fails in postgres,
        # but not in sqlite because postgres doesn't allow partial foreign
        # keys; might be best to let the backend handle this
        ds = dshape("""var * {
            order_id: int32,
            product_no: map[int32, T],
            quantity: ?int32
        }""")
        orders = resource('sqlite:///%s::orders' % fn, dshape=ds,
                          foreign_keys=dict(
                              product_no=products.c.product_no),
                          primary_key=['order_id'])
        assert discover(orders) == dshape(
            """var * {
                order_id: int32,
                product_no: map[int32, {product_no: int32,
                                        product_sku: string,
                                        name: ?string,
                                        price: ?float64}],
                quantity: ?int32
            }
            """
        )
def test_discovery_engine():
    engine, t = single_table_engine()

    assert discover(engine, 'accounts') == discover(t)
    assert str(discover(engine)) == str(discover({'accounts': t}))
def test_discover_selectable():
    t = resource('sqlite:///:memory:::mytable',
                 dshape='var * {x: int, y: int}')
    q = sa.select([t.c.x]).limit(5)
    assert discover(q) == dshape('var * {x: int}')
def discover_h5py_group_file(g):
    return DataShape(Record([[k, discover(v)] for k, v in g.items()]))
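# Hedged sketch of what the group discovery above produces; assumes h5py is
# available and uses a temporary file with hypothetical dataset names:
import h5py
import numpy as np

with h5py.File('example.h5', 'w') as f:
    f.create_dataset('x', data=np.arange(3, dtype='i8'))
    f.create_dataset('y', data=np.zeros(2))
    # discover_h5py_group_file(f) builds a Record from the group's children,
    # roughly: {x: 3 * int64, y: 2 * float64}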
from datetime import datetime, date

import numpy as np

from blaze.compute.core import compute, compute_up
from blaze.expr import symbol, by, exp, summary, Broadcast, join, concat
from blaze.expr import greatest, least
from blaze import sin
import blaze
from odo import into
from datashape import discover, to_numpy, dshape

x = np.array([(1, 'Alice', 100),
              (2, 'Bob', -200),
              (3, 'Charlie', 300),
              (4, 'Denis', 400),
              (5, 'Edith', -500)],
             dtype=[('id', 'i8'), ('name', 'S7'), ('amount', 'i8')])

t = symbol('t', discover(x))


def eq(a, b):
    c = a == b
    if isinstance(c, np.ndarray):
        return c.all()
    return c


def test_symbol():
    assert eq(compute(t, x), x)


def test_eq():
    assert eq(compute(t['amount'] == 100, x), x['amount'] == 100)
def test_query_with_strings():
    b = np.array([('a', 1), ('b', 2), ('c', 3)],
                 dtype=[('x', 'S1'), ('y', 'i4')])
    s = symbol('s', discover(b))
    assert compute(s[s.x == b'b'], b).tolist() == [(b'b', 2)]
def test_str_interp():
    a = np.array(('%s', '%s', '%s'))
    s = symbol('s', discover(a))
    expr = s.interp(1)
    assert all(compute(expr, a) == np.char.mod(a, 1))
def __init__(self, categories, type=None, ordered=False):
    self.categories = tuple(categories)
    self.type = (type or datashape.discover(self.categories)).measure
    self.ordered = ordered
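# Hedged sketch of the constructor above; mirrors the pandas categorical
# test earlier in this section, where string labels discover as ``string``:
# >>> c = Categorical(['a', 'b', 'c'])
# >>> c.type                 # measure discovered from the labels: string
# >>> Categorical(['a', 'b', 'c'], ordered=True).ordered
# True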
def discover(engine):
    # the recursive-looking call dispatches on the MetaData instance rather
    # than recursing into this engine overload
    metadata = metadata_of_engine(engine)
    return discover(metadata)
def discover_sqlalchemy_selectable(t):
    records = list(sum([discover(c).parameters[0] for c in t.columns], ()))
    return var * Record(records)
def discover_sqlalchemy_column(col):
    optionify = Option if col.nullable else identity
    return Record([[col.name, optionify(discover(col.type))]])
def test_discover_null_columns():
    assert dshape(discover(sa.Column('name', sa.String,
                                     nullable=True))) == \
        dshape('{name: ?string}')
    assert dshape(discover(sa.Column('name', sa.String,
                                     nullable=False))) == \
        dshape('{name: string}')
def test_discover_oracle_intervals(freq):
    typ = sa.dialects.oracle.base.INTERVAL(
        day_precision={'D': 9}.get(freq),
        second_precision=prec.get(freq, 0),
    )
    t = sa.Table('t', sa.MetaData(), sa.Column('dur', typ))
    assert discover(t) == dshape('var * {dur: ?timedelta[unit="%s"]}' % freq)
def test_discover_postgres_intervals(freq):
    precision = prec.get(freq)
    typ = sa.dialects.postgresql.base.INTERVAL(precision=precision)
    t = sa.Table('t', sa.MetaData(), sa.Column('dur', typ))
    assert discover(t) == dshape('var * {dur: ?timedelta[unit="%s"]}' % freq)
def discover_tables_node(f):
    return discover(f.getNode('/'))
def discover_tables_node(n):
    return discover(n._v_children)  # subclasses dict
def discover_foreign_key_relationship(fk, parent, parent_measure=None):
    if fk.column.table is not parent:
        parent_measure = discover(fk.column.table).measure
    return {fk.parent.name: Map(discover(fk.parent.type), parent_measure)}
def test_coerce():
    x = np.arange(1, 3)
    s = symbol('s', discover(x))
    np.testing.assert_array_equal(compute(s.coerce('float64'), x),
                                  np.arange(1.0, 3.0))
def discover_sqlalchemy_column(c):
    meta = Option if getattr(c, 'nullable', True) else identity
    return Record([(c.name, meta(discover(c.type)))])
def test_str_repeat():
    a = np.array(('a', 'b', 'c'))
    s = symbol('s', discover(a))
    expr = s.repeat(3)
    assert all(compute(expr, a) == np.char.multiply(a, 3))
def discover(engine):
    return discover(metadata_of_engine(engine))
def test_subexpr_datetime():
    data = pd.date_range(start='01/01/2010', end='01/04/2010',
                         freq='D').values
    s = symbol('s', discover(data))
    result = compute(s.truncate(days=2).day, data)
    expected = np.array([31, 2, 2, 4])
    np.testing.assert_array_equal(result, expected)
def discover_postgresql_interval(t):
    return discover(sa.Interval(day_precision=0,
                                second_precision=t.precision))
def dataset():
    return str(discover(_get_data()))
def numpy_to_dynd(x, **kwargs):
    return nd.array(x, type=str(discover(x)))
def discover_oracle_interval(t):
    return discover(t.adapt(sa.Interval))
def test_discovery_metadata():
    engine, t = single_table_engine()
    metadata = t.metadata
    assert str(discover(metadata)) == str(discover({'accounts': t}))
def _transform(graph, target, source, excluded_edges=None,
               ooc_types=ooc_types, **kwargs):
    """ Transform source to target type using graph of transformations """
    # take a copy so we can mutate without affecting the input
    excluded_edges = (excluded_edges.copy()
                      if excluded_edges is not None else
                      set())
    with ignoring(NotImplementedError):
        if 'dshape' not in kwargs or kwargs['dshape'] is None:
            kwargs['dshape'] = discover(source)
    pth = path(graph, type(source), target,
               excluded_edges=excluded_edges,
               ooc_types=ooc_types)
    x = source
    path_proxy = IterProxy(pth)
    for convert_from, convert_to, f, cost in path_proxy:
        try:
            x = f(x, excluded_edges=excluded_edges, **kwargs)
        except NotImplementedError as e:
            if kwargs.get('raise_on_errors'):
                raise
            warn(FailedConversionWarning(convert_from, convert_to, e))

            # exclude the broken edge
            excluded_edges |= {(convert_from, convert_to)}

            # compute the path from `source` to `target` excluding
            # the edge that broke
            fresh_path = list(path(graph, type(source), target,
                                   excluded_edges=excluded_edges,
                                   ooc_types=ooc_types))
            fresh_path_cost = path_cost(fresh_path)

            # compute the path from the current `convert_from` type
            # to the `target`
            try:
                greedy_path = list(path(graph, convert_from, target,
                                        excluded_edges=excluded_edges,
                                        ooc_types=ooc_types))
            except nx.exception.NetworkXNoPath:
                greedy_path_cost = np.inf
            else:
                greedy_path_cost = path_cost(greedy_path)

            if fresh_path_cost < greedy_path_cost:
                # it is faster to start over from `source` with a new path
                x = source
                pth = fresh_path
            else:
                # it is faster to work around our broken edge from our
                # current location
                pth = greedy_path
            path_proxy.it = pth
    return x
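# Hedged usage sketch for the conversion network above: odo's public entry
# point walks this graph edge by edge. ``raise_on_errors`` is the kwarg
# ``_transform`` checks to surface a failed edge instead of rerouting
# around it; whether it is part of the public API is an assumption here.
from odo import odo

result = odo([1, 2, 3], tuple, raise_on_errors=True)
assert result == (1, 2, 3)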
def dataframe_to_numpy(df, dshape=None, **kwargs):
    dtype = dshape_to_numpy(dshape or discover(df))
    x = df.to_records(index=False)
    if x.dtype != dtype:
        x = x.astype(dtype)
    return x
def test_timedelta_sql_discovery(freq):
    ds = '{name: string, amount: int, duration: timedelta[unit="%s"]}' % freq
    t = dshape_to_table('td_bank', ds)
    assert discover(t).measure['duration'] == datashape.TimeDelta(freq)
def test_discover_fixed_length_string():
    t = resource('sqlite:///:memory:::mytable',
                 dshape='var * {x: string[30]}')
    assert discover(t) == dshape('var * {x: string[30]}')