def dshape(self):
    """Return the datashape described by ``self.names`` and ``self.values``.

    The measure is a ``Record`` pairing each name with the ``_dtype`` of the
    value at the same position.  When ``self.keepdims`` is truthy the record
    is preceded by one length-1 dimension per dimension of ``self._child``;
    otherwise the record is wrapped without any dimensions.
    """
    record = Record(list(zip(self.names,
                             [value._dtype for value in self.values])))
    if not self.keepdims:
        return DataShape(record)
    unit_dims = (1,) * self._child.ndim
    return DataShape(*(unit_dims + (record,)))
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Wrap ``data`` in an ``InteractiveSymbol``.

    Parameters
    ----------
    data
        Object to wrap.  A string is handed to ``resource``; when it contains
        ``'::'`` the text after it is kept as a ``/``-separated list of field
        names that are indexed, in order, on the resulting symbol.  An
        ``Iterator`` that is not one of the ``not_an_iterator`` types is
        materialised into a tuple.
    dshape
        Datashape object or string.  When falsy it is obtained from
        ``discover(data)``.
    name
        Symbol name; when falsy the next value of ``names`` is used.
    fields
        Field names applied to a discovered tuple, scalar or record measure.
    columns
        Deprecated spelling of ``fields``; emits a ``DeprecationWarning``.
    schema
        Measure used as ``var * schema`` when ``dshape`` is not given.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are supplied.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            # Unpacking into two names means a second '::' raises ValueError.
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)

    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)

    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns

    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if not dshape:
        # NOTE(review): field renaming is assumed to apply only to a
        # discovered dshape -- confirm against the original indentation.
        dshape = discover(data)
        measure = dshape.measure
        renamed = None  # (dimensions, field types) once a branch matches
        if isinstance(measure, Tuple) and fields:
            renamed = dshape.shape, dshape[1].dshapes
        elif isscalar(measure) and fields:
            # The last dimension counts the columns; it becomes the record.
            renamed = dshape.shape[:-1], (measure,) * int(dshape[-2])
        elif isrecord(measure) and fields:
            renamed = dshape.shape, measure.types
        if renamed is not None:
            dims, field_types = renamed
            record = Record(list(zip(fields, field_types)))
            dshape = DataShape(*(dims + (record,)))

    final_dshape = datashape.dshape(dshape)
    name = name or next(names)
    result = InteractiveSymbol(data, final_dshape, name)

    if sub_uri:
        # Empty path components (e.g. from a leading '/') are skipped.
        for part in sub_uri.split('/'):
            if part:
                result = result[part]
    return result
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Wrap ``data`` in an ``InteractiveSymbol``.

    Parameters
    ----------
    data
        Object to wrap.  An ``InteractiveSymbol`` is unwrapped and its
        ``.data`` is wrapped again with the same arguments.  A string is
        handed to ``resource``.  An ``Iterator`` that is not one of the
        ``not_an_iterator`` types is materialised into a tuple.
    dshape
        Datashape object or string.  When falsy it is obtained from
        ``discover(data)``.
    name
        Name passed through to ``InteractiveSymbol``.
    fields
        Field names applied to a discovered tuple or scalar measure.  For a
        discovered record measure they must equal the discovered names.
    columns
        No longer accepted; any truthy value raises ``ValueError``.
    schema
        Measure used as ``var * schema`` when ``dshape`` is not given.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Raises
    ------
    ValueError
        If ``columns`` is truthy, if both ``schema`` and ``dshape`` are
        given, or if ``fields`` differs from the names of a discovered
        record measure.
    """
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema,
                    **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)

    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if not dshape:
        # NOTE(review): field handling is assumed to apply only to a
        # discovered dshape -- confirm against the original indentation.
        dshape = discover(data)
        measure = dshape.measure
        renamed = None  # (dimensions, field types) once a branch matches
        if isinstance(measure, Tuple) and fields:
            renamed = dshape.shape, dshape[1].dshapes
        elif isscalar(measure) and fields:
            # The last dimension counts the columns; it becomes the record.
            renamed = dshape.shape[:-1], (measure,) * int(dshape[-2])
        elif isrecord(measure) and fields:
            discovered = discover(data)
            assert isrecord(discovered.measure)
            found_names = discovered.measure.names
            if found_names != fields:
                relabel_args = ', '.join(
                    '%s=%r' % (k, v) for k, v in zip(found_names, fields))
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse Data(data).relabel(%s) to rename '
                    'fields' % (found_names, fields, relabel_args))
            renamed = dshape.shape, measure.types
        if renamed is not None:
            dims, field_types = renamed
            record = Record(list(zip(fields, field_types)))
            dshape = DataShape(*(dims + (record,)))

    final_dshape = datashape.dshape(dshape)
    return InteractiveSymbol(data, final_dshape, name)
def __init__(self, data, dshape=None, name=None, fields=None, columns=None,
             schema=None, **kwargs):
    """Initialise the object from ``data`` and an optional datashape.

    Parameters
    ----------
    data
        Object to wrap.  A string is handed to ``resource``.
    dshape
        Datashape object or string.  When falsy it is obtained from
        ``discover(data)``.
    name
        Stored as ``self._name``; when falsy the next value of ``names`` is
        used.
    fields
        Field names applied to a discovered tuple, scalar or record measure.
    columns
        Deprecated spelling of ``fields``; emits a ``DeprecationWarning``.
    schema
        Measure used as ``var * schema`` when ``dshape`` is not given.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are supplied.
    TypeError
        If ``data`` carries a ``schema`` attribute (datashape or string) that
        differs from ``self.schema``.
    """
    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)

    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns

    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if not dshape:
        # NOTE(review): field renaming is assumed to apply only to a
        # discovered dshape -- confirm against the original indentation.
        dshape = discover(data)
        types = None
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))
        elif isscalar(dshape.measure) and fields:
            types = (dshape.measure,) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            # The last dimension is the column count that the record now
            # represents, so it must be dropped rather than kept alongside
            # the record (same handling as the sibling ``Data`` functions).
            dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
        elif isrecord(dshape.measure) and fields:
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema,)))

    self.dshape = datashape.dshape(dshape)
    self.data = data

    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            self.schema != data.schema):
        raise TypeError('%s schema %s does not match %s schema %s' %
                        (type(data).__name__, data.schema,
                         type(self).__name__, self.schema))

    self._name = name or next(names)
def discover_h5py_dataset(d):
    """Return a datashape for ``d`` built from its ``shape`` and ``dtype``.

    Record measures are passed through ``record_dshape_replace`` with
    ``datashape.object_`` and ``datashape.string`` (presumably swapping
    object fields for strings -- confirm against that helper).  A
    non-record datashape equal to ``datashape.object_`` is returned with a
    string measure; anything else is returned as converted.
    """
    converted = datashape.from_numpy(d.shape, d.dtype)
    dims = converted.shape
    measure = converted.measure

    if isrecord(measure):
        fields = list(record_dshape_replace(measure, datashape.object_,
                                            datashape.string))
        return DataShape(*(dims + (datashape.Record(fields),)))

    # NOTE(review): this compares the whole datashape (dimensions included)
    # with ``object_``, not just the measure -- confirm that is intended.
    if converted == datashape.object_:
        return DataShape(*(dims + (datashape.string,)))
    return converted
def list_to_dynd(L, **kwargs):
    """Build an ``nd.array`` from ``L`` typed by ``kwargs['dshape']``.

    A ``Tuple`` measure is first rewritten as a ``Record`` whose fields are
    named ``f0``, ``f1``, ... in positional order.

    Raises
    ------
    KeyError
        If no ``dshape`` keyword is supplied.
    """
    target = kwargs['dshape']
    if isinstance(target.measure, Tuple):
        element_types = target.measure.parameters[0]
        named = [['f%d' % position, typ]
                 for position, typ in enumerate(element_types)]
        target = DataShape(*(target.shape + (Record(named),)))
    return nd.array(L, dtype=str(target))
def compute_up(expr, data, **kwargs):
    """Compute ``expr`` over chunked ``data`` in two ``atop`` passes.

    ``split`` breaks the expression into a per-chunk expression and an
    aggregate expression.  The first ``atop`` applies the per-chunk
    expression over every index of ``data``; the second concatenates along
    ``expr.axis`` and applies the aggregate expression, producing only the
    indices that are not in ``expr.axis``.

    ``data`` must expose ``.chunks`` (presumably a dask array -- confirm).
    ``kwargs`` are forwarded to ``compute_it`` in both passes.
    """
    leaf = expr._leaves()[0]

    # Shape of the symbolic chunk: the first chunk length along each axis.
    first_chunk_dims = tuple(map(first, data.chunks))
    chunk = symbol(
        'chunk',
        DataShape(*(first_chunk_dims + (leaf.dshape.measure,))),
    )
    (chunk, chunk_expr), (agg, agg_expr) = split(expr._child, expr,
                                                 chunk=chunk)

    inds = tuple(range(ndim(leaf)))
    dtype = expr.dshape.measure.to_numpy_dtype()

    per_chunk = curry(compute_it, chunk_expr, [chunk], **kwargs)
    tmp = atop(per_chunk, inds, data, inds, dtype=dtype)

    aggregate = compose(
        curry(compute_it, agg_expr, [agg], **kwargs),
        curry(_concatenate2, axes=expr.axis),
    )
    kept_inds = tuple(i for i in inds if i not in expr.axis)
    return atop(aggregate, kept_inds, tmp, inds, dtype=dtype)
def _schema(self):
    """Return the child's measure, unwrapping a one-field record.

    Takes element 0 of ``self._child.schema``.  If that is a ``Record`` with
    exactly one type, the result wraps that single type; otherwise it wraps
    the element unchanged.
    """
    measure = self._child.schema[0]
    is_single_field = (isinstance(measure, Record) and
                       len(measure.types) == 1)
    return DataShape(
        toolz.first(measure.types) if is_single_field else measure)
def __init__(self, name, dshape):
    """Store ``name`` and a normalised ``dshape`` on the instance.

    A string ``dshape`` is parsed with ``datashape.dshape``.  A ``Mono``
    that is not already a ``DataShape`` is wrapped in one.  Any other value
    is stored as given.
    """
    self._name = name
    if isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if isinstance(dshape, Mono):
        if not isinstance(dshape, DataShape):
            dshape = DataShape(dshape)
    self.dshape = dshape
def dshape(self):
    """Return the datashape of the field ``self._name`` of the child.

    The field's own dimensions are appended to the child's dimensions and
    the field's measure becomes the measure of the result.
    """
    child_dims = self._child.dshape.shape
    field = self._child.dshape.measure.dict[self._name]
    all_dims = child_dims + field.shape
    return DataShape(*(all_dims + (field.measure,)))
def discover(metadata):
    """Return a record datashape with one field per reflected table.

    ``metadata`` is reflected first, including views when the bound dialect
    reports support for them; if that raises ``NotImplementedError`` a plain
    ``reflect()`` is used instead.  Tables are visited in name order and
    each one is passed to ``discover``.  Tables whose discovery raises
    ``sa.exc.CompileError`` or ``NotImplementedError`` are skipped with a
    warning rather than aborting the whole call.

    Returns
    -------
    DataShape
        A ``Record`` of ``[table name, discovered datashape]`` pairs.
    """
    try:
        metadata.reflect(views=metadata.bind.dialect.supports_views)
    except NotImplementedError:
        metadata.reflect()

    pairs = []
    for table in sorted(metadata.tables.values(), key=attrgetter('name')):
        name = table.name
        try:
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            # Exceptions have no ``.message`` attribute on Python 3; fall
            # back to ``str(e)`` so the handler itself cannot raise.
            msg = getattr(e, 'message', None) or str(e)
            warnings.warn(
                "Can not discover type of table {name}.\n"
                "SQLAlchemy provided this error message:\n\t{msg}"
                "\nSkipping.".format(
                    name=name,
                    msg=msg,
                ),
                stacklevel=3,
            )
        except NotImplementedError as e:
            # ``e.args`` may hold non-string values; stringify before join.
            warnings.warn(
                "Odo does not understand a SQLAlchemy type.\n"
                "Odo provided the following error:\n\t{msg}"
                "\nSkipping.".format(msg="\n\t".join(map(str, e.args))),
                stacklevel=3,
            )
    return DataShape(Record(pairs))
def _dshape(self):
    """Return the datashape of concatenating ``lhs`` and ``rhs`` strings.

    The dimensions are those of ``self.lhs``.  If either operand's measure
    is an ``Option`` that operand's schema is used (left operand checked
    first).  Otherwise both measures must unpack into two parameters with
    equal encodings, and the result is a variable-length ``String`` with
    that encoding.
    """
    dims = self.lhs.dshape.shape
    for operand in (self.lhs, self.rhs):
        if isinstance(operand.schema.measure, Option):
            schema = operand.schema
            break
    else:
        _, lhs_encoding = self.lhs.schema.measure.parameters
        _, rhs_encoding = self.rhs.schema.measure.parameters
        assert lhs_encoding == rhs_encoding
        # A fixed-length string becomes a variable-length string.
        schema = DataShape(String(None, lhs_encoding))
    return DataShape(*(dims + (schema,)))
def schema(self):
    """Return the child's record schema with fields renamed.

    ``self.labels`` is read as ``(old name, new name)`` pairs.  Each field
    of the child's measure keeps its type; its name is replaced when it has
    an entry in ``self.labels`` and kept otherwise.
    """
    subs = dict(self.labels)
    fields = self._child.dshape.measure.parameters[0]
    return DataShape(
        Record([[subs.get(name, name), dtype] for name, dtype in fields]))
def dshape(self):
    """Return the datashape of joining ``lhs`` and ``rhs`` along ``axis``.

    All dimensions and the measure come from ``self.lhs``, except the
    dimension at ``self.axis``, which is ``_shape_add`` of the two operands'
    dimensions on that axis.
    """
    axis = self.axis
    lhs_dshape = self.lhs.dshape
    lhs_dims = lhs_dshape.shape
    joined_dim = _shape_add(lhs_dims[axis], self.rhs.dshape.shape[axis])
    new_dims = lhs_dims[:axis] + (joined_dim,) + lhs_dims[axis + 1:]
    return DataShape(*(new_dims + (lhs_dshape.measure,)))
def _schema(self):
    """Return the result measure for the child's schema.

    The base type is the measure's ``ty`` attribute when present, else the
    measure itself.  ``Decimal`` and ``TimeDelta`` bases are kept; any other
    base becomes ``ct.float64``.  The result is wrapped in ``Option`` when
    the child's measure is an ``Option``.
    """
    measure = self._child.schema.measure
    base = getattr(measure, 'ty', measure)
    if isinstance(measure, Option):
        wrap = Option
    else:
        wrap = toolz.identity
    if isinstance(base, (Decimal, TimeDelta)):
        result_type = base
    else:
        result_type = ct.float64
    return DataShape(wrap(result_type))
def date_to_datetime_dshape(ds):
    """Return ``ds`` with ``date`` record fields replaced by ``datetime``.

    Only a ``Record`` measure is rewritten, field by field; any other
    measure is kept as is.  The dimensions are unchanged.
    """
    dims = ds.shape
    measure = ds.measure
    if isinstance(measure, Record):
        converted = []
        for name, typ in measure.parameters[0]:
            converted.append([name,
                              ct.datetime_ if typ == ct.date_ else typ])
        measure = Record(converted)
    return DataShape(*(dims + (measure,)))
def sql_table(table, colnames, measures, conn):
    """Create a blaze ``Array`` describing a whole SQL table.

    ``colnames`` and ``measures`` are zipped into a ``Record``, which is
    given a single ``Var`` dimension.  ``table`` is wrapped in a
    ``TableSelection`` selecting ``'*'``, and the result is an ``Array``
    over an ``SQLDataDescriptor`` bound to ``conn``.
    """
    record = Record(list(zip(colnames, measures)))
    selection = TableSelection(table, '*')
    descriptor = SQLDataDescriptor(DataShape(coretypes.Var(), record),
                                   selection, conn)
    return Array(descriptor)
def _dshape(self):
    """Return the datashape after reducing over ``self.axis``.

    With ``self.keepdims`` truthy every reduced dimension becomes 1;
    otherwise reduced dimensions are removed.  The measure is
    ``self.schema``.
    """
    axis = self.axis
    keepdims = self.keepdims
    dims = []
    for position, dim in enumerate(self._child.shape):
        if position not in axis:
            dims.append(dim)
        elif keepdims:
            dims.append(1)
    return DataShape(*(tuple(dims) + (self.schema,)))
def _dshape(self):
    """Return the datashape of the field ``self._name`` of the child.

    The field is looked up on the child's measure, or on that measure's
    ``value`` attribute when it has one.  The field's dimensions are
    appended to the child's dimensions and its measure becomes the result
    measure.
    """
    child_dshape = self._child.dshape
    child_dims = child_dshape.shape
    measure = child_dshape.measure
    # TODO: is this too special-case-y?
    record = getattr(measure, 'value', measure)
    field = record.dict[self._name]
    return DataShape(*(child_dims + field.shape + (field.measure,)))
def fsql(engine, fcsv, name):
    """Yield a SQL resource shaped like ``fcsv``, then drop it.

    The datashape is ``var`` by a record whose fields are named ``a`` and
    ``b`` and typed after the discovered measure of ``fcsv``.  The resource
    address is built from ``url`` (read from the enclosing scope, not from
    ``engine``) and ``name``.  If opening it raises
    ``sqlalchemy.exc.OperationalError`` the test is skipped.
    """
    discovered = discover(fcsv)
    record = Record(list(zip('ab', discovered.measure.types)))
    dshape = DataShape(var, record)
    try:
        t = resource('%s::%s' % (url, name), dshape=dshape)
    except sqlalchemy.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        yield t
        drop(t)
def _dshape(self):
    """Return the datashape of a named set of reductions over ``self.axis``.

    With ``self.keepdims`` truthy every reduced dimension becomes 1;
    otherwise reduced dimensions are removed.  The measure is a ``Record``
    pairing each entry of ``self.names`` with the ``schema`` of the value at
    the same position.
    """
    axis = self.axis
    keepdims = self.keepdims
    dims = []
    for position, dim in enumerate(self._child.shape):
        if position not in axis:
            dims.append(dim)
        elif keepdims:
            dims.append(1)
    value_schemas = [value.schema for value in self.values]
    record = Record(list(zip(self.names, value_schemas)))
    return DataShape(*(tuple(dims) + (record,)))
def dshape(self):
    """Return the datashape of contracting ``lhs`` with ``rhs``.

    The dimensions are those of ``lhs`` not listed in ``self._left_axes``
    followed by those of ``rhs`` not listed in ``self._right_axes``.  The
    measure is whatever a multiply followed by an add of the two operand
    measures produces.
    """
    kept_left = [dim for position, dim in enumerate(self.lhs.shape)
                 if position not in self._left_axes]
    kept_right = [dim for position, dim in enumerate(self.rhs.shape)
                  if position not in self._right_axes]
    dims = tuple(kept_left + kept_right)

    # Let the expression system infer the measure of sum-of-products.
    left_sym = symbol('l', self.lhs.dshape.measure)
    right_sym = symbol('r', self.rhs.dshape.measure)
    sum_of_products = (left_sym * right_sym) + (left_sym * right_sym)
    measure = sum_of_products.dshape.measure

    return DataShape(*(dims + (measure,)))
def discover(metadata):
    """Return a record datashape with one field per reflected table.

    ``metadata`` is reflected first, including views when the bound dialect
    reports support for them.  Tables are visited in name order and each
    one is passed to ``discover``.  Tables whose discovery raises
    ``sa.exc.CompileError`` or ``NotImplementedError`` are reported on
    standard output and skipped rather than aborting the whole call.

    Returns
    -------
    DataShape
        A ``Record`` of ``[table name, discovered datashape]`` pairs.
    """
    metadata.reflect(views=metadata.bind.dialect.supports_views)
    pairs = []
    for name, table in sorted(metadata.tables.items(), key=first):
        try:
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            # Exceptions have no ``.message`` attribute on Python 3; fall
            # back to ``str(e)`` so the handler itself cannot raise.
            msg = getattr(e, 'message', None) or str(e)
            print("Can not discover type of table %s.\n" % name +
                  "SQLAlchemy provided this error message:\n\t%s" % msg +
                  "\nSkipping.")
        except NotImplementedError as e:
            msg = getattr(e, 'message', None) or str(e)
            print("Blaze does not understand a SQLAlchemy type.\n"
                  "Blaze provided the following error:\n\t%s" % msg +
                  "\nSkipping.")
    return DataShape(Record(pairs))
def column_dshape(dshape, colname):
    """Project the column ``colname`` out of a record datashape.

    Returns a datashape with the dimensions of ``dshape`` and the measure of
    the named field.

    Raises
    ------
    TypeError
        If the measure of ``dshape`` is not a ``Record``.
    ValueError
        If the record has no field called ``colname``.
    """
    record = dshape.measure
    if not isinstance(record, Record):
        raise TypeError("Can only select fields from record type")
    if colname not in record.fields:
        raise ValueError("No such field %r" % (colname, ))
    column_measure = record.fields[colname]
    return DataShape(*(list(dshape.shape) + [column_measure]))
def compute_down(expr, data, **kwargs):
    """Compute a reducing expression on a chunked dataset, chunk by chunk.

    ``split`` breaks ``expr`` into a per-chunk expression and an
    on-aggregate expression.  Each partition of ``data`` is fetched,
    computed with the per-chunk expression and written into the matching
    partition of an in-memory numpy array.  The on-aggregate expression is
    then computed on that array.

    Both the intermediate array and the final result are assumed to fit in
    memory.  The chunk size is ``kwargs['chunksize']`` when supplied, else
    ``data.chunks``.

    Raises
    ------
    MDNotImplementedError
        If no node on the path from ``expr`` to its first leaf is a
        ``Reduction``.
    """
    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Chunk size selection is simplistic and could be improved.
    chunksize = kwargs.get('chunksize', data.chunks)

    # Per-chunk and on-aggregate pieces of the expression.
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure, )))
    chunk = Symbol('chunk', chunk_dshape)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    # In-memory array holding one result per chunk.
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    source_parts = partitions(data, chunksize=chunksize)
    target_parts = partitions(intermediate, chunksize=chunk_expr.shape)

    # Sequential; each pair is independent and could be parallelised.
    for source, target in zip(source_parts, target_parts):
        block = partition_get(data, source, chunksize=chunksize)
        partial = compute(chunk_expr, {chunk: block})
        partition_set(intermediate, target, partial,
                      chunksize=chunk_expr.shape)

    return compute(agg_expr, {agg: intermediate})
def compute_down(expr, data, map=None, **kwargs):
    """Compute a reducing expression on a chunked dataset, chunk by chunk.

    ``split`` breaks ``expr`` into a per-chunk expression and an
    on-aggregate expression.  ``compute_chunk`` is applied, through the
    mapping function, to every pair of source partition (from ``data``) and
    target partition (from an in-memory numpy array).  The on-aggregate
    expression is then computed on that array with ``return_type='native'``.

    Both the intermediate array and the final result are assumed to fit in
    memory.  ``map`` is resolved through ``_get_map``; the chunk size is
    ``kwargs['chunksize']`` when supplied, else ``data.chunks``.

    Raises
    ------
    MDNotImplementedError
        If no node on the path from ``expr`` to its first leaf is a
        ``Reduction``.
    """
    map = _get_map(map)

    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Chunk size selection is simplistic and could be improved.
    chunksize = kwargs.get('chunksize', data.chunks)

    # Per-chunk and on-aggregate pieces of the expression.
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure,)))
    chunk = symbol('chunk', chunk_dshape)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    # In-memory array holding one result per chunk.
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    source_parts = list(partitions(data, chunksize=chunksize,
                                   keepdims=True))
    target_parts = list(partitions(intermediate,
                                   chunksize=chunk_expr.shape,
                                   keepdims=True))

    # ``list`` forces evaluation in case the mapping function is lazy.
    worker = curry(compute_chunk, data, intermediate, chunk, chunk_expr)
    list(map(worker, zip(source_parts, target_parts)))

    return compute(agg_expr, {agg: intermediate}, return_type='native')
def dynd_arr(self):
    """Return all chunks of ``self.query_result`` as one dynd array.

    The array is built once and cached on ``self._dynd_result``; later calls
    return the cached array.  ``self.query_result`` is iterated twice (once
    to size the array, once to fill it), so it is assumed to be re-iterable
    -- confirm against its definition.
    """
    # TODO: This should really use blz
    cached = self._dynd_result
    if cached is not None:
        return cached

    # Size the output from the combined chunk lengths.
    total = sum(len(part) for part in self.query_result)
    out_dshape = DataShape(total, self.dshape.measure)
    result = nd.empty(str(out_dshape))

    # Copy each chunk into its slice of the output.
    start = 0
    for part in self.query_result:
        stop = start + len(part)
        result[start:stop] = part
        start = stop

    self._dynd_result = result
    return result
def dynd_chunk_iterator(result, chunk_size=1024):
    """Yield the rows of a query ``result`` as a series of DyND arrays.

    Rows are fetched from ``result.cursor`` in batches of
    ``max(cursor.arraysize, chunk_size)``.  Iteration stops when a fetch
    returns nothing or raises ``db.Error``.
    """
    cursor = result.cursor
    batch_size = max(cursor.arraysize, chunk_size)
    while True:
        try:
            rows = cursor.fetchmany(batch_size)
        except db.Error:
            return
        if not rows:
            return
        batch_dshape = DataShape(len(rows), result.dshape.measure)
        batch = nd.empty(str(batch_dshape))
        batch[:] = list(iter_result(rows, batch_dshape))
        yield batch
def coalesce(a, b):
    """Return an expression choosing ``a`` where present, else ``b``.

    Shortcuts based on the discovered measure of ``a``:

    * a ``Null`` measure (directly or inside an ``Option``) returns ``b``;
    * a non-``Option`` measure returns ``a``.

    Otherwise a ``Coalesce`` is returned whose datashape combines the two
    shapes with ``maxshape`` and the two measures with ``promote``.
    """
    a_dshape = discover(a)
    a_measure = a_dshape.measure

    a_is_option = isinstance(a_measure, Option)
    if a_is_option:
        a_measure = a_measure.ty

    if isinstance(a_measure, Null):
        # ``a`` is always null, so the result is just ``b``.
        return b
    if not a_is_option:
        # ``a`` can never be missing, so the result is just ``a``.
        return a

    b_dshape = discover(b)
    dims = maxshape((a_dshape.shape, b_dshape.shape))
    measure = promote(a_measure, b_dshape.measure)
    return Coalesce(a, b, DataShape(*(dims + (measure,))))
def sql_table(table_name, colnames, measures, conn):
    """Create a blaze ``Array`` describing a whole SQL table.

    The array has a single ``Var`` dimension and a ``Record`` measure.

    Parameters
    ==========
    table_name: str
        Name of the table; every column is selected (``'*'``).
    colnames: [str]
        Column names, in order.
    measures: [DataShape]
        Element type of each column, in the same order as ``colnames``.
    conn: pyodbc/whatever Connection
        Connection handed to the ``SQL_DDesc`` descriptor.
    """
    record = Record(list(zip(colnames, measures)))
    selection = TableSelection(table_name, '*')
    descriptor = SQL_DDesc(DataShape(coretypes.Var(), record),
                           selection, conn)
    return Array(descriptor)