def ones(dshape, caps={'efficient-write': True}, storage=None):
    """Create an array and fill it with ones.

    Parameters
    ----------
    dshape : datashape
        The datashape for the resulting array.
    caps : capabilities dictionary
        A dictionary containing the desired capabilities of the array.
        Only the presence of the keys 'efficient-write' and 'compress'
        is checked (in that order); the dictionary is never modified.
    storage : Storage instance
        A Storage object with the necessary info for data storage.
        When the converted storage is not None it takes precedence
        over `caps` and a BLZ array rooted at ``storage.path`` is built.

    Returns
    -------
    out: a concrete blaze array.

    Raises
    ------
    ValueError
        If no storage is given and `caps` contains neither
        'efficient-write' nor 'compress'.

    """
    dshape = _normalize_dshape(dshape)
    storage = _storage_convert(storage)

    if storage is not None:
        shape, dt = to_numpy(dshape)
        dd = BLZDataDescriptor(blz.ones(shape, dt, rootdir=storage.path))
    elif 'efficient-write' in caps:
        # TODO: Handle var dimension properly (raise exception?)
        dyndarr = nd.empty(str(dshape))
        dyndarr[...] = True
        dd = DyNDDataDescriptor(dyndarr)
    elif 'compress' in caps:
        shape, dt = to_numpy(dshape)
        dd = BLZDataDescriptor(blz.ones(shape, dt))
    else:
        # Without this branch `dd` stayed unbound and the return below
        # failed with an opaque UnboundLocalError.
        raise ValueError(
            "caps must contain 'efficient-write' or 'compress' "
            "when no storage is given")
    return Array(dd)
def dataset_from_dshape(file, datapath, ds, **kwargs):
    """Require a dataset at ``datapath`` in ``file`` shaped after ``ds``.

    A datashape without ``datashape.var`` maps to its fixed numpy shape.
    A datashape whose only ``var`` is the leading element maps to a
    shape starting at 0. For any non-empty shape, ``chunks`` and
    ``maxshape`` get defaults unless the caller supplied them. Only the
    keyword arguments accepted by ``h5py_attributes`` are forwarded to
    ``file.require_dataset``.

    Raises
    ------
    ValueError
        If ``var`` appears anywhere after the first element of ``ds``.
    """
    dtype = varlen_dtype(to_numpy_dtype(ds))

    parts = list(ds)
    if datashape.var not in parts:
        shape = to_numpy(ds)[0]
    elif datashape.var in parts[1:]:
        raise ValueError("Don't know how to handle varlen nd shapes")
    else:
        # Only the leading dimension is variable: start it empty.
        shape = (0,) + to_numpy(ds.subshape[0])[0]

    if shape:
        kwargs.setdefault('chunks', True)
        kwargs.setdefault('maxshape', (None,) + shape[1:])

    accepted = keyfilter(h5py_attributes.__contains__, kwargs)
    return file.require_dataset(datapath, shape=shape, dtype=dtype,
                                **accepted)
def ones(dshape, ddesc=None):
    """Create an array and fill it with ones.

    Parameters
    ----------
    dshape : datashape
        The datashape for the resulting array.
    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.  A BLZ_DDesc gets its
        ``blzarr`` attribute replaced; an HDF5_DDesc gets an EArray
        created at ``ddesc.datapath`` and its ``mode`` switched to 'a'.

    Returns
    -------
    out: a concrete blaze array.

    """
    dshape = _normalize_dshape(dshape)

    if ddesc is None:
        ddesc = DyND_DDesc(nd.ones(str(dshape), access='rw'))
        return Array(ddesc)

    if isinstance(ddesc, BLZ_DDesc):
        shape, dt = to_numpy(dshape)
        ddesc.blzarr = blz.ones(
            shape, dt, rootdir=ddesc.path, mode=ddesc.mode, **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        # The seed array must really hold ones: it used to be built with
        # nd.empty, which stored uninitialised memory in the HDF5 file.
        obj = nd.as_numpy(nd.ones(str(dshape), access='rw'))
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations
    return Array(ddesc)
def empty(dshape, caps={'efficient-write': True}, storage=None):
    """Create an array with uninitialized data.

    Parameters
    ----------
    dshape : datashape
        The datashape for the resulting array.
    caps : capabilities dictionary
        A dictionary containing the desired capabilities of the array.
        Only the presence of the keys 'efficient-write' and 'compress'
        is checked (in that order); the dictionary is never modified.
    storage : Storage instance
        A Storage object with the necessary info for data storage.
        When the converted storage is not None it takes precedence
        over `caps`.

    Returns
    -------
    out : a concrete blaze array.

    Raises
    ------
    ValueError
        If no storage is given and `caps` contains neither
        'efficient-write' nor 'compress'.

    Notes
    -----
    The BLZ-backed variants are created with ``blz.zeros``, so their
    contents are zero-filled rather than uninitialized.

    """
    dshape = _normalize_dshape(dshape)
    storage = _storage_convert(storage)

    if storage is not None:
        shape, dt = to_numpy(dshape)
        dd = BLZDataDescriptor(blz.zeros(shape, dt, rootdir=storage.path))
    elif 'efficient-write' in caps:
        dd = DyNDDataDescriptor(nd.empty(str(dshape)))
    elif 'compress' in caps:
        # `shape` and `dt` were previously only bound in the storage
        # branch, so this path always failed with an unbound local.
        shape, dt = to_numpy(dshape)
        dd = BLZDataDescriptor(blz.zeros(shape, dt))
    else:
        # Without this branch `dd` stayed unbound and the return below
        # failed with an opaque UnboundLocalError.
        raise ValueError(
            "caps must contain 'efficient-write' or 'compress' "
            "when no storage is given")
    return Array(dd)
def dataset_from_dshape(file, datapath, ds, **kwargs):
    """Build the shape/dtype for ``ds`` and require the matching dataset.

    The dtype comes from ``varlen_dtype(to_numpy_dtype(ds))``.  If
    ``datashape.var`` appears only as the first element of ``ds`` the
    leading extent is 0; if it appears later a ValueError is raised.
    ``chunks`` and ``maxshape`` defaults are filled in for non-empty
    shapes, and keyword arguments are filtered through
    ``h5py_attributes`` before calling ``file.require_dataset``.
    """
    elements = list(ds)
    leading_var_only = (datashape.var in elements
                        and datashape.var not in elements[1:])

    dtype = varlen_dtype(to_numpy_dtype(ds))
    if datashape.var not in elements:
        shape = to_numpy(ds)[0]
    elif leading_var_only:
        shape = (0,) + to_numpy(ds.subshape[0])[0]
    else:
        raise ValueError("Don't know how to handle varlen nd shapes")

    if shape:
        # Caller-supplied values win over these defaults.
        kwargs.setdefault('chunks', True)
        kwargs.setdefault('maxshape', (None,) + shape[1:])

    return file.require_dataset(
        datapath,
        shape=shape,
        dtype=dtype,
        **keyfilter(h5py_attributes.__contains__, kwargs)
    )
def interpret(func, env, args, storage=None, **kwds):
    """Evaluate ``func`` on ``args``, in memory or streamed into BLZ.

    Parameters
    ----------
    func : function IR
        Copied with ``copy_function`` before the run-time passes are
        applied, so the caller's IR is left alone.
    env : dict
        Pipeline environment.  It is updated in place with
        'dynd-types', and with 'stream-outer' and 'result-ndim' when
        `storage` is given.
    args : sequence of arrays
        One array per entry of ``func.args``.  The caller's sequence is
        not modified.
    storage : Storage instance, optional
        If given, the result is computed chunk by chunk along the
        outermost dimension and appended to a BLZ array rooted at
        ``storage.path``.
    **kwds
        Accepted and ignored.

    Returns
    -------
    The interpreter result when `storage` is None, otherwise a
    ``blaze.Array`` wrapping the BLZ data descriptor.

    Raises
    ------
    TypeError
        If `storage` is given and the result type is a scalar.
    """
    assert len(args) == len(func.args)

    # Work on a private list: persistent inputs are replaced below, and
    # the caller's sequence must not be altered (it may even be a tuple).
    args = list(args)

    # Make a copy, since we're going to mutate our IR!
    func, _ = copy_function(func)

    # If it's a BLZ output, we want an interpreter that streams
    # the processing through in chunks
    if storage is not None:
        if len(func.type.restype.shape) == 0:
            raise TypeError('Require an array, not a scalar, for outputting to BLZ')
        env['stream-outer'] = True
        result_ndim = env['result-ndim'] = len(func.type.restype.shape)
    else:
        # Convert any persistent inputs to memory
        # TODO: should stream the computation in this case
        for i, arg in enumerate(args):
            if isinstance(arg._data, BLZDataDescriptor):
                args[i] = arg[:]

    # Update environment with dynd type information
    dynd_types = dict((arg, get_dynd_type(array))
                      for arg, array in zip(func.args, args)
                      if isinstance(array._data, DyNDDataDescriptor))
    env['dynd-types'] = dynd_types

    # Lift ckernels
    func, env = run_pipeline(func, env, run_time_passes)

    if storage is None:
        # Evaluate once
        values = dict(zip(func.args, args))
        interp = CKernelInterp(values)
        visit(interp, func)
        return interp.result

    res_shape, res_dt = datashape.to_numpy(func.type.restype)
    dim_size = operator.index(res_shape[0])
    row_size = ndt.type(str(func.type.restype.subarray(1))).data_size
    # Aim for roughly 1 MiB per chunk, never more rows than exist.  The
    # outer max() keeps the step at 1 for an empty outer dimension;
    # range() rejects a zero step with ValueError.
    chunk_size = max(1, min(max(1, (1024*1024) // row_size), dim_size))

    # Evaluate by streaming the outermost dimension,
    # and using the BLZ data descriptor's append
    dst_dd = BLZDataDescriptor(blz.zeros((0,)+res_shape[1:], res_dt,
                                         rootdir=storage.path))
    # Loop through all the chunks
    for chunk_start in range(0, dim_size, chunk_size):
        # Tell the interpreter which chunk size to use (last
        # chunk might be smaller)
        chunk_size = min(chunk_size, dim_size - chunk_start)
        # Evaluate the chunk; only arguments with the result's rank are
        # sliced, the others are passed whole.
        args_chunk = [arg[chunk_start:chunk_start+chunk_size]
                      if len(arg.dshape.shape) == result_ndim
                      else arg for arg in args]
        values = dict(zip(func.args, args_chunk))
        interp = CKernelChunkInterp(values, chunk_size, result_ndim)
        visit(interp, func)
        chunk = interp.result._data.dynd_arr()
        dst_dd.append(chunk)
    return blaze.Array(dst_dd)
def compute_up(expr, data, **kwargs):
    """Compute every value of ``expr`` against ``data``.

    When ``expr.dshape`` converts to a non-empty numpy shape, the
    results are written field by field (keyed by ``expr.names``) into a
    freshly allocated array of that shape and dtype.  Otherwise the
    results are returned as a plain tuple.
    """
    shape, dtype = to_numpy(expr.dshape)

    if not shape:
        # Scalar-shaped result: no record array needed.
        return tuple(compute(axify(value, expr.axis), data)
                     for value in expr.values)

    out = np.empty(shape=shape, dtype=dtype)
    for field, value in zip(expr.names, expr.values):
        reduced = axify(value, expr.axis, expr.keepdims)
        out[field] = compute(reduced, data)
    return out
def test_summary_on_ndarray_with_axis():
    """summary() over an ndarray matches numpy for several axis specs."""
    for axis in (0, 1, (1, 0)):
        expr = summary(total=a.sum(), min=a.min(), axis=axis)
        result = compute(expr, ax)

        # Build the reference record array with plain numpy reductions.
        np_shape, np_dtype = to_numpy(expr.dshape)
        expected = np.empty(shape=np_shape, dtype=np_dtype)
        expected['total'] = ax.sum(axis=axis)
        expected['min'] = ax.min(axis=axis)

        assert eq(result, expected)
def append(self, values):
    """Append ``values`` to the underlying BLZ array and flush it.

    ``values`` is converted to a numpy array of this descriptor's dtype.
    If it has fewer dimensions than the descriptor, a leading axis of
    length 1 is added.

    Raises
    ------
    ValueError
        If the dimensionality still differs from the descriptor's.
    """
    shape, dtype = datashape.to_numpy(self.dshape)
    expected_ndim = len(shape)

    arr = np.array(values, dtype=dtype)
    target = arr.shape
    if len(target) < expected_ndim:
        target = (1,) + target
    if len(target) != expected_ndim:
        raise ValueError("shape of values is not compatible")

    # Now, do the actual append
    self.blzarr.append(arr.reshape(target))
    self.blzarr.flush()
def append(self, values):
    """Append a list of values to ``self.blzarr`` and flush.

    The values are cast to the dtype derived from ``self.dshape``.  An
    input with fewer dimensions than the descriptor is treated as a
    single entry by adding a leading axis of length 1; any other
    dimensionality mismatch raises ValueError.
    """
    shape, dtype = datashape.to_numpy(self.dshape)
    rank = len(shape)

    values_arr = np.array(values, dtype=dtype)
    if values_arr.ndim < rank:
        new_shape = (1,) + values_arr.shape
    else:
        new_shape = values_arr.shape

    if len(new_shape) != rank:
        raise ValueError("shape of values is not compatible")

    reshaped = values_arr.reshape(new_shape)
    self.blzarr.append(reshaped)
    self.blzarr.flush()
def append(self, values):
    """Append ``values`` to the HDF5 node at ``self.datapath``.

    The file ``self.filename`` is opened in 'a' mode for the duration of
    the append.  ``values`` is cast to the descriptor's dtype; an input
    with fewer dimensions than the descriptor gets a leading axis of
    length 1.

    Raises
    ------
    ValueError
        If the dimensionality still differs from the descriptor's.
    """
    shape, dtype = datashape.to_numpy(self.dshape)
    expected_ndim = len(shape)

    arr = np.array(values, dtype=dtype)
    target = arr.shape
    if len(target) < expected_ndim:
        target = (1,) + target
    if len(target) != expected_ndim:
        raise ValueError("shape of values is not compatible")

    # Now, do the actual append
    with tb.open_file(self.filename, mode='a') as h5file:
        node = h5file.get_node(h5file.root, self.datapath)
        node.append(arr.reshape(target))
def append(self, values):
    """Append a list of values to the dataset at ``self.datapath``.

    The file at ``self.path`` is opened with ``self.mode`` only while
    appending.  Values are cast to the dtype derived from
    ``self.dshape``; an input with fewer dimensions than the descriptor
    gets a leading axis of length 1, and any other dimensionality
    mismatch raises ValueError.
    """
    shape, dtype = datashape.to_numpy(self.dshape)
    rank = len(shape)

    values_arr = np.array(values, dtype=dtype)
    if values_arr.ndim < rank:
        new_shape = (1,) + values_arr.shape
    else:
        new_shape = values_arr.shape

    if len(new_shape) != rank:
        raise ValueError("shape of values is not compatible")

    reshaped = values_arr.reshape(new_shape)
    with tb.open_file(self.path, mode=self.mode) as h5file:
        h5file.get_node(self.datapath).append(reshaped)
def compute_down(expr, data, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    The expression is split (blaze.expr.split) into a per-chunk
    expression and an on-aggregate expression.  Each chunk of the
    dataset is picked out with blaze.partition, computed, and stored in
    a local numpy array; the aggregate expression is then computed on
    that intermediate array.

    The expression must contain some sort of Reduction, otherwise
    MDNotImplementedError is raised.  Both the intermediate result and
    the final result are assumed to fit into memory.

    A ``chunksize`` keyword overrides the default of ``data.chunks``.
    """
    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Compute chunksize (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Split expression into per-chunk and on-aggregate pieces
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure,)))
    per_chunk, on_aggregate = split(leaf, expr,
                                    chunk=Symbol('chunk', chunk_dshape))
    chunk, chunk_expr = per_chunk
    agg, agg_expr = on_aggregate

    # Create numpy array to hold intermediate aggregate
    agg_shape, agg_dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=agg_shape, dtype=agg_dtype)

    # Pair every source partition with its slot in the intermediate
    # array.  This could be parallelized.
    sources = partitions(data, chunksize=chunksize)
    targets = partitions(intermediate, chunksize=chunk_expr.shape)
    for src, dst in zip(sources, targets):
        block = partition_get(data, src, chunksize=chunksize)
        partial = compute(chunk_expr, {chunk: block})
        partition_set(intermediate, dst, partial,
                      chunksize=chunk_expr.shape)

    # Compute on the aggregate
    return compute(agg_expr, {agg: intermediate})
def compute_down(expr, data, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    Uses blaze.expr.split to break the full-array computation into a
    per-chunk computation and an on-aggregate computation.  Chunks are
    taken from the dataset with blaze.partition and computed one at a
    time.  The per-chunk results are gathered into a local numpy array,
    on which the aggregate expression is finally evaluated.

    The expression must contain some sort of Reduction
    (MDNotImplementedError otherwise).  Both the intermediate result
    and the final result are assumed to fit into memory.
    """
    leaf = expr._leaves()[0]
    for node in path(expr, leaf):
        if isinstance(node, Reduction):
            break
    else:
        # No reduction on the way from the expression to its leaf.
        raise MDNotImplementedError()

    # Compute chunksize (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Split expression into per-chunk and on-aggregate pieces
    chunk = Symbol('chunk', DataShape(*(chunksize + (leaf.dshape.measure,))))
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    # Create numpy array to hold intermediate aggregate
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    # Walk data partitions and intermediate partitions in lockstep,
    # computing chunk -> chunk_expr and storing each partial result.
    # This could be parallelized.
    out_chunksize = chunk_expr.shape
    pairs = zip(partitions(data, chunksize=chunksize),
                partitions(intermediate, chunksize=out_chunksize))
    for data_part, out_part in pairs:
        chunk_data = partition_get(data, data_part, chunksize=chunksize)
        partition_set(intermediate, out_part,
                      compute(chunk_expr, {chunk: chunk_data}),
                      chunksize=chunk_expr.shape)

    # Compute on the aggregate
    return compute(agg_expr, {agg: intermediate})
def compute_down(expr, data, map=None, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    The expression is split (blaze.expr.split) into a per-chunk part
    and an on-aggregate part.  Source partitions of the dataset and
    target partitions of a local numpy array are paired up, and
    ``compute_chunk`` is applied to every pair through the mapping
    function obtained from ``_get_map(map)``.  The aggregate expression
    is then computed on the filled intermediate array with
    ``return_type='native'``.

    The expression must contain some sort of Reduction
    (MDNotImplementedError otherwise).  Both the intermediate result
    and the final result are assumed to fit into memory.
    """
    map = _get_map(map)

    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Compute chunksize (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Split expression into per-chunk and on-aggregate pieces
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure,)))
    per_chunk, on_aggregate = split(leaf, expr,
                                    chunk=symbol('chunk', chunk_dshape))
    chunk, chunk_expr = per_chunk
    agg, agg_expr = on_aggregate

    # Create numpy array to hold intermediate aggregate
    agg_shape, agg_dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=agg_shape, dtype=agg_dtype)

    # Compute partitions
    source_parts = list(partitions(data, chunksize=chunksize,
                                   keepdims=True))
    target_parts = list(partitions(intermediate,
                                   chunksize=chunk_expr.shape,
                                   keepdims=True))

    # list() forces the mapping to run to completion.
    worker = curry(compute_chunk, data, intermediate, chunk, chunk_expr)
    list(map(worker, zip(source_parts, target_parts)))

    # Compute on the aggregate
    return compute(agg_expr, {agg: intermediate}, return_type='native')
def interpret(func, env, storage=None, **kwds):
    """Evaluate ``func`` on the run-time arguments stored in ``env``.

    Parameters
    ----------
    func : function IR
        The function to visit with the ckernel interpreter.
    env : dict
        Must provide 'runtime.arglist'; 'result-ndim' is also read when
        `storage` is given.
    storage : Storage instance, optional
        If given, the result is computed chunk by chunk along the
        outermost dimension and appended to a BLZ array rooted at
        ``storage.path``.
    **kwds
        Accepted and ignored.

    Returns
    -------
    The interpreter result when `storage` is None, otherwise a
    ``blaze.Array`` wrapping the BLZ data descriptor.
    """
    args = env['runtime.arglist']

    if storage is None:
        # Evaluate once
        values = dict(zip(func.args, args))
        interp = CKernelInterp(values)
        visit(interp, func)
        return interp.result

    result_ndim = env['result-ndim']

    res_shape, res_dt = datashape.to_numpy(func.type.restype)
    dim_size = operator.index(res_shape[0])
    row_size = ndt.type(str(func.type.restype.subarray(1))).data_size
    # Aim for roughly 1 MiB per chunk, never more rows than exist.  The
    # outer max() keeps the step at 1 for an empty outer dimension;
    # range() rejects a zero step with ValueError.
    chunk_size = max(1, min(max(1, (1024 * 1024) // row_size), dim_size))

    # Evaluate by streaming the outermost dimension,
    # and using the BLZ data descriptor's append
    dst_dd = BLZDataDescriptor(
        blz.zeros((0, ) + res_shape[1:], res_dt, rootdir=storage.path))

    # Loop through all the chunks
    for chunk_start in range(0, dim_size, chunk_size):
        # Tell the interpreter which chunk size to use (last
        # chunk might be smaller)
        chunk_size = min(chunk_size, dim_size - chunk_start)
        # Evaluate the chunk; only arguments with the result's rank are
        # sliced, the others are passed whole.
        args_chunk = [
            arg[chunk_start:chunk_start + chunk_size]
            if len(arg.dshape.shape) == result_ndim else arg
            for arg in args
        ]
        values = dict(zip(func.args, args_chunk))
        interp = CKernelChunkInterp(values, chunk_size, result_ndim)
        visit(interp, func)
        chunk = interp.result._data.dynd_arr()
        dst_dd.append(chunk)

    return blaze.Array(dst_dd)
def interpret(func, env, ddesc=None, **kwds):
    """Evaluate ``func`` on the run-time arguments stored in ``env``.

    Parameters
    ----------
    func : function IR
        The function to visit with the ckernel interpreter.
    env : dict
        Must provide 'runtime.arglist'; 'result-ndim' is also read when
        `ddesc` is given.
    ddesc : data descriptor instance, optional
        If given, its ``blzarr`` attribute is replaced by a new BLZ
        array created at ``ddesc.path`` with ``ddesc.mode``.  The result
        is then computed chunk by chunk along the outermost dimension
        and appended through ``ddesc.append``.
    **kwds
        Accepted and ignored.

    Returns
    -------
    The interpreter result when `ddesc` is None, otherwise a
    ``blaze.Array`` wrapping `ddesc`.
    """
    args = env['runtime.arglist']

    if ddesc is None:
        # Evaluate once
        values = dict(zip(func.args, args))
        interp = CKernelInterp(values)
        visit(interp, func)
        return interp.result

    result_ndim = env['result-ndim']

    res_shape, res_dt = datashape.to_numpy(func.type.restype)
    dim_size = operator.index(res_shape[0])
    row_size = ndt.type(str(func.type.restype.subarray(1))).default_data_size
    # Aim for roughly 1 MiB per chunk, never more rows than exist.  The
    # outer max() keeps the step at 1 for an empty outer dimension;
    # range() rejects a zero step with ValueError.
    chunk_size = max(1, min(max(1, (1024*1024) // row_size), dim_size))

    # Evaluate by streaming the outermost dimension,
    # and using the BLZ data descriptor's append
    ddesc.blzarr = blz.zeros((0,)+res_shape[1:], res_dt,
                             rootdir=ddesc.path, mode=ddesc.mode)

    # Loop through all the chunks
    for chunk_start in range(0, dim_size, chunk_size):
        # Tell the interpreter which chunk size to use (last
        # chunk might be smaller)
        chunk_size = min(chunk_size, dim_size - chunk_start)
        # Evaluate the chunk; only arguments with the result's rank are
        # sliced, the others are passed whole.
        args_chunk = [arg[chunk_start:chunk_start+chunk_size]
                      if len(arg.dshape.shape) == result_ndim
                      else arg for arg in args]
        values = dict(zip(func.args, args_chunk))
        interp = CKernelChunkInterp(values, chunk_size, result_ndim)
        visit(interp, func)
        chunk = interp.result.ddesc.dynd_arr()
        ddesc.append(chunk)

    return blaze.Array(ddesc)
def ones(dshape, ddesc=None):
    """Create an array and fill it with ones.

    Parameters
    ----------
    dshape : datashape
        The datashape for the resulting array.
    ddesc : data descriptor instance
        This comes with the necessary info for storing the data.  If
        None, a DyND_DDesc will be used.  A BLZ_DDesc gets its
        ``blzarr`` attribute replaced; an HDF5_DDesc gets an EArray
        created at ``ddesc.datapath`` and its ``mode`` switched to 'a'.

    Returns
    -------
    out: a concrete blaze array.

    """
    dshape = _normalize_dshape(dshape)

    if ddesc is None:
        ddesc = DyND_DDesc(nd.ones(str(dshape), access='rw'))
        return Array(ddesc)

    if isinstance(ddesc, BLZ_DDesc):
        shape, dt = to_numpy(dshape)
        ddesc.blzarr = blz.ones(shape, dt, rootdir=ddesc.path,
                                mode=ddesc.mode, **ddesc.kwargs)
    elif isinstance(ddesc, HDF5_DDesc):
        # The seed array must really hold ones: it used to be built with
        # nd.empty, which stored uninitialised memory in the HDF5 file.
        obj = nd.as_numpy(nd.ones(str(dshape), access='rw'))
        with tb.open_file(ddesc.path, mode=ddesc.mode) as f:
            where, name = split_path(ddesc.datapath)
            f.create_earray(where, name, filters=ddesc.filters, obj=obj)
        ddesc.mode = 'a'  # change into 'a'ppend mode for further operations
    return Array(ddesc)
def create_np_ndarray(_, dshape=None, **kwargs):
    """Allocate an uninitialised numpy array matching ``dshape``.

    The first positional argument and any extra keyword arguments are
    ignored; only ``dshape`` determines the shape and dtype.
    """
    np_shape, np_dtype = datashape.to_numpy(dshape)
    allocated = np.empty(shape=np_shape, dtype=np_dtype)
    return allocated
def test_to_numpy_fields(self):
    """A record datashape converts to shape () and a structured dtype."""
    import numpy as np

    record = datashape.dshape('{x: int32, y: float32}')
    expected_dtype = np.dtype([('x', 'int32'), ('y', 'float32')])

    shape, dt = datashape.to_numpy(record)

    self.assertEqual(shape, ())
    self.assertEqual(dt, expected_dtype)
def _to_numpy(ds):
    """Return ``to_numpy(ds)``, always as a ``(shape, dtype)`` pair.

    If ``to_numpy`` returns anything other than an exact ``tuple``, the
    result is replaced by an empty shape paired with
    ``to_numpy_dtype(ds)``.
    """
    converted = to_numpy(ds)
    if type(converted) is tuple:
        return converted
    return ((), to_numpy_dtype(ds))
def test_to_numpy_fails():
    """to_numpy rejects var-length and Option datashapes with TypeError."""
    variable_length = var * int32

    with pytest.raises(TypeError):
        to_numpy(variable_length)

    # The Option is built inside the context, as part of the checked call.
    with pytest.raises(TypeError):
        to_numpy(Option(int32))
def _eval_blocks(expression, vars, vlen, rowsize, vm, **kwargs):
    """Perform the evaluation in blocks.

    Parameters
    ----------
    expression : str
        The expression handed to ``eval`` (``vm == "python"``) or to
        ``numexpr.evaluate`` (any other `vm`).
    vars : dict
        Operands by name.  Entries without a ``dshape`` attribute are
        replaced in this dictionary by Blaze arrays.
    vlen : int
        Length of the leading dimension to iterate over.
    rowsize : int
        Divisor applied to the cache budget to obtain the block size in
        elements.
    vm : str
        "numexpr" or "python"; selects both the cache budget and the
        evaluator.
    **kwargs
        Passed on to ``empty``/``array`` when building the result.  A
        non-None 'ddesc' entry makes the result grow by appending
        blocks instead of being preallocated with length `vlen`.

    Returns
    -------
    The accumulated result: a Blaze array built from a dynd array for
    scalar and dimension-reducing expressions, otherwise the array that
    the blocks were written or appended to.
    """

    # Compute the optimal block size (in elements)
    # The next is based on experiments, but YMMV
    if vm == "numexpr":
        # If numexpr, make sure that operands fit in L3 cache
        bsize = 2**20  # 1 MB is common for L3
    else:
        # If python, make sure that operands fit in L2 cache
        # NOTE(review): 2**17 is 128 KB, although 256 KB is cited as
        # the common L2 size -- confirm which one is intended.
        bsize = 2**17
    bsize //= rowsize
    # Protection against too large rowsizes.  This has to happen before
    # the logarithm below: math.log(0, 2) raises ValueError.
    if bsize < 1:
        bsize = 1
    # Evaluation seems more efficient if block size is a power of 2
    bsize = 2 ** (int(math.log(bsize, 2)))
    if vlen < 100*1000:
        bsize //= 8
    elif vlen < 1000*1000:
        bsize //= 4
    elif vlen < 10*1000*1000:
        bsize //= 2
    # The reductions above can bring a small block size down to zero
    if bsize == 0:
        bsize = 1

    vars_ = {}
    # Convert operands into Blaze arrays and get temporaries for vars
    maxndims = 0
    for name in dict_viewkeys(vars):
        var = vars[name]
        if not hasattr(var, "dshape"):
            # Convert sequences into regular Blaze arrays
            vars[name] = var = array(var)
        if hasattr(var, "__len__"):
            ndims = len(var.dshape.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > bsize:
                # Variable is too large; get a container for a chunk
                res_shape, res_dtype = datashape.to_numpy(var.dshape)
                res_shape = list(res_shape)
                res_shape[0] = bsize
                dshape = datashape.from_numpy(res_shape, res_dtype)
                vars_[name] = empty(dshape)

    # True when the caller asked for a specific data descriptor
    res_ddesc = kwargs.get('ddesc') is not None

    for i in xrange(0, vlen, bsize):
        # Correction for the block size
        if i+bsize > vlen:
            bsize = vlen - i
        # Get buffers for vars
        for name in dict_viewkeys(vars):
            var = vars[name]
            if hasattr(var, "__len__") and len(var) > bsize:
                vars_[name] = var[i:i+bsize]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        # We need array evals
        if vm == "python":
            # SECURITY: eval() executes arbitrary code; `expression`
            # must never come from an untrusted source.
            res_block = eval(expression, vars_)
            dynd_block = blaze_eval(res_block).ddesc.dynd_arr()
        else:
            res_block = numexpr.evaluate(expression, local_dict=vars_)
            # numexpr returns a numpy array, and we need dynd/blaze ones
            dynd_block = nd.array(res_block)
            res_block = array(res_block)

        if i == 0:
            scalar = False
            dim_reduction = False
            # Detection of reduction operations
            if res_block.dshape.shape == ():
                scalar = True
                result = dynd_block
                continue
            elif len(res_block.dshape.shape) < maxndims:
                dim_reduction = True
                result = dynd_block
                continue
            block_shape, block_dtype = datashape.to_numpy(res_block.dshape)
            out_shape = list(block_shape)
            if res_ddesc:
                # Start empty and grow by appending each block
                out_shape[0] = 0
                dshape = datashape.from_numpy(out_shape, block_dtype)
                result = empty(dshape, **kwargs)
                append(result, dynd_block)
            else:
                # Preallocate the full result and fill it block by block
                out_shape[0] = vlen
                dshape = datashape.from_numpy(out_shape, block_dtype)
                result = empty(dshape, **kwargs)
                # The next is a workaround for bug #183
                #result[:bsize] = res_block
                result[:bsize] = dynd_block
        else:
            if scalar:
                result += dynd_block
                result = result.eval()
            elif dim_reduction:
                if len(res_block) < len(result):
                    result[:bsize] += dynd_block
                else:
                    result += dynd_block
                result = result.eval()
            elif res_ddesc:
                append(result, dynd_block)
            else:
                # The next is a workaround for bug #183
                #result[i:i+bsize] = res_block
                result[i:i+bsize] = dynd_block

    # Scalars and dim reductions generate dynd array for workaround
    # different issues in Blaze array operations (see #197)
    if isinstance(result, nd.array):
        if scalar:
            return array(result)
        else:
            # If not an scalar pass the arguments (persistency, etc.)
            return array(result, **kwargs)
    return result