from typing import Any, List, Tuple, Union

import numpy as np
import ndindex as ndx


def _expand_key(
    key: Any,
    shape: Tuple[int, int],
) -> Tuple[Union[int, slice], Union[int, slice], Union[str, List[str]]]:
    # convert to tuple
    key = np.index_exp[key]

    if len(key) > 3:
        raise IndexError("too many indices provided")

    if len(key) == 3:
        expanded_key = ndx.ndindex(key[:2]).expand(shape).raw + (key[2],)
    else:
        expanded_key = ndx.ndindex(key).expand(shape).raw + ([],)

    return expanded_key
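# A minimal sketch (not part of the original source) of what the expand()
# call in _expand_key produces: .expand(shape) normalizes any key into a
# full tuple with one explicit entry per axis, which is what makes the
# length checks above reliable.
import numpy as np
import ndindex as ndx

print(ndx.ndindex(np.index_exp[2]).expand((4, 5)).raw)
# -> (2, slice(0, 5, 1))
print(ndx.ndindex(np.index_exp[..., 0]).expand((4, 5)).raw)
# -> (slice(0, 4, 1), 0)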
def __setitem__(self, index, value):
    if isinstance(self.dataset, InMemoryDataset) and ndindex(index).expand(
            self.shape) == Tuple().expand(self.shape):
        new_dataset = InMemoryArrayDataset(
            self.name,
            np.broadcast_to(value, self.shape).astype(self.dtype),
            self.parent,
            fillvalue=self.fillvalue,
            chunks=self.chunks)
        new_dataset.attrs = self.dataset.attrs
        self.dataset = new_dataset
        return
    self.dataset.__setitem__(index, value)
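# A minimal sketch (not part of the original source) of the equality trick
# used above: a key expanded against the dataset shape compares equal to an
# expanded empty Tuple exactly when it selects the whole dataset, which is
# how the method detects a full overwrite.
from ndindex import Tuple, ndindex

shape = (4, 5)
print(ndindex(slice(None)).expand(shape) == Tuple().expand(shape))  # True
print(ndindex(0).expand(shape) == Tuple().expand(shape))            # False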
from ndindex import ndindex, Slice, Tuple

# Hashtable and split_chunks are versioned_hdf5 internals, assumed to be
# importable from the surrounding package.


def write_dataset_chunks(f, name, data_dict):
    """
    data_dict should be a dictionary mapping chunk index to either an array
    for that chunk, or a slice into the raw data for that chunk
    """
    if name not in f['_version_data']:
        raise NotImplementedError(
            "Use write_dataset() if the dataset does not yet exist")

    ds = f['_version_data'][name]['raw_data']
    chunks = tuple(ds.attrs['chunks'])
    chunk_size = chunks[0]

    shape = tuple(max(c.args[i].stop for c in data_dict)
                  for i in range(len(chunks)))
    all_chunks = list(split_chunks(shape, chunks))
    for c in all_chunks:
        if c not in data_dict:
            raise ValueError(f"data_dict does not include all chunks ({c})")
    for c in data_dict:
        if c not in all_chunks:
            raise ValueError(f"data_dict contains extra chunks ({c})")

    hashtable = Hashtable(f, name)
    slices = {i: None for i in data_dict}
    data_to_write = {}
    for chunk, data_s in data_dict.items():
        if (not isinstance(data_s, (slice, tuple, Tuple, Slice))
                and data_s.dtype != ds.dtype):
            raise ValueError(
                f"dtypes do not match ({data_s.dtype} != {ds.dtype})")

        idx = hashtable.largest_index
        if isinstance(data_s, (slice, tuple, Tuple, Slice)):
            slices[chunk] = ndindex(data_s)
        else:
            raw_slice = Slice(idx * chunk_size,
                              idx * chunk_size + data_s.shape[0])
            data_hash = hashtable.hash(data_s)
            raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
            if raw_slice2 == raw_slice:
                data_to_write[raw_slice] = data_s
            slices[chunk] = raw_slice2

    assert None not in slices.values()
    old_shape = ds.shape
    ds.resize((old_shape[0] + len(data_to_write) * chunk_size,) + chunks[1:])
    for raw_slice, data_s in data_to_write.items():
        c = (raw_slice.raw,) + tuple(slice(0, i) for i in data_s.shape[1:])
        ds[c] = data_s
    return slices
def setup(self):
    self.builtin_types = [
        0, np.int64(0), [0, 1], True, False,
        np.array([0, 1]), np.array([True, False]),
        np.array(True), np.bool_(True), np.array(0),
        ..., slice(0, 1), None, (slice(0, 1), ..., 0),
    ]
    self.ndindex_types = [ndindex(i) for i in self.builtin_types]
def get_data(
    self,
    indexing: Tuple[BasicIndex],
) -> np.ndarray:
    info(f"Reading data in '{self.location}'")
    with h5.File(self.location, mode="r") as fp:
        index = ndx.ndindex(indexing[0]).reduce(self._shape)

        # create array as source to store quantities
        dset = np.empty(len(index), dtype=self._dtype)
        for field in self._dtype.fields:
            dset[field][:] = fp[field][index.raw]

    return dset
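# A minimal sketch (not part of the original source) showing why get_data
# can call len() on the reduced index: after .reduce(shape), a Slice has a
# definite, shape-aware length, suitable for pre-allocating the output.
import ndindex as ndx

print(len(ndx.Slice(None).reduce((100,))))       # 100
print(len(ndx.Slice(10, 50, 3).reduce((100,))))  # 14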
def __getitem__(self, index):
    idx = ndindex(index).reduce(self.shape)

    newshape = idx.newshape(self.shape)
    arr = np.full(newshape, self.fillvalue, dtype=self.dtype)

    for c in self.chunks.as_subchunks(idx, self.shape):
        if c not in self.data_dict:
            fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
            self.data_dict[c] = fill

        if self.data_dict[c].size != 0:
            arr_idx = c.as_subindex(idx)
            chunk_idx = idx.as_subindex(c)
            arr[arr_idx.raw] = self.data_dict[c][chunk_idx.raw]

    # Return arr as a scalar if it is shape () (matching h5py)
    return arr[()]
def __getitem__(self, key: Any) -> Union["Quantity", "QuantityArray"]: index = ndx.ndindex(key).expand(self.shape).raw if isinstance(index[0], int): return Quantity.from_array( self._data[index], name=self.name, label=self.label, unit=self.unit, time=self.time, ) else: return QuantityArray.from_array( self._data[index], name=self.name, label=self.label, unit=self.unit, time=self.time, )
def __setitem__(self, index, value):
    self.parent._check_committed()
    idx = ndindex(index).reduce(self.shape)

    val = np.broadcast_to(value, idx.newshape(self.shape))

    for c in self.chunks.as_subchunks(idx, self.shape):
        if c not in self.data_dict:
            # Broadcasted arrays do not actually consume memory
            fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
            self.data_dict[c] = fill

        if self.data_dict[c].size != 0:
            val_idx = c.as_subindex(idx)
            if not self.data_dict[c].flags.writeable:
                # self.data_dict[c] is a broadcasted array from above
                self.data_dict[c] = self.data_dict[c].copy()
            chunk_idx = idx.as_subindex(c)
            self.data_dict[c][chunk_idx.raw] = val[val_idx.raw]
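# A minimal sketch (not part of the original source) of the two as_subindex
# directions used in the chunked __getitem__/__setitem__ above:
# c.as_subindex(idx) says where chunk c lands inside the selection's output,
# while idx.as_subindex(c) says which elements within the chunk are selected.
from ndindex import Slice

idx = Slice(5, 15)   # selection on a length-20 axis
c = Slice(10, 20)    # one chunk of that axis

print(c.as_subindex(idx))   # Slice(5, 10, 1): output positions 5..9
print(idx.as_subindex(c))   # Slice(0, 5, 1): chunk elements 0..4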
def __getitem__(self, key: Any) -> Union["GridDataset", GridArray]: # unpack axes -> remember the first axis is responsible for time slicing time_axis, *rest_axes = self.axes # unpack indexing index = ndx.ndindex(key).expand(self.shape).raw time_slicing, *axes_slicing = index # nata does not support adding a new axis if time_slicing is np.newaxis: msg = ( "creating a new axis as time axis is not supported\n" "use `.to_dask` and `.to_numpy` and convert to a GridDataset afterwards" ) raise IndexError(msg) new_axis_excluded = tuple(ind for ind in axes_slicing if ind is not None) indices_of_new_axis = tuple(i for i, ind in enumerate(axes_slicing) if ind is None) reductions = tuple(isinstance(idx, int) for idx in new_axis_excluded) axes = [time_axis[time_slicing]] for ax, ind, red in zip(rest_axes, new_axis_excluded, reductions): if not red: axes.append(ax[time_slicing, ind]) for pos in indices_of_new_axis: axes.insert(pos + 1, Axis.from_array(da.zeros( (len(time_axis), 1)))) data = self._data[index] name = self.name label = self.label unit = self.unit if axes[0].shape: return GridDataset(data, tuple(axes), name, label, unit) else: time, *axes = axes return GridArray(data, tuple(axes), time, name, label, unit)
def get_data(
    self,
    indexing: Optional[BasicIndex] = None,
    fields: Optional[Union[str, Sequence[str]]] = None,
) -> np.ndarray:
    info(f"Reading data in '{self.location}'")
    with h5.File(self.location, mode="r") as fp:
        index = (ndx.Slice(None) if indexing is None
                 else ndx.ndindex(indexing))
        index = index.reduce((self.num_particles,))
        dtype = self.dtype if fields is None else self.dtype[fields]

        if dtype.fields:
            # create array as source to store quantities
            dset = np.empty(len(index), dtype=dtype)
            for field in dtype.fields:
                dset[field][:] = fp[field][index.raw]
        else:
            # only one field element -> string passed
            dset = fp[fields][index.raw]

    return dset
from ndindex import ndindex

# split_chunks is defined in the same module (versioned_hdf5.slicetools).


def as_subchunks(idx, shape, chunks):
    """
    Split an index `idx` on an array of shape `shape` into subchunks
    assuming a chunk size of `chunks`.

    Yields tuples `(c, index)`, where `c` is an index for the chunks
    that should be sliced, and `index` is an index into that chunk
    giving the elements of `idx` that are included in it (`c` and
    `index` are both ndindex indices).

    That is to say, for each `(c, index)` pair yielded, `a[c][index]`
    will give those elements of `a[idx]` that are part of the `c`
    chunk.

    Note that this only yields those indices that are nonempty.

    >>> from versioned_hdf5.slicetools import as_subchunks
    >>> idx = (slice(5, 15), 0)
    >>> shape = (20, 20)
    >>> chunks = (10, 10)
    >>> for c, index in as_subchunks(idx, shape, chunks):
    ...     print(c)
    ...     print('    ', index)
    Tuple(slice(0, 10, 1), slice(0, 10, 1))
         Tuple(slice(5, 10, 1), 0)
    Tuple(slice(10, 20, 1), slice(0, 10, 1))
         Tuple(slice(0, 5, 1), 0)
    """
    idx = ndindex(idx)
    for c in split_chunks(shape, chunks):
        try:
            index = idx.as_subindex(c)
        except ValueError:
            continue

        if not index.isempty(chunks):
            yield (c, index)
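# A minimal sketch (not part of the original source): ndindex itself provides
# a similar chunk iteration via ChunkSize.as_subchunks, which yields the
# chunk indices; the per-chunk index can be recovered with as_subindex.
from ndindex import ChunkSize, ndindex

idx = ndindex((slice(5, 15), 0))
for c in ChunkSize((10, 10)).as_subchunks(idx, (20, 20)):
    print(c, '->', idx.as_subindex(c))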
def __getitem__(self, key: Any) -> "GridArray": index = ndx.ndindex(key).expand(self.shape).raw new_axis_excluded = tuple(ind for ind in index if ind is not None) indices_of_new_axis = tuple(i for i, ind in enumerate(index) if ind is None) reductions = tuple(isinstance(idx, int) for idx in new_axis_excluded) axes = [] for ax, ind, red in zip(self.axes, new_axis_excluded, reductions): if not red: axes.append(ax[ind]) for pos in indices_of_new_axis: axes.insert(pos, Axis.from_array([0])) data = self._data[index] axes = tuple(axes) time = self.time name = self.name label = self.label unit = self.unit return GridArray(data, axes, time, name, label, unit)
def time_ndindex_bool(self):
    ndindex(False)

def time_ndindex_bool_(self):
    ndindex(self.bool_)

def time_ndindex_integer_array(self):
    ndindex(self.integer_array)

def time_ndindex_boolean_array(self):
    ndindex(self.boolean_array)

def time_ndindex_Ellipsis(self):
    ndindex(...)

def time_ndindex_newaxis(self):
    ndindex(None)

def time_ndindex_Slice(self):
    ndindex(self.slice)

def time_ndindex_tuple(self):
    ndindex(self.tuple)
import ndindex


def process_key(key, shape):
    key = ndindex.ndindex(key).expand(shape).raw
    mask = tuple(isinstance(k, int) for k in key)
    key = tuple(k if isinstance(k, slice) else slice(k, k + 1, None)
                for k in key)
    return key, mask
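# A minimal sketch (not part of the original source) of process_key in use,
# assuming the definition above is in scope: integer entries are recorded in
# the mask and widened to length-1 slices, so the caller can squeeze those
# axes back out after slicing.
import numpy as np

key, mask = process_key(np.s_[1, :3], (4, 5))
print(key)   # (slice(1, 2, None), slice(0, 3, 1))
print(mask)  # (True, False)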
def __getitem__(self, args, new_dtype=None):
    """
    Read a slice from the HDF5 dataset.

    Takes slices and recarray-style field names (more than one is
    allowed!) in any order.  Obeys basic NumPy rules, including
    broadcasting.
    """
    # This boilerplate code is based on h5py.Dataset.__getitem__
    args = args if isinstance(args, tuple) else (args,)

    if new_dtype is None:
        new_dtype = getattr(self._local, 'astype', None)

    # Sort field names from the rest of the args.
    names = tuple(x for x in args if isinstance(x, str))

    if names:
        # Read a subset of the fields in this structured dtype
        if len(names) == 1:
            names = names[0]  # Read with simpler dtype of this field
        args = tuple(x for x in args if not isinstance(x, str))
        return self.fields(names, _prior_dtype=new_dtype)[args]

    if new_dtype is None:
        new_dtype = self.dtype
    mtype = h5t.py_create(new_dtype)

    # === Special-case region references ====
    if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
        obj = h5r.dereference(args[0], self.id)
        if obj != self.id:
            raise ValueError("Region reference must point to this dataset")

        sid = h5r.get_region(args[0], self.id)
        mshape = guess_shape(sid)
        if mshape is None:
            # 0D with no data (NULL or deselected SCALAR)
            return Empty(new_dtype)
        out = np.empty(mshape, dtype=new_dtype)
        if out.size == 0:
            return out

        sid_out = h5s.create_simple(mshape)
        sid_out.select_all()
        self.id.read(sid_out, sid, out, mtype)
        return out

    # === END CODE FROM h5py.Dataset.__getitem__ ===

    idx = ndindex(args).reduce(self.shape)

    arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

    for c, index in as_subchunks(idx, self.shape, self.chunks):
        if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
            raw_idx = Tuple(self.id.data_dict[c],
                            *[slice(0, len(i)) for i in c.args[1:]]).raw
            a = self.id._read_chunk(raw_idx)
            self.id.data_dict[c] = a

        if self.id.data_dict[c].size != 0:
            arr_idx = c.as_subindex(idx)
            arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

    return arr
def time_ndindex_Newaxis(self):
    ndindex(self.newaxis)

def time_ndindex_ellipsis(self):
    ndindex(self.ellipsis)

def time_ndindex_Integer(self):
    ndindex(self.integer)
def __setitem__(self, args, val):
    """
    Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    self.parent._check_committed()
    # This boilerplate code is based on h5py.Dataset.__setitem__
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    vlen = h5t.check_vlen_dtype(self.dtype)
    if vlen is not None and vlen not in (bytes, str):
        try:
            val = np.asarray(val, dtype=vlen)
        except ValueError:
            try:
                val = np.array([np.array(x, dtype=vlen) for x in val],
                               dtype=self.dtype)
            except ValueError:
                pass
        if vlen == val.dtype:
            if val.ndim > 1:
                tmp = np.empty(shape=val.shape[:-1], dtype=object)
                tmp.ravel()[:] = [
                    i for i in val.reshape(
                        (np.prod(val.shape[:-1], dtype=np.ulonglong),
                         val.shape[-1]))
                ]
            else:
                tmp = np.array([None], dtype=object)
                tmp[0] = val
            val = tmp
    elif (self.dtype.kind == "O"
          or (self.dtype.kind == 'V'
              and (not isinstance(val, np.ndarray) or val.dtype.kind != 'V')
              and (self.dtype.subdtype is None))):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False

        val = np.asarray(val, dtype=dtype.base, order='C')
        if cast_compound:
            val = val.view(np.dtype([(names[0], dtype)]))
            val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
    else:
        val = np.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:
            # Last dimension has to match
            raise TypeError(
                "When writing to array types, last N dimensions have to "
                "match (got %s, but should be %s)" % (valshp, shp))
        mtype = h5t.py_create(np.dtype((val.dtype, shp)))
        # mshape = val.shape[0:len(val.shape)-len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        # mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError(
                "Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError(
                "Illegal slicing argument (fields %s not in dataset type)"
                % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            # Keep source order
            fieldnames = [x for x in val.dtype.names if x in names]
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mtype = None

    # === END CODE FROM h5py.Dataset.__setitem__ ===

    idx = ndindex(args).reduce(self.shape)

    val = np.broadcast_to(val, idx.newshape(self.shape))

    for c, index in as_subchunks(idx, self.shape, self.chunks):
        if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
            raw_idx = Tuple(self.id.data_dict[c],
                            *[slice(0, len(i)) for i in c.args[1:]]).raw
            a = self.id._read_chunk(raw_idx)
            self.id.data_dict[c] = a

        if self.id.data_dict[c].size != 0:
            val_idx = c.as_subindex(idx)
            self.id.data_dict[c][index.raw] = val[val_idx.raw]
def time_ndindex_int(self):
    ndindex(1)

def time_ndindex_slice(self):
    ndindex(slice(0, 4, 2))

def time_ndindex_int64(self):
    ndindex(self.int64)