Example #1
    def _expand_key(
        key: Any,
        shape: Tuple[int, int],
    ) -> Tuple[Union[int, slice], Union[int, slice], Union[str, List[str]]]:
        # convert to tuple
        key = np.index_exp[key]

        if len(key) > 3:
            raise IndexError("too many indices provided")

        if len(key) == 3:
            expanded_key = ndx.ndindex(key[:2]).expand(shape).raw + (key[2], )
        else:
            expanded_key = ndx.ndindex(key).expand(shape).raw + ([], )

        return expanded_key
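
A minimal sketch of how this helper behaves, assuming `_expand_key` and its imports (`numpy as np`, `ndindex as ndx`) are in scope; the trailing field-name entry is passed through while the positional part is expanded against the shape:

import numpy as np
import ndindex as ndx

# hypothetical calls against a (4, 5)-shaped array
print(_expand_key((0, ..., "x"), (4, 5)))   # (0, slice(0, 5, 1), 'x')
print(_expand_key(slice(None), (4, 5)))     # (slice(0, 4, 1), slice(0, 5, 1), [])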
Example #2

    def __setitem__(self, index, value):
        if (isinstance(self.dataset, InMemoryDataset)
                and ndindex(index).expand(self.shape) == Tuple().expand(self.shape)):
            new_dataset = InMemoryArrayDataset(self.name,
                                               np.broadcast_to(value, self.shape).astype(self.dtype),
                                               self.parent,
                                               fillvalue=self.fillvalue,
                                               chunks=self.chunks)
            new_dataset.attrs = self.dataset.attrs
            self.dataset = new_dataset
            return
        self.dataset.__setitem__(index, value)
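
The `Tuple().expand(shape)` comparison is how the code detects a whole-dataset overwrite: any index that selects everything expands to the same canonical Tuple. A small self-contained check (ndindex only, none of the versioned-hdf5 types):

from ndindex import Tuple, ndindex

shape = (4, 5)
full = Tuple().expand(shape)                               # canonical "select everything" index
print(ndindex(slice(None)).expand(shape) == full)          # True: [:] covers the dataset
print(ndindex((..., slice(None))).expand(shape) == full)   # True: so does [..., :]
print(ndindex(slice(0, 2)).expand(shape) == full)          # False: partial selection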
Example #3
def write_dataset_chunks(f, name, data_dict):
    """
    data_dict should be a dictionary mapping chunk_size index to either an
    array for that chunk, or a slice into the raw data for that chunk

    """
    if name not in f['_version_data']:
        raise NotImplementedError(
            "Use write_dataset() if the dataset does not yet exist")

    ds = f['_version_data'][name]['raw_data']
    chunks = tuple(ds.attrs['chunks'])
    chunk_size = chunks[0]

    shape = tuple(
        max(c.args[i].stop for c in data_dict) for i in range(len(chunks)))
    all_chunks = list(split_chunks(shape, chunks))
    for c in all_chunks:
        if c not in data_dict:
            raise ValueError(f"data_dict does not include all chunks ({c})")
    for c in data_dict:
        if c not in all_chunks:
            raise ValueError(f"data_dict contains extra chunks ({c})")

    hashtable = Hashtable(f, name)
    slices = {i: None for i in data_dict}
    data_to_write = {}
    for chunk, data_s in data_dict.items():
        if not isinstance(
                data_s,
            (slice, tuple, Tuple, Slice)) and data_s.dtype != ds.dtype:
            raise ValueError(
                f"dtypes do not match ({data_s.dtype} != {ds.dtype})")

        idx = hashtable.largest_index
        if isinstance(data_s, (slice, tuple, Tuple, Slice)):
            slices[chunk] = ndindex(data_s)
        else:
            raw_slice = Slice(idx * chunk_size,
                              idx * chunk_size + data_s.shape[0])
            data_hash = hashtable.hash(data_s)
            raw_slice2 = hashtable.setdefault(data_hash, raw_slice)
            if raw_slice2 == raw_slice:
                data_to_write[raw_slice] = data_s
            slices[chunk] = raw_slice2

    assert None not in slices.values()
    old_shape = ds.shape
    ds.resize((old_shape[0] + len(data_to_write) * chunk_size, ) + chunks[1:])
    for raw_slice, data_s in data_to_write.items():
        c = (raw_slice.raw, ) + tuple(slice(0, i) for i in data_s.shape[1:])
        ds[c] = data_s
    return slices
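
A rough sketch of a valid `data_dict` for a small dataset, using ndindex's `ChunkSize.indices` as a stand-in for the project's `split_chunks` (assumed equivalent here): one entry per chunk of the virtual shape, each mapping to an array of that chunk's shape:

import numpy as np
from ndindex import ChunkSize

shape, chunks = (20, 10), (10, 10)
# every chunk must be present, either as an array or as a slice into
# raw data already stored in the file (deduplicated via the hashtable)
data_dict = {c: np.ones(c.newshape(shape)) for c in ChunkSize(chunks).indices(shape)}
print(list(data_dict))   # two Tuple chunk indices along axis 0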
Example #4

    def setup(self):
        self.builtin_types = [
            0,
            np.int64(0),
            [0, 1],
            True,
            False,
            np.array([0, 1]),
            np.array([True, False]),
            np.array(True),
            np.bool_(True),
            np.array(0),
            ...,
            slice(0, 1),
            None,
            (slice(0, 1), ..., 0),
        ]
        self.ndindex_types = [ndindex(i) for i in self.builtin_types]
Example #5
    def get_data(
        self,
        indexing: Tuple[BasicIndex],
    ) -> np.ndarray:
        info(f"Reading data in '{self.location}'")
        with h5.File(self.location, mode="r") as fp:
            index = ndx.ndindex(indexing[0]).reduce(self._shape)
            # create array as source to store quantities
            dset = np.empty(len(index), dtype=self._dtype)

            for field in self._dtype.fields:
                dset[field][:] = fp[field][index.raw]
        return dset
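
`reduce` with a shape makes the index bounds concrete, which is what makes `len(index)` well defined before any data is read. A standalone illustration:

import ndindex as ndx

index = ndx.ndindex(slice(None, None, 2)).reduce((10,))
print(len(index))   # 5: number of selected rows, known without touching the file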
Example #6
    def __getitem__(self, index):
        idx = ndindex(index).reduce(self.shape)

        newshape = idx.newshape(self.shape)
        arr = np.full(newshape, self.fillvalue, dtype=self.dtype)

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                chunk_idx = idx.as_subindex(c)
                arr[arr_idx.raw] = self.data_dict[c][chunk_idx.raw]

        # Return arr as a scalar if it is shape () (matching h5py)
        return arr[()]
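
The two `as_subindex` calls map between coordinate systems: `c.as_subindex(idx)` gives where the chunk lands inside the result array, and `idx.as_subindex(c)` gives which elements of the chunk are selected. In isolation:

from ndindex import ndindex

idx = ndindex(slice(5, 15))   # selection on the full array
c = ndindex(slice(10, 20))    # one chunk of the array
print(c.as_subindex(idx))     # Slice(5, 10, 1): result positions covered by this chunk
print(idx.as_subindex(c))     # Slice(0, 5, 1): chunk elements that are selected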
Example #7
    def __getitem__(self, key: Any) -> Union["Quantity", "QuantityArray"]:
        index = ndx.ndindex(key).expand(self.shape).raw

        if isinstance(index[0], int):
            return Quantity.from_array(
                self._data[index],
                name=self.name,
                label=self.label,
                unit=self.unit,
                time=self.time,
            )

        else:
            return QuantityArray.from_array(
                self._data[index],
                name=self.name,
                label=self.label,
                unit=self.unit,
                time=self.time,
            )
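
After `expand`, the key has exactly one entry per axis, so inspecting `index[0]` is enough to tell a scalar selection from a range. A standalone look at what `expand` produces:

import ndindex as ndx

shape = (4, 5)
print(ndx.ndindex(2).expand(shape).raw)           # (2, slice(0, 5, 1)): axis 0 reduced to a scalar
print(ndx.ndindex(slice(1, 3)).expand(shape).raw) # (slice(1, 3, 1), slice(0, 5, 1)): axis 0 kept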
Example #8
    def __setitem__(self, index, value):
        self.parent._check_committed()

        idx = ndindex(index).reduce(self.shape)

        val = np.broadcast_to(value, idx.newshape(self.shape))

        for c in self.chunks.as_subchunks(idx, self.shape):
            if c not in self.data_dict:
                # Broadcasted arrays do not actually consume memory
                fill = np.broadcast_to(self.fillvalue, c.newshape(self.shape))
                self.data_dict[c] = fill

            if self.data_dict[c].size != 0:
                val_idx = c.as_subindex(idx)
                if not self.data_dict[c].flags.writeable:
                    # self.data_dict[c] is a broadcasted array from above
                    self.data_dict[c] = self.data_dict[c].copy()
                chunk_idx = idx.as_subindex(c)
                self.data_dict[c][chunk_idx.raw] = val[val_idx.raw]
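
The `flags.writeable` check works because `np.broadcast_to` returns a read-only view that costs no memory until a write forces a copy. The same copy-on-write pattern in isolation:

import numpy as np

fill = np.broadcast_to(0.0, (3, 3))
print(fill.flags.writeable)   # False: a broadcast view cannot be assigned into
chunk = fill.copy()           # materialize only when the chunk is actually written
chunk[0, 0] = 1.0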
Example #9
    def __getitem__(self, key: Any) -> Union["GridDataset", GridArray]:
        # unpack axes -> remember the first axis is responsible for time slicing
        time_axis, *rest_axes = self.axes

        # unpack indexing
        index = ndx.ndindex(key).expand(self.shape).raw
        time_slicing, *axes_slicing = index

        # nata does not support adding a new axis
        if time_slicing is np.newaxis:
            msg = (
                "creating a new axis as time axis is not supported\n"
                "use `.to_dask` and `.to_numpy` and convert to a GridDataset afterwards"
            )
            raise IndexError(msg)

        new_axis_excluded = tuple(ind for ind in axes_slicing
                                  if ind is not None)
        indices_of_new_axis = tuple(i for i, ind in enumerate(axes_slicing)
                                    if ind is None)
        reductions = tuple(isinstance(idx, int) for idx in new_axis_excluded)

        axes = [time_axis[time_slicing]]
        for ax, ind, red in zip(rest_axes, new_axis_excluded, reductions):
            if not red:
                axes.append(ax[time_slicing, ind])

        for pos in indices_of_new_axis:
            axes.insert(pos + 1, Axis.from_array(da.zeros(
                (len(time_axis), 1))))

        data = self._data[index]
        name = self.name
        label = self.label
        unit = self.unit

        if axes[0].shape:
            return GridDataset(data, tuple(axes), name, label, unit)
        else:
            time, *axes = axes
            return GridArray(data, tuple(axes), time, name, label, unit)
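
`expand` keeps `np.newaxis` entries as `None` in the raw tuple, which is why the code above can locate them by position afterwards. A standalone check (no nata types involved):

import numpy as np
import ndindex as ndx

shape = (3, 4, 5)
raw = ndx.ndindex((0, np.newaxis)).expand(shape).raw
print(raw)   # (0, None, slice(0, 4, 1), slice(0, 5, 1))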
Example #10
File: hdf5.py  Project: fyli16/nata
    def get_data(
        self,
        indexing: Optional[BasicIndex] = None,
        fields: Optional[Union[str, Sequence[str]]] = None,
    ) -> np.ndarray:
        info(f"Reading data in '{self.location}'")
        with h5.File(self.location, mode="r") as fp:
            index = (ndx.Slice(None)
                     if indexing is None else ndx.ndindex(indexing))
            index = index.reduce((self.num_particles, ))
            dtype = self.dtype if fields is None else self.dtype[fields]

            if dtype.fields:
                # create array as source to store quantities
                dset = np.empty(len(index), dtype=dtype)

                for field in dtype.fields:
                    dset[field][:] = fp[field][index.raw]
            else:
                # only one field element -> string passed
                dset = fp[fields][index.raw]

        return dset
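
The `dtype.fields` branch distinguishes a structured subset from a single plain field: NumPy's multi-field indexing keeps the structure, while a single name drops it, which is what routes a string `fields` argument to the else branch. For example:

import numpy as np

dtype = np.dtype([("x", "f8"), ("y", "f8"), ("z", "f8")])
print(dtype[["x", "y"]].fields is not None)  # True: a list of names keeps a structured dtype
print(dtype["x"].fields is None)             # True: one name gives a plain dtype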
Example #11
def as_subchunks(idx, shape, chunks):
    """
    Split an index `idx` on an array of shape `shape` into subchunks assuming
    a chunk size of `chunks`.

    Yields tuples `(c, index)`, where `c` is an index for the chunks that
    should be sliced, and `index` is an index into that chunk giving the
    elements of `idx` that are included in it (`c` and `index` are both
    ndindex indices).

    That is to say, for each `(c, index)` pair yielded, `a[c][index]` will
    give those elements of `a[idx]` that are part of the `c` chunk.

    Note that this only yields those indices that are nonempty.

    >>> from versioned_hdf5.slicetools import as_subchunks
    >>> idx = (slice(5, 15), 0)
    >>> shape = (20, 20)
    >>> chunks = (10, 10)
    >>> for c, index in as_subchunks(idx, shape, chunks):
    ...     print(c)
    ...     print('    ', index)
    Tuple(slice(0, 10, 1), slice(0, 10, 1))
        Tuple(slice(5, 10, 1), 0)
    Tuple(slice(10, 20, 1), slice(0, 10, 1))
        Tuple(slice(0, 5, 1), 0)

    """
    idx = ndindex(idx)
    for c in split_chunks(shape, chunks):
        try:
            index = idx.as_subindex(c)
        except ValueError:
            continue

        if not index.isempty(chunks):
            yield (c, index)
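
Current ndindex releases ship this logic as `ChunkSize.as_subchunks`, which yields only the chunk index; the inner index from the docstring can be recovered with `as_subindex`. A rough equivalent of the doctest above, assuming that API:

from ndindex import ChunkSize, ndindex

idx = ndindex((slice(5, 15), 0))
for c in ChunkSize((10, 10)).as_subchunks(idx, (20, 20)):
    print(c, '->', idx.as_subindex(c))
# Tuple(slice(0, 10, 1), slice(0, 10, 1)) -> Tuple(slice(5, 10, 1), 0)
# Tuple(slice(10, 20, 1), slice(0, 10, 1)) -> Tuple(slice(0, 5, 1), 0)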
Example #12
    def __getitem__(self, key: Any) -> "GridArray":
        index = ndx.ndindex(key).expand(self.shape).raw

        new_axis_excluded = tuple(ind for ind in index if ind is not None)
        indices_of_new_axis = tuple(i for i, ind in enumerate(index)
                                    if ind is None)
        reductions = tuple(isinstance(idx, int) for idx in new_axis_excluded)

        axes = []
        for ax, ind, red in zip(self.axes, new_axis_excluded, reductions):
            if not red:
                axes.append(ax[ind])

        for pos in indices_of_new_axis:
            axes.insert(pos, Axis.from_array([0]))

        data = self._data[index]
        axes = tuple(axes)
        time = self.time
        name = self.name
        label = self.label
        unit = self.unit

        return GridArray(data, axes, time, name, label, unit)
Example #13

    def time_ndindex_bool(self):
        ndindex(False)
Example #14

    def time_ndindex_bool_(self):
        ndindex(self.bool_)
Example #15

    def time_ndindex_integer_array(self):
        ndindex(self.integer_array)
Example #16

    def time_ndindex_boolean_array(self):
        ndindex(self.boolean_array)
Example #17

    def time_ndindex_Ellipsis(self):
        ndindex(...)
Example #18

    def time_ndindex_newaxis(self):
        ndindex(None)
Example #19

    def time_ndindex_Slice(self):
        ndindex(self.slice)
Example #20

    def time_ndindex_tuple(self):
        ndindex(self.tuple)
Example #21
def process_key(key, shape):
    key = ndindex.ndindex(key).expand(shape).raw
    mask = tuple(isinstance(k, int) for k in key)
    key = tuple(k if isinstance(k, slice) else slice(k, k+1, None) for k in key)
    return key, mask
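
Assuming `process_key` and `import ndindex` are in scope, a quick check of the two outputs: integers are widened to length-1 slices and the mask records where they were:

key, mask = process_key((2, slice(None)), (4, 5))
print(key)    # (slice(2, 3, None), slice(0, 5, 1))
print(mask)   # (True, False)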
Example #22
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        """
        # This boilerplate code is based on h5py.Dataset.__getitem__
        args = args if isinstance(args, tuple) else (args, )

        if new_dtype is None:
            new_dtype = getattr(self._local, 'astype', None)

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = np.empty(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === END CODE FROM h5py.Dataset.__getitem__ ===

        idx = ndindex(args).reduce(self.shape)

        arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

        for c, index in as_subchunks(idx, self.shape, self.chunks):
            if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
                raw_idx = Tuple(self.id.data_dict[c],
                                *[slice(0, len(i)) for i in c.args[1:]]).raw
                a = self.id._read_chunk(raw_idx)
                self.id.data_dict[c] = a

            if self.id.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

        return arr
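
The field-name boilerplate at the top simply partitions the key into string and positional parts before ndindex ever sees it. In isolation:

args = (slice(0, 3), "x")
names = tuple(x for x in args if isinstance(x, str))
args = tuple(x for x in args if not isinstance(x, str))
print(names, args)   # ('x',) (slice(0, 3, None),)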
Example #23

    def time_ndindex_Newaxis(self):
        ndindex(self.newaxis)
Example #24

    def time_ndindex_ellipsis(self):
        ndindex(self.ellipsis)
Example #25

    def time_ndindex_Integer(self):
        ndindex(self.integer)
Example #26
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        self.parent._check_committed()
        # This boilerplate code is based on h5py.Dataset.__setitem__
        args = args if isinstance(args, tuple) else (args, )

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = np.asarray(val, dtype=vlen)
            except ValueError:
                try:
                    val = np.array([np.array(x, dtype=vlen) for x in val],
                                   dtype=self.dtype)
                except ValueError:
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = np.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [
                        i for i in val.reshape((
                            np.prod(val.shape[:-1], dtype=np.ulonglong),
                            val.shape[-1]))
                    ]
                else:
                    tmp = np.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, np.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" %
                                     names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = np.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(np.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) -
                                            len(dtype.shape)])
        else:
            val = np.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError(
                    "When writing to array types, last N dimensions have to match (got %s, but should be %s)"
                    % (
                        valshp,
                        shp,
                    ))
            mtype = h5t.py_create(np.dtype((val.dtype, shp)))
            # mshape = val.shape[0:len(val.shape)-len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            # mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError(
                    "Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError(
                    "Illegal slicing argument (fields %s not in dataset type)"
                    % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names
                              if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mtype = None

        # === END CODE FROM h5py.Dataset.__setitem__ ===

        idx = ndindex(args).reduce(self.shape)

        val = np.broadcast_to(val, idx.newshape(self.shape))

        for c, index in as_subchunks(idx, self.shape, self.chunks):
            if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
                raw_idx = Tuple(self.id.data_dict[c],
                                *[slice(0, len(i)) for i in c.args[1:]]).raw
                a = self.id._read_chunk(raw_idx)
                self.id.data_dict[c] = a

            if self.id.data_dict[c].size != 0:
                val_idx = c.as_subindex(idx)
                self.id.data_dict[c][index.raw] = val[val_idx.raw]
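
The `check_vlen_dtype` guard near the top only fires for variable-length element types other than strings. A minimal sketch of what it returns, using h5py's public helpers:

import numpy as np
import h5py

vlen = h5py.vlen_dtype(np.float64)
print(h5py.check_vlen_dtype(vlen))                 # float64: variable-length base type
print(h5py.check_vlen_dtype(np.dtype("f8")))       # None: not a vlen dtype
print(h5py.check_vlen_dtype(h5py.string_dtype()))  # <class 'str'>: excluded by the (bytes, str) test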
Example #27

    def time_ndindex_int(self):
        ndindex(1)
Example #28

    def time_ndindex_slice(self):
        ndindex(slice(0, 4, 2))
Example #29

    def time_ndindex_int64(self):
        ndindex(self.int64)