def __getitem__(self, slice_):
    """| Get an item to be computed without iterating on the whole dataset.
    | Creates a dataset view, then a temporary dataset to apply the transform.

    Parameters
    ----------
    slice_: str, int, slice or an iterable of them
        Key to read. It is split by ``slice_split`` into a subpath and a list
        of slices; only the first slice selects which samples are stored in
        the temporary dataset. When the key holds no slice at all, every
        sample is selected.

    Returns
    -------
    The result of indexing the temporary dataset with the rewritten key.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    _subpath, slice_list = slice_split(slice_)

    # Without any slice in the key, fall back to "all samples".
    first = slice_list[0] if slice_list else slice(None, None, None)
    num, ofs = slice_extract_info(first, self.shape[0])
    ds_view = DatasetView(
        dataset=self._ds,
        num_samples=num,
        offset=ofs,
        squeeze_dim=isinstance(first, int),
    )

    path = posixpath.expanduser("~/.activeloop/tmparray")
    new_ds = self.store(path, length=num, ds=ds_view, progressbar=False)

    # The temporary dataset only holds the requested samples, so the first
    # dimension of the key has to be re-based: an int becomes 0, anything
    # else becomes "everything".
    rebased = 0 if isinstance(first, int) else slice(None, None, None)

    # Replace the first non-string entry instead of guessing its position.
    # The previous positional rewrite (index 1 if the key had more than one
    # entry, else 0) overwrote the subpath for a key like ["image"] and failed
    # on an empty key.
    # NOTE(review): assumes slice_split treats string entries as path parts and
    # keeps the remaining entries in key order - confirm against slice_split.
    for pos, part in enumerate(slice_):
        if not isinstance(part, str):
            slice_[pos] = rebased
            break
    else:
        # Key had no slice entry (subpath only, or empty): select explicitly.
        slice_.append(rebased)
    return new_ds[slice_]
def filter(self, fn):
    """| Builds a new DatasetView holding only the samples accepted by ``fn``

    Parameters
    ----------
    fn: function
        Called once per index in ``self.indexes`` with ``self[index]``.
        Indexes for which the call returns a truthy value are kept, the
        others are dropped.
    """
    kept = []
    for position in self.indexes:
        sample = self[position]
        if fn(sample):
            kept.append(position)
    return DatasetView(dataset=self, lazy=self.lazy, indexes=kept)
def __getitem__(self, slice_):
    """| Reads a slice or several slices of the dataset
    | Usage:

    >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
    >>> images = ds["image"]
    >>> return images[5].compute() # returns numpy array
    >>> images = ds["image"]
    >>> image = images[5]
    >>> return image[0:1920, 0:1080, 0:3].compute()

    | Without a subpath a DatasetView is returned (a single slice only).
    | With a subpath naming a tensor, a TensorView is returned, or its
    | computed value when the dataset is not lazy. Any other subpath is
    | forwarded to ``_get_dictionary``.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    # Case 1: no subpath, slice the dataset itself.
    if not subpath:
        if len(slice_list) > 1:
            raise ValueError(
                "Can't slice a dataset with multiple slices without subpath"
            )
        head = slice_list[0]
        num, ofs = slice_extract_info(head, self.shape[0])
        return DatasetView(
            dataset=self,
            num_samples=num,
            offset=ofs,
            squeeze_dim=isinstance(head, int),
            lazy=self.lazy,
        )

    # Case 2: subpath only, take every sample.
    if not slice_list:
        if subpath not in self._tensors.keys():
            return self._get_dictionary(subpath)
        view = TensorView(
            dataset=self,
            subpath=subpath,
            slice_=slice(0, self.shape[0]),
            lazy=self.lazy,
        )
        return view if self.lazy else view.compute()

    # Case 3: subpath plus slices.
    # The result is unused, but the call is kept so it can still reject the
    # first slice before anything else happens.
    num, ofs = slice_extract_info(slice_list[0], self.shape[0])
    if subpath in self._tensors.keys():
        view = TensorView(
            dataset=self, subpath=subpath, slice_=slice_list, lazy=self.lazy
        )
        return view if self.lazy else view.compute()
    if len(slice_list) > 1:
        raise ValueError("You can't slice a dictionary of Tensors")
    return self._get_dictionary(subpath, slice_list[0])
def filter(self, dic):
    """| Builds a new DatasetView with the samples matching every entry of ``dic``

    Parameters
    ----------
    dic: dictionary
        Key/value pairs a sample has to match to be kept. Keys are looked up
        in ``self.keys``; a leading "/" is added when missing. For nested
        schemas use the flattened representation, i.e. instead of
        {"abc": {"xyz" : 5}} use {"abc/xyz" : 5}

    Raises
    ------
    KeyError
        If a key is not part of ``self.keys``.
    LargeShapeFilteringException
        If the product of the key's ``dtype.max_shape`` is larger than 100.
    """
    surviving = self.indexes
    for name, expected in dic.items():
        k = name if name.startswith("/") else "/" + name
        if k not in self.keys:
            raise KeyError(f"Key {k} not found in the dataset")

        column = self[k]
        if _tuple_product(column.dtype.max_shape) > 100:
            raise LargeShapeFilteringException(k)

        # Narrow the candidates key by key.
        matched = []
        for position in surviving:
            if column[position].compute() == expected:
                matched.append(position)
        surviving = matched
    return DatasetView(dataset=self, lazy=self.lazy, indexes=surviving)
def __getitem__(self, slice_):
    """| Reads a slice or several slices of the dataset
    | Usage:

    >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
    >>> images = ds["image"]
    >>> return images[5].compute() # returns numpy array
    >>> images = ds["image"]
    >>> image = images[5]
    >>> return image[0:1920, 0:1080, 0:3].compute()

    | Without a subpath a DatasetView over the selected indexes is returned
    | (a single slice only). With a subpath the result is a TensorView or an
    | ObjectView, or the computed value when the dataset is not lazy; a
    | subpath matching neither is forwarded to ``_get_dictionary``.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    # Case 1: no subpath, pick samples of the dataset itself.
    if not subpath:
        if len(slice_list) > 1:
            raise ValueError(
                "Can't slice a dataset with multiple slices without key")
        return DatasetView(
            dataset=self,
            indexes=self.indexes[slice_list[0]],
            lazy=self.lazy,
        )

    # Case 2: subpath only, take every sample.
    if not slice_list:
        if subpath in self.keys:
            view = TensorView(
                dataset=self,
                subpath=subpath,
                slice_=slice(0, self._shape[0]),
                lazy=self.lazy,
            )
            return view if self.lazy else view.compute()
        # Subpath reaching below one of the keys.
        if any(subpath.startswith(known) for known in self.keys):
            view = ObjectView(
                dataset=self,
                subpath=subpath,
                lazy=self.lazy,
                slice_=[slice(0, self._shape[0])],
            )
            return view if self.lazy else view.compute()
        return self._get_dictionary(subpath)

    # Case 3: subpath plus slices.
    schema_obj = self.schema.dict_[subpath.split("/")[1]]
    # A Sequence sliced with more than one slice is not served by TensorView.
    sliced_sequence = isinstance(schema_obj, Sequence) and len(slice_list) > 1
    if subpath in self.keys and not sliced_sequence:
        view = TensorView(
            dataset=self, subpath=subpath, slice_=slice_list, lazy=self.lazy
        )
        return view if self.lazy else view.compute()
    if any(subpath.startswith(known) for known in self.keys):
        view = ObjectView(
            dataset=self,
            subpath=subpath,
            slice_=slice_list,
            lazy=self.lazy,
        )
        return view if self.lazy else view.compute()
    if len(slice_list) > 1:
        raise ValueError("You can't slice a dictionary of Tensors")
    return self._get_dictionary(subpath, slice_list[0])
def __getitem__(self, slice_):
    """| Gets a slice from an objectview

    | The key may hold a further subpath and/or slices. The first slice is
    | applied to the sample dimension unless the underlying DatasetView is
    | already squeezed; the remaining slices are applied, in order, to the
    | dimensions that are not squeezed yet.
    | Returns a new ObjectView, or its computed value when not lazy.
    | Raises IndexError("Too many indices") when slices are left over.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    # Work on copies so this view stays untouched.
    dataset = self.dataset
    nums = self.nums.copy()
    offsets = self.offsets.copy()
    squeeze_dims = self.squeeze_dims.copy()
    inner_schema_obj = self.inner_schema_obj

    if subpath:
        inner_schema_obj, nums, offsets, squeeze_dims = self.process_path(
            subpath, inner_schema_obj, nums, offsets, squeeze_dims)
    full_path = self.subpath + subpath

    # First slice: sample dimension.
    if len(slice_list) >= 1:
        head = slice_list[0]
        if not isinstance(self.dataset, DatasetView):
            num, ofs = slice_extract_info(head, self.dataset.shape[0])
            dataset = DatasetView(self.dataset, num, ofs,
                                  isinstance(head, int))
            slice_list = slice_list[1:]
        elif not self.dataset.squeeze_dim:
            dataset = self.dataset[head]
            slice_list = slice_list[1:]

    # Remaining slices: line them up with the dimensions still open.
    if len(slice_list) >= 1:
        cursor = 0
        aligned = []
        for squeezed in squeeze_dims:
            if not squeezed and cursor < len(slice_list):
                aligned.append(slice_list[cursor])
                cursor += 1
            else:
                # Squeezed dimension, or fewer slices than dimensions.
                aligned.append(None)
        if cursor < len(slice_list):
            # More slices than open dimensions.
            raise IndexError("Too many indices")

        for dim, piece in enumerate(aligned):
            if piece is None:
                continue
            num, ofs = slice_extract_info(piece, nums[dim])
            nums[dim] = num
            offsets[dim] += ofs
            squeeze_dims[dim] = num == 1

    result = ObjectView(
        dataset=dataset,
        subpath=full_path,
        slice_list=None,
        nums=nums,
        offsets=offsets,
        squeeze_dims=squeeze_dims,
        inner_schema_obj=inner_schema_obj,
        lazy=self.lazy,
        new=False,
    )
    if self.lazy:
        return result
    return result.compute()
def __init__(
    self,
    dataset,
    subpath=None,
    slice_list=None,
    nums=None,
    offsets=None,
    squeeze_dims=None,
    inner_schema_obj=None,
    lazy=True,
    new=True,
):
    """Creates an ObjectView object for dataset from a Dataset, DatasetView or
    TensorView object, or creates a different ObjectView from an existing one

    Parameters
    ----------
    These parameters are used to create a new ObjectView.

    dataset: hub.api.dataset.Dataset object
        The dataset whose ObjectView is being created, or its DatasetView
    subpath: str (optional)
        A potentially incomplete path to any element in the Dataset
    slice_list: optional
        The `slice_` of this Tensor that needs to be accessed
    lazy: bool, optional
        Setting this to False will stop lazy computation and will allow
        items to be accessed without .compute()

    These parameters are also needed to create an ObjectView from an
    existing one.

    nums: List[int], optional
        Number of elements in each dimension of the ObjectView to be created.
        Defaults to a new empty list.
    offsets: List[int], optional
        Starting element in each dimension of the ObjectView to be created.
        Defaults to a new empty list.
    squeeze_dims: List[bool], optional
        Whether each dimension can be squeezed or not.
        Defaults to a new empty list.
    inner_schema_obj: Child of hub.schema.Tensor or hub.schema.SchemaDict
        The deepest element in the schema upto which the previous ObjectView
        had been processed
    new: bool
        Whether to create a new ObjectView object from a Dataset, DatasetView
        or TensorView or create a different ObjectView from an existing one

    Raises
    ------
    IndexError
        If `slice_list` holds more slices (after the first one) than there
        are entries in `nums`.
    """
    self.dataset = dataset
    self.schema = (
        dataset.dataset.schema.dict_
        if isinstance(dataset, DatasetView)
        else dataset.schema.dict_
    )
    self.subpath = subpath
    # Use None sentinels instead of `[]` defaults: a list default is created
    # once at definition time and would be shared by every instance that
    # relies on it. Lists passed by the caller are stored as given.
    self.nums = [] if nums is None else nums
    self.offsets = [] if offsets is None else offsets
    self.squeeze_dims = [] if squeeze_dims is None else squeeze_dims
    self.inner_schema_obj = inner_schema_obj
    self.lazy = lazy

    if not new:
        # Derived from an existing ObjectView: state is taken as passed in.
        return

    # Creating new obj
    if self.subpath:
        (
            self.inner_schema_obj,
            self.nums,
            self.offsets,
            self.squeeze_dims,
        ) = self.process_path(
            self.subpath,
            self.inner_schema_obj,
            self.nums.copy(),
            self.offsets.copy(),
            self.squeeze_dims.copy(),
        )

    if not slice_list:
        return

    # First slice selects samples: wrap the dataset in a DatasetView.
    num, ofs = slice_extract_info(slice_list[0], dataset.shape[0])
    self.dataset = DatasetView(dataset, num, ofs,
                               isinstance(slice_list[0], int))

    # Remaining slices narrow the inner dimensions one by one.
    remaining = slice_list[1:]
    if len(remaining) > len(self.nums):
        raise IndexError("Too many indices")
    for i, it in enumerate(remaining):
        num, ofs = slice_extract_info(it, self.nums[i])
        self.nums[i] = num
        self.offsets[i] += ofs
        self.squeeze_dims[i] = num == 1