def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")

    >>> images = ds["image"]
    >>> image = images[5]
    >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
    """
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    elif subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in the dataset")

    if not slice_list:
        self._tensors[subpath][:] = assign_value
    else:
        self._tensors[subpath][slice_list] = assign_value

def upload(self, results, ds: Dataset, token: dict, progressbar: bool = True):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload.
    If a tensor is dynamic, it is still uploaded element by element:
    dynamicness is disabled for the upload and then enabled back.

    Parameters
    ----------
    ds: hub.Dataset
        Dataset object that should be written to
    results:
        Output of transform function
    progressbar: bool

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    for key, value in results.items():
        chunk = ds[key].chunksize[0]
        chunk = 1 if chunk == 0 else chunk

        value = get_value(value)
        value = str_to_int(value, ds.dataset.tokenizer)

        num_chunks = math.ceil(len(value) / (chunk * self.workers))
        length = num_chunks * chunk if self.workers != 1 else len(value)
        batched_values = batchify(value, length)

        def upload_chunk(i_batch):
            i, batch = i_batch
            length = len(batch)
            slice_ = slice(i * length, (i + 1) * length)
            ds[key, slice_] = batch

        index_batched_values = list(
            zip(list(range(len(batched_values))), batched_values)
        )

        # Disable dynamic arrays
        ds.dataset._tensors[f"/{key}"].disable_dynamicness()
        list(self.map(upload_chunk, index_batched_values))

        # here ds.indexes will always be a contiguous list as obtained after slicing
        offset = ds.indexes[0]

        # Enable and rewrite shapes
        if ds.dataset._tensors[f"/{key}"].is_dynamic:
            ds.dataset._tensors[f"/{key}"].enable_dynamicness()
            ds.dataset._tensors[f"/{key}"].set_shape(
                [slice(offset, offset + len(value))], value
            )

    ds.commit()
    return ds

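A minimal, self-contained sketch of the chunk-aligned batching that drives the upload above, with local stand-ins for batchify and the write step (the real implementation writes each batch into the dataset via ds[key, slice_] = batch):

import math

def batchify(values, batch_size):
    # split `values` into consecutive batches of at most `batch_size` items
    return [values[i : i + batch_size] for i in range(0, len(values), batch_size)]

def upload_batches(values, chunk, workers, write):
    # mirror the batch/slice arithmetic above: batch i covers rows
    # [i * length, i * length + len(batch)) of the destination tensor
    chunk = 1 if chunk == 0 else chunk
    num_chunks = math.ceil(len(values) / (chunk * workers))
    length = num_chunks * chunk if workers != 1 else len(values)
    for i, batch in enumerate(batchify(values, length)):
        write(slice(i * length, i * length + len(batch)), batch)

upload_batches(list(range(10)), chunk=4, workers=2,
               write=lambda sl, batch: print(sl, batch))
# slice(0, 8, None) [0, 1, 2, 3, 4, 5, 6, 7]
# slice(8, 10, None) [8, 9]
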
def __setitem__(self, slice_, value):
    """| Sets a slice of the objectview with a value"""
    if isinstance(slice_, slice) and (slice_.start is None and slice_.stop is None):
        objview = self
    else:
        objview = self.__getitem__(slice_)
    assign_value = get_value(value)

    if not isinstance(objview.dataset, DatasetView):
        # subpath present but no slice done
        assign_value = str_to_int(assign_value, objview.dataset.tokenizer)
        if len(objview.subpath.split("/")[1:]) > 1:
            raise IndexError("Can only go deeper on single datapoint")
    if not objview.dataset.squeeze_dim:
        # assign a combined tensor for multiple datapoints
        # only possible if the field has a fixed size
        assign_value = str_to_int(assign_value, objview.dataset.dataset.tokenizer)
        paths = objview.subpath.split("/")[1:]
        if len(paths) > 1:
            raise IndexError("Can only go deeper on single datapoint")
    else:
        # single datapoint
        def assign(paths, value):
            # helper function for recursive assign
            if len(paths) > 0:
                path = paths.pop(0)
                value[path] = assign(paths, value[path])
                return value
            try:
                value[tuple(slice_)] = assign_value
            except TypeError:
                value = assign_value
            return value

        assign_value = str_to_int(assign_value, objview.dataset.dataset.tokenizer)
        paths = objview.subpath.split("/")[1:]
        schema = objview.schema[paths[0]]
        slice_ = [
            of if sq else slice(of, of + num) if num else slice(None, None)
            for num, of, sq in zip(
                objview.nums, objview.offsets, objview.squeeze_dims
            )
        ]
        if isinstance(schema, Sequence):
            if isinstance(schema.dtype, SchemaDict):
                # if sequence of dict, have to fetch everything
                value = objview.dataset[paths[0]].compute()
                value = assign(paths[1:], value)
                objview.dataset[paths[0]] = value
            else:
                # sequence of tensors
                value = objview.dataset[paths[0]].compute()
                value[tuple(slice_)] = assign_value
                objview.dataset[paths[0]] = value

def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")  # sets the 8th image
    """
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    slice_list = [0] + slice_list if self.squeeze_dim else slice_list

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    elif not slice_list:
        slice_ = (
            self.offset
            # if self.num_samples == 1
            if self.squeeze_dim
            else slice(self.offset, self.offset + self.num_samples)
        )
        if subpath in self.dataset._tensors.keys():
            self.dataset._tensors[subpath][slice_] = assign_value  # Add path check
            return
        for key in self.dataset._tensors.keys():
            if subpath.startswith(key):
                ObjectView(
                    dataset=self.dataset, subpath=subpath, slice_list=[slice_]
                )[:] = assign_value
        # raise error
    else:
        num, ofs = (
            slice_extract_info(slice_list[0], self.num_samples)
            if isinstance(slice_list[0], slice)
            else (1, slice_list[0])
        )
        slice_list[0] = (
            slice(ofs + self.offset, ofs + self.offset + num)
            if isinstance(slice_list[0], slice)
            else ofs + self.offset
        )
        # self.dataset._tensors[subpath][slice_list] = assign_value
        if subpath in self.dataset._tensors.keys():
            self.dataset._tensors[subpath][slice_list] = assign_value  # Add path check
            return
        for key in self.dataset._tensors.keys():
            if subpath.startswith(key):
                ObjectView(
                    dataset=self.dataset, subpath=subpath, slice_list=slice_list
                )[:] = assign_value

def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")  # sets the 8th image
    """
    self.dataset._auto_checkout()
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    slice_list = [0] + slice_list if isinstance(self.indexes, int) else slice_list

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without key")
    elif subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in dataset")

    if not slice_list:
        slice_ = (
            slice(self.indexes[0], self.indexes[-1] + 1)
            if self.is_contiguous
            else self.indexes
        )
        if not isinstance(slice_, list):
            self.dataset._tensors[subpath][slice_] = assign_value
        else:
            for i, index in enumerate(slice_):
                self.dataset._tensors[subpath][index] = assign_value[i]
    else:
        if isinstance(self.indexes, list):
            indexes = self.indexes[slice_list[0]]
            if self.is_contiguous and isinstance(indexes, list) and indexes:
                slice_list[0] = slice(indexes[0], indexes[-1] + 1)
            else:
                slice_list[0] = indexes
        else:
            slice_list[0] = self.indexes

        if not isinstance(slice_list[0], list):
            self.dataset._tensors[subpath][slice_list] = assign_value
        else:
            for i, index in enumerate(slice_list[0]):
                current_slice = [index] + slice_list[1:]
                self.dataset._tensors[subpath][current_slice] = assign_value[i]

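The contiguous-vs-fancy index split above can be seen in isolation with plain NumPy; write_rows below is a local stand-in, not a hub helper:

import numpy as np

def write_rows(tensor, indexes, values):
    # contiguous index lists collapse to a single slice write,
    # otherwise each selected row is written on its own
    is_contiguous = indexes == list(range(indexes[0], indexes[-1] + 1))
    if is_contiguous:
        tensor[indexes[0] : indexes[-1] + 1] = values
    else:
        for i, index in enumerate(indexes):
            tensor[index] = values[i]

t = np.zeros(8)
write_rows(t, [1, 2, 3], np.ones(3))       # one slice write
write_rows(t, [0, 4, 7], np.full(3, 2.0))  # per-row writes
print(t)  # [2. 1. 1. 1. 2. 0. 0. 2.]
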
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> images_tensorview = ds["image"]
    >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")  # sets 7th image
    """
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    slice_ = self.slice_fill(slice_)
    subpath, slice_list = slice_split(slice_)
    if subpath:
        raise ValueError("Can't setitem of TensorView with subpath")

    new_nums = self.nums.copy()
    new_offsets = self.offsets.copy()
    if isinstance(self.indexes, list):
        new_indexes = self.indexes[slice_list[0]]
        if self.is_contiguous and new_indexes:
            new_indexes = slice(new_indexes[0], new_indexes[-1] + 1)
    elif isinstance(self.indexes, int):
        new_indexes = self.indexes
    else:
        ofs = self.indexes.start or 0
        num = self.indexes.stop - ofs if self.indexes.stop else None
        new_indexes = self._combine(slice_list[0], num, ofs)
    slice_list[0] = new_indexes

    if len(new_nums) < len(slice_list):
        new_nums.extend([None] * (len(slice_list) - len(new_nums)))
        new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
    for i in range(1, len(slice_list)):
        slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
    for i in range(len(slice_list), len(new_nums)):
        cur_slice = (
            slice(new_offsets[i], new_offsets[i] + new_nums[i])
            if not self.squeeze_dims[i]
            else new_offsets[i]
        )
        slice_list.append(cur_slice)

    if isinstance(slice_list[0], (int, slice)):
        self.dataset._tensors[self.subpath][slice_list] = assign_value
    else:
        for i, index in enumerate(slice_list[0]):
            current_slice = [index] + slice_list[1:]
            self.dataset._tensors[self.subpath][current_slice] = assign_value[i]

def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")

    >>> images = ds["image"]
    >>> image = images[5]
    >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
    """
    if "r" in self._mode:
        raise ReadModeException("__setitem__")
    self._auto_checkout()

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    elif subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in the dataset")

    assign_value = get_value(value)

    schema_dict = self.schema
    if subpath[1:] in schema_dict.dict_.keys():
        schema_key = schema_dict.dict_.get(subpath[1:], None)
    else:
        for schema_key in subpath[1:].split("/"):
            schema_dict = schema_dict.dict_.get(schema_key, None)
            if not isinstance(schema_dict, SchemaDict):
                schema_key = schema_dict
    if isinstance(schema_key, ClassLabel):
        assign_value = check_class_label(assign_value, schema_key)
    if isinstance(schema_key, (Text, bytes)) or (
        isinstance(assign_value, Iterable)
        and any(isinstance(val, str) for val in assign_value)
    ):
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.tokenizer)

    if not slice_list:
        self._tensors[subpath][:] = assign_value
    else:
        self._tensors[subpath][slice_list] = assign_value

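The nested-schema lookup above (resolving a subpath such as "/a/b" to a leaf schema like ClassLabel or Text before coercing the value) can be sketched with a plain dict standing in for SchemaDict; resolve_schema is illustrative only:

def resolve_schema(schema_dict, subpath):
    # walk a nested dict along "a/b/c" style subpaths until a leaf is reached
    schema_key = None
    if subpath[1:] in schema_dict:
        return schema_dict[subpath[1:]]
    for part in subpath[1:].split("/"):
        schema_dict = schema_dict.get(part) if isinstance(schema_dict, dict) else None
        if not isinstance(schema_dict, dict):
            schema_key = schema_dict
    return schema_key

schema = {"image": "Image", "meta": {"label": "ClassLabel"}}
print(resolve_schema(schema, "/meta/label"))  # -> ClassLabel
print(resolve_schema(schema, "/image"))       # -> Image
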
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> images_tensorview = ds["image"]
    >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")  # sets 7th image
    """
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    slice_ = self.slice_fill(slice_)
    subpath, slice_list = slice_split(slice_)

    new_nums = self.nums.copy()
    new_offsets = self.offsets.copy()
    if len(new_nums) < len(slice_list):
        new_nums.extend([None] * (len(slice_list) - len(new_nums)))
        new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
    for i in range(len(slice_list)):
        slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
    for i in range(len(slice_list), len(new_nums)):
        cur_slice = (
            slice(new_offsets[i], new_offsets[i] + new_nums[i])
            if new_nums[i] > 1
            else new_offsets[i]
        )
        slice_list.append(cur_slice)

    if subpath or (
        len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence)
    ):
        ObjectView(
            dataset=self.dataset,
            subpath=self.subpath + subpath,
            slice_list=slice_list,
        )[:] = assign_value
    else:
        self.dataset._tensors[self.subpath][slice_list] = assign_value

def upload(
    self,
    results,
    url: str,
    token: dict,
    progressbar: bool = True,
    public: bool = True,
):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload.
    If a tensor is dynamic, it is still uploaded element by element.

    Parameters
    ----------
    url: str
        URL of the dataset that should be written to
    results:
        Output of transform function
    progressbar: bool
    public: bool, optional
        only applicable if using hub storage, ignored otherwise
        setting this to False allows only the user who created it to access the dataset and
        the dataset won't be visible in the visualizer to the public

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    if len(list(results.values())) == 0:
        shape = (0,)
    else:
        shape = (len(list(results.values())[0]),)

    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        cache=False,
        public=public,
    )

    tasks = []
    for key, value in results.items():
        length = ds[key].chunksize[0]
        value = get_value(value)
        value = str_to_int(value, ds.tokenizer)

        batched_values = batchify(value, length)
        chunk_id = list(range(len(batched_values)))
        index_batched_values = list(zip(chunk_id, batched_values))

        ds._tensors[f"/{key}"].disable_dynamicness()
        results = [
            self.upload_chunk.remote(el, key=key, ds=ds)
            for el in index_batched_values
        ]
        tasks.extend(results)

    results = ray.get(tasks)
    self.set_dynamic_shapes(results, ds)
    ds.commit()
    return ds

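A minimal sketch of the ray fan-out used above: each (chunk_id, batch) pair is handed to a remote task, and the per-chunk results are gathered with ray.get before shapes are finalized. The remote function here is illustrative only, not hub's upload_chunk:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def upload_chunk(item):
    # a real worker would write `batch` into the dataset at this chunk's offset
    chunk_id, batch = item
    return chunk_id, len(batch)

batched_values = [[0, 1, 2], [3, 4], [5]]
tasks = [upload_chunk.remote(el) for el in enumerate(batched_values)]
print(ray.get(tasks))  # [(0, 3), (1, 2), (2, 1)]
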
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value

    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")  # sets the 8th image
    """
    self.dataset._auto_checkout()

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    slice_list = [0] + slice_list if isinstance(self.indexes, int) else slice_list

    assign_value = get_value(value)

    schema_dict = self.dataset.schema
    if subpath[1:] in schema_dict.dict_.keys():
        schema_key = schema_dict.dict_.get(subpath[1:], None)
    else:
        for schema_key in subpath[1:].split("/"):
            schema_dict = schema_dict.dict_.get(schema_key, None)
            if not isinstance(schema_dict, SchemaDict):
                schema_key = schema_dict
    if isinstance(schema_key, ClassLabel):
        assign_value = check_class_label(assign_value, schema_key)
    if isinstance(schema_key, (Text, bytes)) or (
        isinstance(assign_value, Iterable)
        and any(isinstance(val, str) for val in assign_value)
    ):
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without key")
    elif subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in dataset")

    if not slice_list:
        slice_ = (
            slice(self.indexes[0], self.indexes[-1] + 1)
            if self.is_contiguous
            else self.indexes
        )
        if not isinstance(slice_, list):
            self.dataset._tensors[subpath][slice_] = assign_value
        else:
            for i, index in enumerate(slice_):
                self.dataset._tensors[subpath][index] = assign_value[i]
    else:
        if isinstance(self.indexes, list):
            indexes = self.indexes[slice_list[0]]
            if self.is_contiguous and isinstance(indexes, list) and indexes:
                slice_list[0] = slice(indexes[0], indexes[-1] + 1)
            else:
                slice_list[0] = indexes
        else:
            slice_list[0] = self.indexes

        if not isinstance(slice_list[0], list):
            self.dataset._tensors[subpath][slice_list] = assign_value
        else:
            for i, index in enumerate(slice_list[0]):
                current_slice = [index] + slice_list[1:]
                self.dataset._tensors[subpath][current_slice] = assign_value[i]