Example #1
0
    def __setitem__(self, slice_, value):
        """| Sets a slice of the objectview with a value.

        A bare full slice (``view[:] = value``) writes through this view
        itself; any other key is first resolved to a narrowed sub-view via
        ``__getitem__``.
        """
        if isinstance(slice_, slice) and (slice_.start is None
                                          and slice_.stop is None):
            objview = self
        else:
            objview = self.__getitem__(slice_)
        # Unwrap the incoming value (e.g. views/wrappers) into raw data.
        assign_value = get_value(value)

        if not isinstance(objview.dataset, DatasetView):
            # subpath present but no slice done
            assign_value = str_to_int(assign_value, objview.dataset.tokenizer)
            if len(objview.subpath.split("/")[1:]) > 1:
                raise IndexError("Can only go deeper on single datapoint")
        # NOTE(review): the branch above does not return — control always
        # falls through into the squeeze_dim test below; confirm that
        # objview.dataset has a squeeze_dim attribute in that case.
        if not objview.dataset.squeeze_dim:
            # assign a combined tensor for multiple datapoints
            # only possible if the field has a fixed size
            assign_value = str_to_int(assign_value,
                                      objview.dataset.dataset.tokenizer)
            paths = objview.subpath.split("/")[1:]
            if len(paths) > 1:
                raise IndexError("Can only go deeper on single datapoint")
        else:
            # single datapoint
            def assign(paths, value):
                # helper function for recursive assign
                # Walks down the remaining path components; at the leaf it
                # writes `assign_value` into the region addressed by `slice_`
                # (both read from the enclosing scope — note `slice_` is
                # rebuilt below before this helper is first called).
                if len(paths) > 0:
                    path = paths.pop(0)
                    value[path] = assign(paths, value[path])
                    return value
                try:
                    value[tuple(slice_)] = assign_value
                except TypeError:
                    # Leaf is not indexable (scalar): replace it outright.
                    value = assign_value
                return value

            assign_value = str_to_int(assign_value,
                                      objview.dataset.dataset.tokenizer)
            paths = objview.subpath.split("/")[1:]
            schema = objview.schema[paths[0]]
            # Rebuild slice_ from this view's nums/offsets/squeeze_dims:
            # squeezed dims collapse to a plain index, sized dims become
            # explicit slices, unsized dims select everything.
            slice_ = [
                of if sq else slice(of, of +
                                    num) if num else slice(None, None)
                for num, of, sq in zip(objview.nums, objview.offsets,
                                       objview.squeeze_dims)
            ]
            if isinstance(schema, Sequence):
                if isinstance(schema.dtype, SchemaDict):
                    # if sequence of dict, have to fetch everything
                    value = objview.dataset[paths[0]].compute()
                    value = assign(paths[1:], value)
                    objview.dataset[paths[0]] = value
                else:
                    # sequence of tensors
                    value = objview.dataset[paths[0]].compute()
                    value[tuple(slice_)] = assign_value
                    objview.dataset[paths[0]] = value
            # NOTE(review): no visible else-branch for non-Sequence schemas;
            # this extract may be truncated — confirm against the full file.
Example #2
0
    def __setitem__(self, slice_, value):
        """| Write ``value`` into the tensor addressed by ``slice_``.

        | Usage:
        >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
        >>> images = ds["image"]
        >>> image = images[5]
        >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
        """
        # Unwrap the value, then tokenize any strings/bytes into ints.
        payload = str_to_int(get_value(value), self.tokenizer)

        # Normalize the key into a list of components (subpath + slices).
        if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
            components = [slice_]
        else:
            components = list(slice_)
        subpath, slice_list = slice_split(components)

        if not subpath:
            raise ValueError("Can't assign to dataset sliced without subpath")
        if subpath not in self.keys:
            raise KeyError(f"Key {subpath} not found in the dataset")

        target = self._tensors[subpath]
        if slice_list:
            target[slice_list] = payload
        else:
            target[:] = payload
Example #3
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> images_tensorview = ds["image"]
        >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image
        """
        # handling strings and bytes
        assign_value = value
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

        # Normalize the key into a list of components and fill missing dims.
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        slice_ = self.slice_fill(slice_)
        subpath, slice_list = slice_split(slice_)

        if subpath:
            raise ValueError(
                "Can't slice a Tensor with multiple slices without subpath")
        else:
            # Combine the incoming slices with this view's nums/offsets so
            # the write lands on the right region of the underlying tensor.
            new_nums = self.nums.copy()
            new_offsets = self.offsets.copy()
            if len(new_nums) < len(slice_list):
                new_nums.extend([None] * (len(slice_list) - len(new_nums)))
                new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
            for i in range(len(slice_list)):
                slice_list[i] = self._combine(slice_[i], new_nums[i],
                                              new_offsets[i])
            # Append the view's own remaining dims not covered by the key.
            for i in range(len(slice_list), len(new_nums)):
                cur_slice = (slice(new_offsets[i], new_offsets[i] +
                                   new_nums[i])
                             if new_nums[i] > 1 else new_offsets[i])
                slice_list.append(cur_slice)
            # BUG FIX: write the converted value (strings/bytes tokenized by
            # str_to_int above), not the raw `value` — previously the
            # conversion result was silently discarded.
            self.dataset._tensors[self.subpath][slice_list] = assign_value
Example #4
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> ds_view = ds[5:15]
        >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image
        """
        # handling strings and bytes
        assign_value = value
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

        # Normalize the key to a list of components (subpath + slices).
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        subpath, slice_list = slice_split(slice_)
        # A squeezed view addresses a single sample: prepend index 0 so the
        # sample axis is always present in slice_list.
        slice_list = [0] + slice_list if self.squeeze_dim else slice_list
        if not subpath:
            raise ValueError("Can't assign to dataset sliced without subpath")
        elif not slice_list:
            # No per-sample slice given: write this view's whole window,
            # shifted by the view's offset into the underlying dataset.
            slice_ = (self.offset if self.num_samples == 1 else slice(
                self.offset, self.offset + self.num_samples))
            self.dataset._tensors[subpath][
                slice_] = assign_value  # Add path check
        else:
            # Translate the first (sample) slice into absolute coordinates
            # by shifting it with this view's offset.
            num, ofs = (slice_extract_info(slice_list[0], self.num_samples)
                        if isinstance(slice_list[0], slice) else
                        (1, slice_list[0]))
            slice_list[0] = (slice(ofs + self.offset, ofs + self.offset +
                                   num) if num > 1 else ofs + self.offset)
            self.dataset._tensors[subpath][slice_list] = assign_value
Example #5
0
    def upload(self,
               results,
               ds: Dataset,
               token: dict,
               progressbar: bool = True):
        """Upload transform results to *ds* in chunk-sized batches.

        Each tensor is written one chunk-sized batch at a time. Dynamic
        tensors have dynamicness disabled during the bulk write and
        re-enabled afterwards, at which point their per-sample shapes are
        rewritten.

        Parameters
        ----------
        results:
            Output of transform function
        ds: hub.Dataset
            Dataset object that should be written to
        token: dict
        progressbar: bool
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """
        for key, value in results.items():

            chunk_len = ds[key].chunksize[0]
            value = str_to_int(value, ds.dataset.tokenizer)

            # Guard against a zero chunksize.
            if not chunk_len:
                chunk_len = 1

            batches = batchify(value, chunk_len)

            def write_batch(indexed):
                # Write one batch at its absolute position in the tensor.
                idx, batch = indexed
                start = idx * chunk_len
                if len(batch) == 1:
                    ds[key, start] = batch[0]
                else:
                    ds[key, start:start + len(batch)] = batch

            indexed_batches = list(enumerate(batches))

            # Bulk writes are faster with dynamicness switched off.
            ds.dataset._tensors[f"/{key}"].disable_dynamicness()
            list(self.map(write_batch, indexed_batches))

            # Restore dynamicness and rewrite the true per-sample shapes.
            tensor = ds.dataset._tensors[f"/{key}"]
            if tensor.is_dynamic:
                tensor.enable_dynamicness()
                for i, v in enumerate(value):
                    tensor.set_shape([i + ds.offset], v)

        ds.commit()
        return ds
Example #6
0
    def upload(self,
               results,
               ds: Dataset,
               token: dict,
               progressbar: bool = True):
        """Batchified upload of results.
        For each tensor batchify based on its chunk and upload.
        If tensor is dynamic then still upload element by element.
        For dynamic tensors, dynamicness is disabled during the write and
        re-enabled afterwards.

        Parameters
        ----------
        ds: hub.Dataset
            Dataset object that should be written to
        results:
            Output of transform function
        progressbar: bool
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """

        for key, value in results.items():

            chunk = ds[key].chunksize[0]
            # Guard against a zero chunksize.
            chunk = 1 if chunk == 0 else chunk
            value = get_value(value)
            value = str_to_int(value, ds.dataset.tokenizer)

            # Split the work so each worker receives a whole number of
            # chunks; with a single worker everything goes in one batch.
            num_chunks = math.ceil(len(value) / (chunk * self.workers))
            length = num_chunks * chunk if self.workers != 1 else len(value)
            batched_values = batchify(value, length)

            def upload_chunk(i_batch):
                # Write one batch at position i * len(batch).
                # NOTE(review): local `length` shadows the outer batch size;
                # if the final batch is shorter, i * len(batch) differs from
                # i * outer length — confirm batchify's trailing-batch
                # behavior against the full file.
                i, batch = i_batch
                length = len(batch)
                slice_ = slice(i * length, (i + 1) * length)
                ds[key, slice_] = batch

            index_batched_values = list(
                zip(list(range(len(batched_values))), batched_values))

            # Disable dynamic arrays
            ds.dataset._tensors[f"/{key}"].disable_dynamicness()
            list(self.map(upload_chunk, index_batched_values))
            offset = ds.indexes[
                0]  # here ds.indexes will always be a contiguous list as obtained after slicing

            # Enable and rewrite shapes
            if ds.dataset._tensors[f"/{key}"].is_dynamic:
                ds.dataset._tensors[f"/{key}"].enable_dynamicness()
                ds.dataset._tensors[f"/{key}"].set_shape(
                    [slice(offset, offset + len(value))], value)

        ds.commit()
        return ds
Example #7
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> ds_view = ds[5:15]
        >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image
        """
        # Make sure we are on a writable head before mutating.
        self.dataset._auto_checkout()
        assign_value = get_value(value)
        assign_value = str_to_int(
            assign_value, self.dataset.tokenizer)  # handling strings and bytes

        # Normalize the key to a list of components (subpath + slices).
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        subpath, slice_list = slice_split(slice_)
        # A single-sample view (int indexes): prepend 0 so the sample axis
        # is always explicit in slice_list.
        slice_list = [0] + slice_list if isinstance(self.indexes,
                                                    int) else slice_list

        if not subpath:
            raise ValueError("Can't assign to dataset sliced without key")
        elif subpath not in self.keys:
            raise KeyError(f"Key {subpath} not found in dataset")

        if not slice_list:
            # Whole-view write: collapse contiguous indexes into one slice,
            # otherwise keep the raw index list.
            slice_ = (slice(self.indexes[0], self.indexes[-1] +
                            1) if self.is_contiguous else self.indexes)
            if not isinstance(slice_, list):
                self.dataset._tensors[subpath][slice_] = assign_value
            else:
                # Non-contiguous selection: write sample by sample.
                for i, index in enumerate(slice_):
                    self.dataset._tensors[subpath][index] = assign_value[i]
        else:
            # Map the first (sample) slice through this view's indexes into
            # absolute dataset coordinates.
            if isinstance(self.indexes, list):
                indexes = self.indexes[slice_list[0]]
                if self.is_contiguous and isinstance(indexes,
                                                     list) and indexes:
                    slice_list[0] = slice(indexes[0], indexes[-1] + 1)
                else:
                    slice_list[0] = indexes
            else:
                slice_list[0] = self.indexes

            if not isinstance(slice_list[0], list):
                self.dataset._tensors[subpath][slice_list] = assign_value
            else:
                # Fancy (list) first axis: write row by row.
                for i, index in enumerate(slice_list[0]):
                    current_slice = [index] + slice_list[1:]
                    self.dataset._tensors[subpath][
                        current_slice] = assign_value[i]
Example #8
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> images_tensorview = ds["image"]
        >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image
        """
        assign_value = get_value(value)
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

        # Normalize the key into a list of components and fill missing dims.
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        slice_ = self.slice_fill(slice_)
        subpath, slice_list = slice_split(slice_)
        if subpath:
            raise ValueError("Can't setitem of TensorView with subpath")
        new_nums = self.nums.copy()
        new_offsets = self.offsets.copy()
        # Translate the first (sample) slice through this view's indexes.
        if isinstance(self.indexes, list):
            new_indexes = self.indexes[slice_list[0]]
            if self.is_contiguous and new_indexes:
                new_indexes = slice(new_indexes[0], new_indexes[-1] + 1)
        elif isinstance(self.indexes, int):
            new_indexes = self.indexes
        else:
            ofs = self.indexes.start or 0
            num = self.indexes.stop - ofs if self.indexes.stop else None
            new_indexes = self._combine(slice_list[0], num, ofs)
        slice_list[0] = new_indexes
        # Combine remaining slices with the view's nums/offsets and append
        # any view dims not covered by the key.
        if len(new_nums) < len(slice_list):
            new_nums.extend([None] * (len(slice_list) - len(new_nums)))
            new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
        for i in range(1, len(slice_list)):
            slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
        for i in range(len(slice_list), len(new_nums)):
            cur_slice = (
                slice(new_offsets[i], new_offsets[i] + new_nums[i])
                if not self.squeeze_dims[i]
                else new_offsets[i]
            )
            slice_list.append(cur_slice)

        if isinstance(slice_list[0], (int, slice)):
            self.dataset._tensors[self.subpath][slice_list] = assign_value
        else:
            # Fancy (list) first axis: write row by row.
            # BUG FIX: index _tensors with self.subpath — `subpath` is
            # always "" here (a truthy subpath raised above), so the old
            # _tensors[subpath] lookup raised KeyError.
            for i, index in enumerate(slice_list[0]):
                current_slice = [index] + slice_list[1:]
                self.dataset._tensors[self.subpath][current_slice] = assign_value[i]
Example #9
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:
        >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
        >>> images = ds["image"]
        >>> image = images[5]
        >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
        """
        # Writes are rejected in read mode.
        if "r" in self._mode:
            raise ReadModeException("__setitem__")
        self._auto_checkout()

        # Normalize the key to a list of components (subpath + slices).
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        subpath, slice_list = slice_split(slice_)

        if not subpath:
            raise ValueError("Can't assign to dataset sliced without subpath")
        elif subpath not in self.keys:
            raise KeyError(f"Key {subpath} not found in the dataset")

        assign_value = get_value(value)
        # Resolve the schema entry for this subpath: direct key first,
        # otherwise walk the nested SchemaDicts component by component.
        schema_dict = self.schema
        if subpath[1:] in schema_dict.dict_.keys():
            schema_key = schema_dict.dict_.get(subpath[1:], None)
        else:
            for schema_key in subpath[1:].split("/"):
                schema_dict = schema_dict.dict_.get(schema_key, None)
                if not isinstance(schema_dict, SchemaDict):
                    # Leaf reached: the schema object itself becomes the key.
                    # NOTE(review): a missing component leaves schema_key =
                    # None — verify against the full file.
                    schema_key = schema_dict
        if isinstance(schema_key, ClassLabel):
            # Map label names/values through the ClassLabel's vocabulary.
            assign_value = check_class_label(assign_value, schema_key)
        if isinstance(
                schema_key,
            (Text, bytes)) or (isinstance(assign_value, Iterable) and any(
                isinstance(val, str) for val in assign_value)):
            # handling strings and bytes
            assign_value = str_to_int(assign_value, self.tokenizer)

        if not slice_list:
            self._tensors[subpath][:] = assign_value
        else:
            self._tensors[subpath][slice_list] = assign_value
Example #10
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> images_tensorview = ds["image"]
        >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image
        """
        assign_value = get_value(value)
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

        # Normalize the key into a list of components and fill missing dims.
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        slice_ = self.slice_fill(slice_)
        subpath, slice_list = slice_split(slice_)
        # Shift the incoming slices by this view's nums/offsets so they
        # address the underlying tensor in absolute coordinates.
        new_nums = self.nums.copy()
        new_offsets = self.offsets.copy()
        if len(new_nums) < len(slice_list):
            new_nums.extend([None] * (len(slice_list) - len(new_nums)))
            new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
        for i in range(len(slice_list)):
            slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
        # Append the view's remaining dims not covered by the key.
        for i in range(len(slice_list), len(new_nums)):
            cur_slice = (
                slice(new_offsets[i], new_offsets[i] + new_nums[i])
                if new_nums[i] > 1
                else new_offsets[i]
            )
            slice_list.append(cur_slice)
        if subpath or (
            len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence)
        ):
            # Going deeper than the tensor itself (a subpath, or indexing
            # into a Sequence element): delegate the write to an ObjectView.
            ObjectView(
                dataset=self.dataset,
                subpath=self.subpath + subpath,
                slice_list=slice_list,
            )[:] = assign_value
        else:
            self.dataset._tensors[self.subpath][slice_list] = assign_value
Example #11
0
File: ray.py Project: x213212/Hub
    def upload(
        self,
        results,
        url: str,
        token: dict,
        progressbar: bool = True,
        public: bool = True,
    ):
        """Batchified upload of results.
        For each tensor batchify based on its chunk and upload.
        If tensor is dynamic then still upload element by element.

        Parameters
        ----------
        results:
            Output of transform function
        url: str
            Destination the new dataset is written to
        token: dict
        progressbar: bool
        public: bool, optional
            only applicable if using hub storage, ignored otherwise
            setting this to False allows only the user who created it to access the dataset and
            the dataset won't be visible in the visualizer to the public
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """
        # Dataset length comes from the first transform output column.
        if len(list(results.values())) == 0:
            shape = (0, )
        else:
            shape = (len(list(results.values())[0]), )

        ds = Dataset(
            url,
            mode="w",
            shape=shape,
            schema=self.schema,
            token=token,
            cache=False,
            public=public,
        )

        tasks = []
        for key, value in results.items():

            length = ds[key].chunksize[0]
            if length == 0:
                # Guard: batchify with a zero batch size cannot make
                # progress (consistent with the non-ray upload path).
                length = 1
            value = get_value(value)
            value = str_to_int(value, ds.tokenizer)
            batched_values = batchify(value, length)
            index_batched_values = list(enumerate(batched_values))

            # Disable dynamic shapes during the bulk write; they are
            # rewritten from the task results below.
            ds._tensors[f"/{key}"].disable_dynamicness()

            # BUG FIX: the original rebound `results` (the dict being
            # iterated) here; use a distinct name so the parameter keeps its
            # meaning and iteration is unambiguous.
            chunk_tasks = [
                self.upload_chunk.remote(el, key=key, ds=ds)
                for el in index_batched_values
            ]
            tasks.extend(chunk_tasks)

        # Wait for every remote chunk write, then restore dynamic shapes.
        task_results = ray.get(tasks)
        self.set_dynamic_shapes(task_results, ds)
        ds.commit()
        return ds
Example #12
0
    def __setitem__(self, slice_, value):
        """| Sets a slice or slices with a value
        | Usage:

        >>> ds_view = ds[5:15]
        >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image
        """
        # Make sure we are on a writable head before mutating.
        self.dataset._auto_checkout()

        # Normalize the key to a list of components (subpath + slices).
        if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
            slice_ = [slice_]
        slice_ = list(slice_)
        subpath, slice_list = slice_split(slice_)
        # A single-sample view (int indexes): prepend 0 so the sample axis
        # is always explicit in slice_list.
        slice_list = [0] + slice_list if isinstance(self.indexes,
                                                    int) else slice_list

        assign_value = get_value(value)
        # Resolve the schema entry for this subpath: direct key first,
        # otherwise walk the nested SchemaDicts component by component.
        # NOTE(review): subpath[1:] is read before the empty-subpath check
        # below; with an empty subpath this resolves schema_key to None and
        # the ValueError still fires afterwards — confirm intended.
        schema_dict = self.dataset.schema
        if subpath[1:] in schema_dict.dict_.keys():
            schema_key = schema_dict.dict_.get(subpath[1:], None)
        else:
            for schema_key in subpath[1:].split("/"):
                schema_dict = schema_dict.dict_.get(schema_key, None)
                if not isinstance(schema_dict, SchemaDict):
                    # Leaf reached: the schema object itself becomes the key.
                    schema_key = schema_dict
        if isinstance(schema_key, ClassLabel):
            # Map label names/values through the ClassLabel's vocabulary.
            assign_value = check_class_label(assign_value, schema_key)
        if isinstance(
                schema_key,
            (Text, bytes)) or (isinstance(assign_value, Iterable) and any(
                isinstance(val, str) for val in assign_value)):
            # handling strings and bytes
            assign_value = str_to_int(assign_value, self.dataset.tokenizer)

        if not subpath:
            raise ValueError("Can't assign to dataset sliced without key")
        elif subpath not in self.keys:
            raise KeyError(f"Key {subpath} not found in dataset")

        if not slice_list:
            # Whole-view write: collapse contiguous indexes into one slice,
            # otherwise keep the raw index list.
            slice_ = (slice(self.indexes[0], self.indexes[-1] +
                            1) if self.is_contiguous else self.indexes)
            if not isinstance(slice_, list):
                self.dataset._tensors[subpath][slice_] = assign_value
            else:
                # Non-contiguous selection: write sample by sample.
                for i, index in enumerate(slice_):
                    self.dataset._tensors[subpath][index] = assign_value[i]
        else:
            # Map the first (sample) slice through this view's indexes into
            # absolute dataset coordinates.
            if isinstance(self.indexes, list):
                indexes = self.indexes[slice_list[0]]
                if self.is_contiguous and isinstance(indexes,
                                                     list) and indexes:
                    slice_list[0] = slice(indexes[0], indexes[-1] + 1)
                else:
                    slice_list[0] = indexes
            else:
                slice_list[0] = self.indexes

            if not isinstance(slice_list[0], list):
                self.dataset._tensors[subpath][slice_list] = assign_value
            else:
                # Fancy (list) first axis: write row by row.
                for i, index in enumerate(slice_list[0]):
                    current_slice = [index] + slice_list[1:]
                    self.dataset._tensors[subpath][
                        current_slice] = assign_value[i]