Example #1
    def upload(self,
               results,
               ds: Dataset,
               token: dict,
               progressbar: bool = True):
        """Batchified upload of results
        For each tensor batchify based on its chunk and upload
        If tensor is dynamic then still upload element by element
        For dynamic tensors, it disable dynamicness and then enables it back

        Parameters
        ----------
        dataset: hub.Dataset
            Dataset object that should be written to
        results:
            Output of transform function
        progressbar: bool
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """

        for key, value in results.items():

            length = ds[key].chunksize[0]
            value = str_to_int(value, ds.dataset.tokenizer)

            if length == 0:
                length = 1

            batched_values = batchify(value, length)

            def upload_chunk(i_batch):
                i, batch = i_batch
                batch_length = len(batch)
                if batch_length != 1:
                    ds[key, i * length:i * length + batch_length] = batch
                else:
                    ds[key, i * length] = batch[0]

            index_batched_values = list(enumerate(batched_values))

            # Disable dynamic arrays
            ds.dataset._tensors[f"/{key}"].disable_dynamicness()
            list(self.map(upload_chunk, index_batched_values))

            # Enable and rewrite shapes
            if ds.dataset._tensors[f"/{key}"].is_dynamic:
                ds.dataset._tensors[f"/{key}"].enable_dynamicness()
                for i, v in enumerate(value):
                    ds.dataset._tensors[f"/{key}"].set_shape([i + ds.offset], v)

        ds.commit()
        return ds
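
The slice arithmetic in upload_chunk above is easiest to see with plain Python lists. Below is a minimal sketch, assuming a toy batchify that splits a sequence into consecutive pieces of at most `length` items; it is a stand-in for the project's own helper, not its actual implementation.

def batchify(seq, length):
    # Toy stand-in: split seq into consecutive pieces of at most `length` items.
    return [seq[i:i + length] for i in range(0, len(seq), length)]

value = list(range(10))        # values for one tensor key
length = 4                     # chunk size, as read from ds[key].chunksize[0]
target = [None] * len(value)   # stands in for ds[key, ...]

for i, batch in enumerate(batchify(value, length)):
    batch_length = len(batch)
    if batch_length != 1:
        target[i * length:i * length + batch_length] = batch
    else:
        target[i * length] = batch[0]

assert target == value

Each batch lands at offset i * length; the single-element branch mirrors the original code, which assigns a scalar rather than a length-1 slice.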
Example #2
    def upload(self,
               results,
               ds: Dataset,
               token: dict,
               progressbar: bool = True):
        """Batchified upload of results.
        For each tensor batchify based on its chunk and upload.
        If tensor is dynamic then still upload element by element.
        For dynamic tensors, it disable dynamicness and then enables it back.

        Parameters
        ----------
        dataset: hub.Dataset
            Dataset object that should be written to
        results:
            Output of transform function
        progressbar: bool
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """

        for key, value in results.items():

            chunk = ds[key].chunksize[0]
            chunk = 1 if chunk == 0 else chunk
            value = get_value(value)
            value = str_to_int(value, ds.dataset.tokenizer)

            num_chunks = math.ceil(len(value) / (chunk * self.workers))
            length = num_chunks * chunk if self.workers != 1 else len(value)
            batched_values = batchify(value, length)

            def upload_chunk(i_batch):
                i, batch = i_batch
                length = len(batch)
                slice_ = slice(i * length, (i + 1) * length)
                ds[key, slice_] = batch

            index_batched_values = list(enumerate(batched_values))

            # Disable dynamic arrays
            ds.dataset._tensors[f"/{key}"].disable_dynamicness()
            list(self.map(upload_chunk, index_batched_values))
            # ds.indexes is always a contiguous list here, obtained after slicing
            offset = ds.indexes[0]

            # Enable and rewrite shapes
            if ds.dataset._tensors[f"/{key}"].is_dynamic:
                ds.dataset._tensors[f"/{key}"].enable_dynamicness()
                ds.dataset._tensors[f"/{key}"].set_shape(
                    [slice(offset, offset + len(value))], value)

        ds.commit()
        return ds
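
The batch size in Example #2 is rounded up to a whole number of chunks per worker, presumably so that two workers never write into the same chunk. A small worked sketch of that arithmetic, with illustrative numbers that are not taken from the project:

import math

chunk = 16       # ds[key].chunksize[0]
workers = 4      # self.workers
n = 1000         # len(value)

num_chunks = math.ceil(n / (chunk * workers))         # chunks per worker batch -> 16
length = num_chunks * chunk if workers != 1 else n     # items per batch -> 256

num_batches = math.ceil(n / length)                    # -> 4, one batch per worker
print(num_chunks, length, num_batches)

With these numbers each batch starts on a chunk boundary (a multiple of 256 items, i.e. 16 whole chunks), and only the final batch is shorter.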
Example #3
    def upload(self, results, url: str, token: dict, progressbar: bool = True):
        """Batchified upload of results
        For each tensor batchify based on its chunk and upload
        If tensor is dynamic then still upload element by element

        Parameters
        ----------
        dataset: hub.Dataset
            Dataset object that should be written to
        results:
            Output of transform function
        progressbar: bool
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """
        shape = (len(list(results.values())[0]), )
        ds = Dataset(
            url,
            mode="w",
            shape=shape,  # unknown
            schema=self.schema,
            token=token,
            cache=False,
        )

        tasks = []
        for key, value in results.items():
            length = ds[key].chunksize[0]
            batched_values = batchify(value, length)

            index_batched_values = list(enumerate(batched_values))
            chunk_tasks = [
                self.upload_chunk.remote(el, key=key, ds=ds)
                for el in index_batched_values
            ]
            tasks.extend(chunk_tasks)

        ray.get(tasks)
        ds.commit()
        return ds
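
Examples #3 and #4 use the standard Ray fan-out/gather pattern: one remote task per (index, batch) pair, collected with ray.get. A minimal runnable sketch with a toy remote function standing in for self.upload_chunk:

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def upload_chunk(i_batch):
    # Toy stand-in for self.upload_chunk: report which batch it handled.
    i, batch = i_batch
    return i, len(batch)

batched_values = [[0, 1], [2, 3], [4]]
tasks = [upload_chunk.remote(el) for el in enumerate(batched_values)]

# ray.get blocks until every scheduled chunk upload has finished.
print(ray.get(tasks))    # [(0, 2), (1, 2), (2, 1)]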
Example #4
File: ray.py Project: x213212/Hub
    def upload(
        self,
        results,
        url: str,
        token: dict,
        progressbar: bool = True,
        public: bool = True,
    ):
        """Batchified upload of results.
        For each tensor batchify based on its chunk and upload.
        If tensor is dynamic then still upload element by element.

        Parameters
        ----------
        dataset: hub.Dataset
            Dataset object that should be written to
        results:
            Output of transform function
        progressbar: bool
        public: bool, optional
            only applicable if using hub storage, ignored otherwise
            setting this to False allows only the user who created it to access the dataset and
            the dataset won't be visible in the visualizer to the public
        Returns
        ----------
        ds: hub.Dataset
            Uploaded dataset
        """
        if len(list(results.values())) == 0:
            shape = (0, )
        else:
            shape = (len(list(results.values())[0]), )

        ds = Dataset(
            url,
            mode="w",
            shape=shape,
            schema=self.schema,
            token=token,
            cache=False,
            public=public,
        )

        tasks = []
        for key, value in results.items():

            length = ds[key].chunksize[0]
            value = get_value(value)
            value = str_to_int(value, ds.tokenizer)
            batched_values = batchify(value, length)
            index_batched_values = list(enumerate(batched_values))

            ds._tensors[f"/{key}"].disable_dynamicness()

            chunk_tasks = [
                self.upload_chunk.remote(el, key=key, ds=ds)
                for el in index_batched_values
            ]
            tasks.extend(chunk_tasks)

        results = ray.get(tasks)
        self.set_dynamic_shapes(results, ds)
        ds.commit()
        return ds
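
Examples #1, #2 and #4 all wrap the bulk write in disable_dynamicness()/enable_dynamicness() and record the true per-element shapes afterwards (in #4 via set_dynamic_shapes on the values returned by the Ray tasks). The toy class below only illustrates that freeze, write, restore-shapes ordering; DynTensor is a hypothetical stand-in and not the hub tensor class.

class DynTensor:
    """Hypothetical stand-in for a dynamic tensor; not the hub implementation."""

    def __init__(self, n):
        self.data = [None] * n
        self.shapes = [None] * n
        self.is_dynamic = True
        self._dynamic = True

    def disable_dynamicness(self):
        # Writes skip per-element shape bookkeeping while this is off.
        self._dynamic = False

    def enable_dynamicness(self):
        self._dynamic = True

    def set_shape(self, i, v):
        # Record the true shape after the bulk write.
        self.shapes[i] = len(v)

t = DynTensor(3)
values = [[1], [2, 3], [4, 5, 6]]

t.disable_dynamicness()
for i, v in enumerate(values):      # fast path: plain element assignment
    t.data[i] = v

if t.is_dynamic:
    t.enable_dynamicness()
    for i, v in enumerate(values):
        t.set_shape(i, v)

print(t.shapes)    # [1, 2, 3]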