Example #1
    def __init__(
            self,
            dataset=None,
            lazy: bool = True,
            indexes=None,  # list or integer
    ):
        """Creates a DatasetView object for a subset of the Dataset.

        Parameters
        ----------
        dataset: hub.api.dataset.Dataset object
            The dataset whose DatasetView is being created
        lazy: bool, optional
            Setting this to False will stop lazy computation and will allow items to be accessed without .compute()
        indexes: int or list of int
            Either a list or an integer, depending on the slicing; the indexes of the Dataset that this DatasetView represents
        """
        if dataset is None:
            raise NoneValueException("dataset")
        if indexes is None:
            raise NoneValueException("indexes")

        self.dataset = dataset
        self.lazy = lazy
        self.indexes = indexes
        self.is_contiguous = False
        if isinstance(self.indexes, list) and self.indexes:
            self.is_contiguous = self.indexes[-1] - self.indexes[0] + 1 == len(
                self.indexes)
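
A minimal standalone sketch of the contiguity check above (the helper name is hypothetical, not part of the class): a sorted, duplicate-free index list is contiguous exactly when its span equals its length.

def is_contiguous(indexes):
    # Mirrors the check in __init__: span == length implies no gaps,
    # assuming the list is sorted in ascending order without duplicates.
    if isinstance(indexes, list) and indexes:
        return indexes[-1] - indexes[0] + 1 == len(indexes)
    return False

print(is_contiguous([3, 4, 5, 6]))  # True  (span 4, length 4)
print(is_contiguous([1, 3, 5]))     # False (span 5, length 3)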
Example #2
    def __init__(
        self,
        dataset=None,
        num_samples: int = None,
        offset: int = None,
        squeeze_dim: bool = False,
        lazy: bool = True,
    ):
        """Creates a DatasetView object for a subset of the Dataset.

        Parameters
        ----------
        dataset: hub.api.dataset.Dataset object
            The dataset whose DatasetView is being created
        num_samples: int
            The number of samples in this DatasetView
        offset: int
            The offset from which the DatasetView starts
        squeeze_dim: bool, optional
            When slicing with an integer, the first dimension is squeezed out so that a single sample is returned rather than a length-1 view
        lazy: bool, optional
            Setting this to False will stop lazy computation and will allow items to be accessed without .compute()
        """
        if dataset is None:
            raise NoneValueException("dataset")
        if num_samples is None:
            raise NoneValueException("num_samples")
        if offset is None:
            raise NoneValueException("offset")

        self.dataset = dataset
        self.num_samples = num_samples
        self.offset = offset
        self.squeeze_dim = squeeze_dim
        self.lazy = lazy
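
As a hedged illustration, a slice such as ds[5:15] would map onto these parameters roughly as follows; ds and view are placeholder names, and the idea that Dataset indexing builds such a view internally is an assumption based on the docstring rather than something shown in this snippet.

# `ds` is assumed to be an already-opened hub Dataset with at least 15 samples,
# and DatasetView is the class defined above.
view = DatasetView(
    dataset=ds,
    num_samples=10,     # ds[5:15] spans 10 samples
    offset=5,           # starting at index 5
    squeeze_dim=False,  # a slice keeps the sample dimension
    lazy=True,          # items still require .compute() to materialize
)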
Example #3
    def __init__(
        self,
        dataset=None,
        num_samples=None,
        offset=None,
        squeeze_dim=False,
    ):
        """Creates a DatasetView object for a subset of the Dataset.

        Parameters
        ----------
        dataset: hub.api.dataset.Dataset object
            The dataset whose DatasetView is being created
        num_samples: int
            The number of samples in this DatasetView
        offset: int
            The offset from which the DatasetView starts
        squeeze_dim: bool, optional
            When slicing with an integer, the first dimension is squeezed out so that a single sample is returned rather than a length-1 view
        """
        if dataset is None:
            raise NoneValueException("dataset")
        if num_samples is None:
            raise NoneValueException("num_samples")
        if offset is None:
            raise NoneValueException("offset")

        self.dataset = dataset
        self.num_samples = num_samples
        self.offset = offset
        self.squeeze_dim = squeeze_dim
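
To show why squeeze_dim matters, here is a small runnable sketch that maps an integer or slice index to the constructor arguments documented above; the helper name index_to_view_args is hypothetical and not part of the library.

def index_to_view_args(key, dataset_len):
    # An integer picks one sample and squeezes the leading dimension;
    # a slice keeps it and only sets the offset and sample count.
    if isinstance(key, int):
        return {"num_samples": 1, "offset": key, "squeeze_dim": True}
    start, stop, _ = key.indices(dataset_len)
    return {"num_samples": stop - start, "offset": start, "squeeze_dim": False}

print(index_to_view_args(7, dataset_len=100))
# {'num_samples': 1, 'offset': 7, 'squeeze_dim': True}
print(index_to_view_args(slice(5, 15), dataset_len=100))
# {'num_samples': 10, 'offset': 5, 'squeeze_dim': False}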
Example #4
    def __init__(
        self,
        dataset=None,
        subpath=None,
        slice_=None,
        lazy: bool = True,
    ):
        """Creates a TensorView object for a particular tensor in the dataset

        Parameters
        ----------
        dataset: hub.api.dataset.Dataset object
            The dataset whose TensorView is being created
        subpath: str
            The full path to the particular Tensor in the Dataset
        slice_: optional
            The `slice_` of this Tensor that needs to be accessed
        lazy: bool, optional
            Setting this to False will stop lazy computation and will allow items to be accessed without .compute()
        """

        if dataset is None:
            raise NoneValueException("dataset")
        if subpath is None:
            raise NoneValueException("subpath")

        self.dataset = dataset
        self.subpath = subpath
        self.lazy = lazy

        if isinstance(slice_, (int, slice)):
            self.slice_ = [slice_]
        elif isinstance(slice_, (tuple, list)):
            self.slice_ = list(slice_)
        self.nums = []
        self.offsets = []

        self.squeeze_dims = []
        for it in self.slice_:
            if isinstance(it, int):
                self.nums.append(1)
                self.offsets.append(it)
                self.squeeze_dims.append(True)
            elif isinstance(it, slice):
                ofs = it.start or 0
                num = it.stop - ofs if it.stop else None
                self.nums.append(num)
                self.offsets.append(ofs)
                self.squeeze_dims.append(False)
        self.nums[0] = (
            self.dataset.shape[0] - self.offsets[0]
            if self.nums[0] is None
            else self.nums[0]
        )
        self.dtype = self.dtype_from_path(subpath)
        self.shape = self.dataset._tensors[self.subpath].get_shape(self.slice_)
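
The loop above normalizes a mixed list of ints and slices into per-dimension lengths, offsets, and squeeze flags. Below is a standalone sketch of that normalization with a worked example; the helper name normalize_slice and the first_dim_len parameter are assumptions standing in for self.dataset.shape[0].

def normalize_slice(slice_, first_dim_len):
    nums, offsets, squeeze_dims = [], [], []
    for it in slice_:
        if isinstance(it, int):
            nums.append(1)             # an int selects exactly one element
            offsets.append(it)
            squeeze_dims.append(True)  # and that dimension is dropped
        elif isinstance(it, slice):
            ofs = it.start or 0
            nums.append(it.stop - ofs if it.stop else None)
            offsets.append(ofs)
            squeeze_dims.append(False)
    # An open-ended first slice falls back to the dataset length.
    if nums[0] is None:
        nums[0] = first_dim_len - offsets[0]
    return nums, offsets, squeeze_dims

print(normalize_slice([5, slice(2, 10)], first_dim_len=100))
# ([1, 8], [5, 2], [True, False])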
Example #5
def test_exceptions():
    HubException()
    AuthenticationException()
    AuthorizationException(Response())
    AuthorizationException(Response(noerror=True))
    NotFoundException()
    BadRequestException(Response())
    BadRequestException(Response(noerror=True))
    OverLimitException()
    ServerException()
    BadGatewayException()
    GatewayTimeoutException()
    WaitTimeoutException()
    LockedException()
    HubDatasetNotFoundException("Hello")
    PermissionException("Hello")
    ShapeLengthException()
    ShapeArgumentNotFoundException()
    SchemaArgumentNotFoundException()
    ValueShapeError("Shape 1", "Shape 2")
    NoneValueException("Yahoo!")
    ModuleNotInstalledException("my_module")
    WrongUsernameException("usernameX")
    NotHubDatasetToOverwriteException()
    NotHubDatasetToAppendException()
    DynamicTensorNotFoundException()

    DynamicTensorShapeException("none")
    DynamicTensorShapeException("length")
    DynamicTensorShapeException("not_equal")
    DynamicTensorShapeException("another_cause")
Example #6
    def __init__(
        self,
        url: str,
        mode: str = "a",
        safe_mode: bool = False,
        shape=None,
        schema=None,
        token=None,
        fs=None,
        fs_map=None,
        cache: int = 2**26,
        storage_cache: int = 2**28,
        lock_cache=True,
        tokenizer=None,
    ):
        """| Open a new or existing dataset for read/write

        Parameters
        ----------
        url: str
            The url where dataset is located/should be created
        mode: str, optional (default "a")
            Mode in which the dataset is opened, following the usual Python file-mode convention (e.g. "r", "w", "a")
        safe_mode: bool, optional
            If the dataset already exists it cannot be overwritten in safe mode; otherwise writing is allowed when it is created for the first time
        shape: tuple, optional
            Tuple with (num_samples,) format, where num_samples is number of samples
        schema: optional
            Describes the data of a single sample. Hub schemas are used for that
            Required for 'a' and 'w' modes
        token: str or dict, optional
            If the url refers to a location where authorization is required,
            token is the parameter used to pass the credentials; it can be a filepath or a dict
        fs: optional
        fs_map: optional
        cache: int, optional
            Size of the memory cache. Default is 64MB (2**26)
            if 0, False or None, then cache is not used
        storage_cache: int, optional
            Size of the storage cache. Default is 256MB (2**28)
            if 0, False or None, then storage cache is not used
        lock_cache: bool, optional
            Lock the cache to avoid multiprocessing errors
        """

        shape = shape or (None, )
        if isinstance(shape, int):
            shape = [shape]
        if shape is not None:
            if len(tuple(shape)) != 1:
                raise ShapeLengthException
        if mode is None:
            raise NoneValueException("mode")

        if not cache:
            storage_cache = False

        self.url = url
        self.token = token
        self.mode = mode
        self.tokenizer = tokenizer

        self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
            self.url, token=token))
        self.cache = cache
        self._storage_cache = storage_cache
        self.lock_cache = lock_cache
        self.version = "1.x"

        needcreate = self._check_and_prepare_dir()
        fs_map = fs_map or get_storage_map(self._fs,
                                           self._path,
                                           cache,
                                           lock=lock_cache,
                                           storage_cache=storage_cache)
        self._fs_map = fs_map

        if safe_mode and not needcreate:
            mode = "r"
        self.username = None
        self.dataset_name = None
        if not needcreate:
            self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
            self.shape = tuple(self.meta["shape"])
            self.schema = hub.schema.deserialize.deserialize(
                self.meta["schema"])
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._open_storage_tensors())
        else:
            if shape[0] is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            try:
                if shape is None:
                    raise ShapeArgumentNotFoundException()
                if schema is None:
                    raise SchemaArgumentNotFoundException()
                self.schema: HubSchema = featurify(schema)
                self.shape = tuple(shape)
                self.meta = self._store_meta()
                self._flat_tensors = tuple(flatten(self.schema))
                self._tensors = dict(self._generate_storage_tensors())
                self.flush()
            except Exception as e:
                try:
                    self.close()
                except Exception:
                    pass
                self._fs.rm(self._path, recursive=True)
                logger.error("Deleting the dataset " + traceback.format_exc() +
                             str(e))
                raise

        if needcreate and (self._path.startswith("s3://snark-hub-dev/")
                           or self._path.startswith("s3://snark-hub/")):
            subpath = self._path[5:]
            spl = subpath.split("/")
            if len(spl) < 4:
                raise ValueError("Invalid Path for dataset")
            self.username = spl[-2]
            self.dataset_name = spl[-1]
            HubControlClient().create_dataset_entry(self.username,
                                                    self.dataset_name,
                                                    self.meta)
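
To tie the parameters together, here is a hedged usage sketch for creating a new dataset with this constructor; the schema classes, the local path, and the field names are illustrative assumptions, not taken from the snippet above.

import hub
from hub.schema import Image, ClassLabel

ds = hub.Dataset(
    "./data/mnist_demo",   # local url; remote storage urls are also accepted
    mode="w",              # create or overwrite
    shape=(1000,),         # one-element tuple: the number of samples
    schema={
        "image": Image(shape=(28, 28, 1), dtype="uint8"),
        "label": ClassLabel(num_classes=10),
    },
)
ds.flush()                 # persist metadata, as the constructor does on creation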