Example No. 1
 def test_mnist(self):
     with self.gcs_access():
         mnist = tfds.image_classification.MNIST(
             data_dir=gcs_utils.gcs_path('datasets'))
         example = next(
             tfds.as_numpy(mnist.as_dataset(split='train').take(1)))
     _ = example['image'], example['label']
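
The same GCS-backed path is exposed through the public API. A minimal sketch, assuming the prepared `mnist` dataset is available in the public bucket:

import tensorflow_datasets as tfds

# try_gcs=True reads the already-generated dataset from the public GCS
# bucket instead of downloading and preparing it locally.
ds = tfds.load('mnist', split='train', try_gcs=True)
example = next(iter(tfds.as_numpy(ds.take(1))))
image, label = example['image'], example['label']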
Example No. 2
 def initialize_from_bucket(self):
   """Initialize DatasetInfo from GCS bucket info files."""
   # In order to support Colab, we use the HTTP GCS API to access the metadata
   # files. They are copied locally and then loaded.
   tmp_dir = tempfile.mkdtemp("tfds")
   data_files = gcs_utils.gcs_dataset_info_files(self.full_name)
   if not data_files:
     return
   logging.info("Load pre-computed DatasetInfo (eg: splits, num examples,...) "
                "from GCS: %s", self.full_name)
   for fname in data_files:
     out_fname = os.path.join(tmp_dir, os.path.basename(fname))
     tf.io.gfile.copy(gcs_utils.gcs_path(fname), out_fname)
   self.read_from_directory(tmp_dir)
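
This method is not usually called directly; it runs as part of builder construction. A hedged sketch of the observable effect (dataset name is illustrative):

import tensorflow_datasets as tfds

# Constructing a builder initializes its DatasetInfo; when the dataset
# has not been generated locally, TFDS can fall back to the pre-computed
# metadata files on GCS loaded by the method above.
builder = tfds.builder('mnist')
info = builder.info
print(info.splits['train'].num_examples)  # Pre-computed split sizes.
print(info.features)                      # Feature structure.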
Example No. 3
def load(
    name: str,
    *,
    split: Optional[Tree[splits_lib.Split]] = None,
    data_dir: Optional[str] = None,
    batch_size: Optional[int] = None,
    shuffle_files: bool = False,
    download: bool = True,
    as_supervised: bool = False,
    decoders: Optional[TreeDict[decode.Decoder]] = None,
    read_config: Optional[read_config_lib.ReadConfig] = None,
    with_info: bool = False,
    builder_kwargs: Optional[Dict[str, Any]] = None,
    download_and_prepare_kwargs: Optional[Dict[str, Any]] = None,
    as_dataset_kwargs: Optional[Dict[str, Any]] = None,
    try_gcs: bool = False,
):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  `tfds.load` is a convenience method that:

  1. Fetches the `tfds.core.DatasetBuilder` by name:

  ```python
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  ```

  2. Generates the data (when `download=True`):

  ```python
  builder.download_and_prepare(**download_and_prepare_kwargs)
  ```

  3. Loads the `tf.data.Dataset` object:

  ```python
  ds = builder.as_dataset(
      split=split,
      as_supervised=as_supervised,
      shuffle_files=shuffle_files,
      read_config=read_config,
      decoders=decoders,
      **as_dataset_kwargs,
  )
  ```

  See: https://www.tensorflow.org/datasets/overview#load_a_dataset for more
  examples.

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  **Warning**: calling this function might trigger the download of hundreds
  of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: Which split of the data to load (e.g. `'train'`, `'test'`,
      `['train', 'test']`, `'train[80%:]'`,...). See our
      [split API guide](https://www.tensorflow.org/datasets/splits).
      If `None`, will return all splits in a `Dict[Split, tf.data.Dataset]`.
    data_dir: `str`, directory to read/write data. Defaults to the value of
      the environment variable TFDS_DATA_DIR, if set, otherwise falls back to
      "~/tensorflow_datasets".
    batch_size: `int`, if set, add a batch dimension to examples. Note that
      variable-length features will be 0-padded. If `batch_size=-1`, the
      full dataset is returned as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files.
      Defaults to `False`.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare`
      before calling `tfds.core.DatasetBuilder.as_dataset`. If `False`, data
      is expected to be in `data_dir`. If `True` and the data is already in
      `data_dir`, `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
      will have a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default,
      the returned `tf.data.Dataset` will have a dictionary with all the
      features.
    decoders: Nested dict of `Decoder` objects which allow customizing the
      decoding. The structure should match the feature structure, but only
      customized feature keys need to be present. See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
      for more info.
    read_config: `tfds.ReadConfig`, additional options to configure the
      input pipeline (e.g. seed, num parallel reads,...).
    with_info: `bool`, if `True`, `tfds.load` will return the tuple
      `(tf.data.Dataset, tfds.core.DatasetInfo)` containing the info
      associated with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed
      through by default.
    download_and_prepare_kwargs: `dict` (optional), keyword arguments passed
      to `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`.
      These allow controlling where to download and extract the cached data.
      If not set, `cache_dir` and `manual_dir` are deduced automatically from
      `data_dir`.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if `True`, `tfds.load` will check whether the dataset
      exists on the public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  # pylint: enable=line-too-long
  if builder_kwargs is None:
    builder_kwargs = {}

  # Set data_dir
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = gcs_utils.gcs_path("datasets")

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs.setdefault("split", split)
  as_dataset_kwargs.setdefault("as_supervised", as_supervised)
  as_dataset_kwargs.setdefault("batch_size", batch_size)
  as_dataset_kwargs.setdefault("decoders", decoders)
  as_dataset_kwargs.setdefault("shuffle_files", shuffle_files)
  as_dataset_kwargs.setdefault("read_config", read_config)

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
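
A short usage sketch of the call paths documented above (dataset name is illustrative):

import tensorflow_datasets as tfds

# with_info=True returns a (dataset, info) tuple; as_supervised=True
# makes the dataset yield (input, label) tuples.
ds, info = tfds.load(
    'mnist',
    split='train',
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
print(info.splits['train'].num_examples)

# batch_size=-1 returns the full split as tf.Tensors.
full_test = tfds.load('mnist', split='test', batch_size=-1)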
Example No. 4
def builder(name: str,
            *,
            try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
    """Fetches a `tfds.core.DatasetBuilder` by string name.

    Args:
      name: `str`, the registered name of the `DatasetBuilder` (the class name
        as camel or snake case: `MyDataset` or `my_dataset`).
        This can be either `'dataset_name'` or
        `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
        As a convenience, this string may contain comma-separated keyword
        arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
        the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
        (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
        use the `'zoo'` config and pass to the builder keyword arguments
        `a=True` and `b=3`).
      try_gcs: `bool`, if `True`, check whether the dataset exists on the
        public GCS bucket before building it locally.
      **builder_kwargs: `dict` of keyword arguments passed to the
        `tfds.core.DatasetBuilder`.

    Returns:
      A `tfds.core.DatasetBuilder`.

    Raises:
      DatasetNotFoundError: if `name` is unrecognized.
    """
    # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
    ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)

    # `try_gcs` currently only supports non-community datasets
    if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
        data_dir = builder_kwargs.get('data_dir')
        if data_dir:
            raise ValueError(
                f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
                'explicitly set')
        builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

    # Community datasets
    if ns_name:
        raise NotImplementedError

    # First check whether code exists or not (imported datasets)
    try:
        cls = builder_cls(builder_name)
    except registered.DatasetNotFoundError as e:
        cls = None  # Class not found
        not_found_error = e  # Save the exception to reraise later

    # When appropriate, try loading from files first
    if _try_load_from_files_first(cls, **builder_kwargs):
        try:
            b = read_only_builder.builder_from_files(builder_name,
                                                     **builder_kwargs)
            return b
        except registered.DatasetNotFoundError:
            pass  # Fall back to constructing from source code below.

    # If code exists and loading from files was skipped (e.g. files not found),
    # load from the source code.
    if cls:
        with py_utils.try_reraise(
                prefix=f'Failed to construct dataset {name}: '):
            return cls(**builder_kwargs)  # pytype: disable=not-instantiable

    # If neither the code nor the files are found, raise DatasetNotFoundError
    raise not_found_error
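
A usage sketch exercising the lookup above (dataset name is illustrative):

import tensorflow_datasets as tfds

# Lookup by registered snake_case name; resolves to the source-code
# class or, when a generated dataset is found, a read-only file builder.
b = tfds.builder('mnist', try_gcs=True)

# Per the check above, try_gcs=True combined with an explicit data_dir
# raises ValueError:
# tfds.builder('mnist', try_gcs=True, data_dir='/tmp/tfds')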
Example No. 5
def builder(
    name: str,
    *,
    data_dir: Optional[str] = None,
    try_gcs: bool = False,
    **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    try_gcs: `bool`, if `True`, check whether the dataset exists on the
      public GCS bucket before building it locally.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in `name`,
      if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = dataset_name_and_kwargs_from_name_str(name)
  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = gcs_utils.gcs_path("datasets")

  # Try loading the code (if it exists)
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # Abstract classes can't be instantiated from code or from files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to eventually reraise

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      return cls(data_dir=data_dir, **builder_kwargs, **builder_init_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError
  raise not_found_error
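
A hedged sketch of the version handling above (the version pattern is illustrative):

import tensorflow_datasets as tfds

# Pinning a version in the name string makes this implementation prefer
# a matching generated dataset on disk (read-only builder) over the
# source code, for backward/forward compatibility.
b = tfds.builder('mnist:3.*.*')
print(b.version)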