Example #1
    def _maybe_log_gcs_data_dir(self):
        """If data is on GCS, set _data_dir to GCS path."""
        if not gcs_utils.is_dataset_on_gcs(self.info.full_name):
            return

        gcs_path = os.path.join(constants.GCS_DATA_DIR, self.info.full_name)
        msg = GCS_HOSTED_MSG.format(name=self.name,
                                    gcs_path=gcs_path,
                                    local_data_dir_no_version=os.path.split(
                                        self._data_dir)[0])
        logging.info(msg)
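The helper above only logs a hint when a prepared copy of the dataset already exists on GCS. A standalone sketch of the same idea, with hypothetical constants standing in for tfds' `constants.GCS_DATA_DIR` and `GCS_HOSTED_MSG`:

```python
import logging
import os

# Hypothetical stand-ins for tfds' constants.GCS_DATA_DIR and GCS_HOSTED_MSG.
GCS_DATA_DIR = "gs://tfds-data/datasets"
GCS_HOSTED_MSG = (
    "Dataset {name} is hosted on GCS. You can read it directly from "
    "{gcs_path} instead of generating it under {local_data_dir_no_version}.")


def maybe_log_gcs_data_dir(name, full_name, local_data_dir, is_on_gcs):
    """Log a hint when a prepared copy of the dataset already exists on GCS."""
    if not is_on_gcs:  # stand-in for gcs_utils.is_dataset_on_gcs(full_name)
        return
    gcs_path = os.path.join(GCS_DATA_DIR, full_name)
    logging.info(GCS_HOSTED_MSG.format(
        name=name,
        gcs_path=gcs_path,
        local_data_dir_no_version=os.path.split(local_data_dir)[0]))
```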
Example #2
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                               directory=self._data_dir_root):
            raise IOError("Not enough disk space. Needed: %s" %
                          units.size_str(self.info.size_in_bytes))
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or download_config.compute_stats
                            == download.ComputeStatsMode.AUTO
                            and bool(self.info.splits.total_num_examples)):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.size_in_bytes = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
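For reference, a typical call into this API might look like the sketch below; `"mnist"` is only an illustrative dataset name, and the default `DownloadConfig` reuses an existing dataset if one is found:

```python
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare(
    download_dir="~/tensorflow-datasets/downloads",  # where raw downloads are cached (the default)
    download_config=tfds.download.DownloadConfig(),  # default mode: reuse dataset if it exists
)
ds = builder.as_dataset(split="train")
```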
Example #3
    def is_dataset_accessible(self):
        # Re-enable GCS access. TestCase disables it.
        with self.gcs_access():
            self.assertTrue(gcs_utils.is_dataset_on_gcs("mnist/1.0.0"))
Example #4
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        # Disable `download_and_prepare` on Python 2 (internally, we still
        # allow Py2 for `dataset_builder_tests.py` and related tests).
        if _is_py2_download_and_prepare_disabled and six.PY2:
            raise NotImplementedError(
                "TFDS has dropped `builder.download_and_prepare` support for "
                "Python 2. Please update your code to Python 3.")

        if self.version.tfds_version_to_prepare:
            available_to_prepare = ", ".join(
                str(v) for v in self.versions if not v.tfds_version_to_prepare)
            raise AssertionError(
                "The version of the dataset you are trying to use ({}:{}) can only "
                "be generated using TFDS code synced @ {} or earlier. Either sync to "
                "that version of TFDS to first prepare the data or use another "
                "version of the dataset (available for `download_and_prepare`: "
                "{}).".format(self.name, self.version,
                              self.version.tfds_version_to_prepare,
                              available_to_prepare))

        # Only `cls.VERSION` or `experimental_latest` versions can be generated.
        # Otherwise, users may accidentally generate an old version using the
        # code from newer versions.
        installable_versions = {
            str(v)
            for v in (self.canonical_version, max(self.versions))
        }
        if str(self.version) not in installable_versions:
            msg = (
                "The version of the dataset you are trying to use ({}) is too "
                "old for this version of TFDS so cannot be generated.").format(
                    self.info.full_name)
            if self.version.tfds_version_to_prepare:
                msg += (
                    "{} can only be generated using TFDS code synced @ {} or earlier "
                    "Either sync to that version of TFDS to first prepare the data or "
                    "use another version of the dataset. ").format(
                        self.version, self.version.tfds_version_to_prepare)
            else:
                msg += (
                    "Either sync to a previous version of TFDS to first prepare the "
                    "data or use another version of the dataset. ")
            msg += "Available for `download_and_prepare`: {}".format(
                list(sorted(installable_versions)))
            raise ValueError(msg)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))

        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(
                self.info.dataset_size + self.info.download_size,
                directory=self._data_dir_root):
            raise IOError(
                "Not enough disk space. Needed: {} (download: {}, generated: {})"
                .format(
                    units.size_str(self.info.dataset_size +
                                   self.info.download_size),
                    units.size_str(self.info.download_size),
                    units.size_str(self.info.dataset_size),
                ))
        self._log_download_bytes()

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Skip statistics computation if tfdv isn't present
                    try:
                        import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
                        skip_stats_computation = False
                    except ImportError:
                        skip_stats_computation = True

                    splits = list(self.info.splits.values())
                    statistics_already_computed = bool(
                        splits and splits[0].statistics.num_examples)
                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (skip_stats_computation or download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or download_config.compute_stats
                            == download.ComputeStatsMode.AUTO
                            and statistics_already_computed):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.download_size = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
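A hedged sketch of driving this variant with an explicit `DownloadConfig`; note that `ComputeStatsMode` and the statistics pass only exist in the older TFDS releases this snippet comes from, so treat the exact fields as version-specific:

```python
import tensorflow_datasets as tfds

# Version-specific sketch: reuse cached downloads and skip the statistics pass.
config = tfds.download.DownloadConfig(
    download_mode=tfds.download.GenerateMode.REUSE_CACHE_IF_EXISTS,
    compute_stats=tfds.download.ComputeStatsMode.SKIP,  # only in older TFDS releases
)
tfds.builder("mnist").download_and_prepare(download_config=config)
```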
Example #5
    def test_is_dataset_accessible(self):
        # Re-enable GCS access. TestCase disables it.
        with self.gcs_access():
            is_ds_on_gcs = gcs_utils.is_dataset_on_gcs('mnist/1.0.0')
            self.assertTrue(is_ds_on_gcs)
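These tests exercise the internal `gcs_utils.is_dataset_on_gcs` check directly; from user code the same GCS path is normally reached through the public `try_gcs` flag, roughly as below:

```python
import tensorflow_datasets as tfds

# Reads the prepared copy straight from the public GCS bucket when one exists,
# so nothing has to be generated locally.
ds = tfds.load("mnist", split="test", try_gcs=True)
```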
Example #6
def load(name,
         split=None,
         data_dir=None,
         batch_size=None,
         shuffle_files=False,
         download=True,
         as_supervised=False,
         decoders=None,
         read_config=None,
         with_info=False,
         builder_kwargs=None,
         download_and_prepare_kwargs=None,
         as_dataset_kwargs=None,
         try_gcs=False):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  If `split=None` (the default), returns all splits for the dataset. Otherwise,
  returns the specified split.

  `load` is a convenience method that fetches the `tfds.core.DatasetBuilder` by
  string name, optionally calls `DatasetBuilder.download_and_prepare`
  (if `download=True`), and then calls `DatasetBuilder.as_dataset`.
  This is roughly equivalent to:

  ```
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    builder.download_and_prepare(**download_and_prepare_kwargs)
  ds = builder.as_dataset(
      split=split, as_supervised=as_supervised, **as_dataset_kwargs)
  if with_info:
    return ds, builder.info
  return ds
  ```

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  Callers must pass arguments as keyword arguments.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: `tfds.Split` or `str`, which split of the data to load. If None,
      will return a `dict` with all splits (typically `tfds.Split.TRAIN` and
      `tfds.Split.TEST`).
    data_dir: `str` (optional), directory to read/write data.
      Defaults to "~/tensorflow_datasets".
    batch_size: `int`, if set, add a batch dimension to examples. Note that
      variable length features will be 0-padded. If
      `batch_size=-1`, will return the full dataset as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files.
      Defaults to `False`.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare`
      before calling `tfds.core.DatasetBuilder.as_dataset`. If `False`, data is
      expected to be in `data_dir`. If `True` and the data is already in
      `data_dir`, `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
      will have a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default,
      the returned `tf.data.Dataset` will have a dictionary with all the
      features.
    decoders: Nested dict of `Decoder` objects which allow customizing the
      decoding. The structure should match the feature structure, but only
      customized feature keys need to be present. See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
      for more info.
    read_config: `tfds.ReadConfig`, Additional options to configure the
      input pipeline (e.g. seed, num parallel reads,...).
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed
      through by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allows
      controlling where the data is downloaded and extracted. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  # pylint: enable=line-too-long

  name, name_builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
  name_builder_kwargs.update(builder_kwargs or {})
  builder_kwargs = name_builder_kwargs

  # Set data_dir
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = constants.GCS_DATA_DIR
  elif data_dir is None:
    data_dir = constants.DATA_DIR

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs.setdefault("split", split)
  as_dataset_kwargs.setdefault("as_supervised", as_supervised)
  as_dataset_kwargs.setdefault("batch_size", batch_size)
  as_dataset_kwargs.setdefault("decoders", decoders)
  as_dataset_kwargs.setdefault("shuffle_files", shuffle_files)
  as_dataset_kwargs.setdefault("read_config", read_config)

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
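A common way to call this version of `tfds.load`, returning `(input, label)` pairs plus the dataset metadata (dataset name chosen only for illustration):

```python
import tensorflow_datasets as tfds

ds, info = tfds.load(
    "mnist",
    split="train",
    as_supervised=True,   # (image, label) tuples instead of feature dicts
    shuffle_files=True,
    with_info=True,
)
print(info.features)  # feature spec documents the whole dataset, not just "train"
for image, label in tfds.as_numpy(ds.take(1)):
    print(image.shape, label)
```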
Example #7
def load(name,
         split=None,
         data_dir=None,
         batch_size=1,
         in_memory=None,
         download=True,
         as_supervised=False,
         with_info=False,
         builder_kwargs=None,
         download_and_prepare_kwargs=None,
         as_dataset_kwargs=None,
         try_gcs=False):
    """Loads the named dataset into a `tf.data.Dataset`.

  If `split=None` (the default), returns all splits for the dataset. Otherwise,
  returns the specified split.

  `load` is a convenience method that fetches the `tfds.core.DatasetBuilder` by
  string name, optionally calls `DatasetBuilder.download_and_prepare`
  (if `download=True`), and then calls `DatasetBuilder.as_dataset`.
  This is roughly equivalent to:

  ```
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    builder.download_and_prepare(**download_and_prepare_kwargs)
  ds = builder.as_dataset(
      split=split, as_supervised=as_supervised, **as_dataset_kwargs)
  if with_info:
    return ds, builder.info
  return ds
  ```

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  Callers must pass arguments as keyword arguments.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: `tfds.Split` or `str`, which split of the data to load. If None,
      will return a `dict` with all splits (typically `tfds.Split.TRAIN` and
      `tfds.Split.TEST`).
    data_dir: `str` (optional), directory to read/write data.
      Defaults to "~/tensorflow_datasets".
    batch_size: `int`, set to > 1 to get batches of examples. Note that
      variable length features will be 0-padded. If
      `batch_size=-1`, will return the full dataset as `tf.Tensor`s.
    in_memory: `bool`, if `True`, loads the dataset in memory which
      increases iteration speeds. Note that if `True` and the dataset has
      unknown dimensions, the features will be padded to the maximum
      size across the dataset. By default (when `None`), will load the
      dataset in memory if the size is <1GB and all feature dimensions are
      statically known.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare`
      before calling `tfds.core.DatasetBuilder.as_dataset`. If `False`, data is
      expected to be in `data_dir`. If `True` and the data is already in
      `data_dir`, `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
      will have a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default,
      the returned `tf.data.Dataset` will have a dictionary with all the
      features.
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed
      through by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allows
      controlling where the data is downloaded and extracted. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`. `split` will be passed through by
      default. Example: `{'shuffle_files': True}`.
      Note that shuffle_files is False by default unless
      `split == tfds.Split.TRAIN`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
    name, name_builder_kwargs = _dataset_name_and_kwargs_from_name_str(name)
    name_builder_kwargs.update(builder_kwargs or {})
    builder_kwargs = name_builder_kwargs

    # Set data_dir
    if try_gcs and gcs_utils.is_dataset_on_gcs(name):
        data_dir = constants.GCS_DATA_DIR
    elif data_dir is None:
        data_dir = constants.DATA_DIR

    dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
    if download:
        download_and_prepare_kwargs = download_and_prepare_kwargs or {}
        dbuilder.download_and_prepare(**download_and_prepare_kwargs)

    if as_dataset_kwargs is None:
        as_dataset_kwargs = {}
    as_dataset_kwargs = dict(as_dataset_kwargs)
    as_dataset_kwargs["split"] = split
    as_dataset_kwargs["as_supervised"] = as_supervised
    as_dataset_kwargs["batch_size"] = batch_size
    as_dataset_kwargs["in_memory"] = in_memory

    ds = dbuilder.as_dataset(**as_dataset_kwargs)
    if with_info:
        return ds, dbuilder.info
    return ds
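This older signature defaults `batch_size` to 1 and routes file shuffling through `as_dataset_kwargs`; a rough sketch of a call against it:

```python
import tensorflow_datasets as tfds

# Older-API sketch: batch explicitly and request shuffled input files.
ds = tfds.load(
    "mnist",
    split=tfds.Split.TRAIN,
    batch_size=32,
    as_dataset_kwargs={"shuffle_files": True},
)
```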
Example #8
def load(
    name: str,
    *,
    split: Optional[Tree[splits_lib.Split]] = None,
    data_dir: Optional[str] = None,
    batch_size: Optional[int] = None,
    shuffle_files: bool = False,
    download: bool = True,
    as_supervised: bool = False,
    decoders: Optional[TreeDict[decode.Decoder]] = None,
    read_config: Optional[read_config_lib.ReadConfig] = None,
    with_info: bool = False,
    builder_kwargs: Optional[Dict[str, Any]] = None,
    download_and_prepare_kwargs: Optional[Dict[str, Any]] = None,
    as_dataset_kwargs: Optional[Dict[str, Any]] = None,
    try_gcs: bool = False,
):
  # pylint: disable=line-too-long
  """Loads the named dataset into a `tf.data.Dataset`.

  `tfds.load` is a convenience method that:

  1. Fetch the `tfds.core.DatasetBuilder` by name:

  ```python
  builder = tfds.builder(name, data_dir=data_dir, **builder_kwargs)
  ```

  2. Generate the data (when `download=True`):

  ```python
  builder.download_and_prepare(**download_and_prepare_kwargs)
  ```

  3. Load the `tf.data.Dataset` object:

  ```python
  ds = builder.as_dataset(
      split=split,
      as_supervised=as_supervised,
      shuffle_files=shuffle_files,
      read_config=read_config,
      decoders=decoders,
      **as_dataset_kwargs,
  )
  ```

  See: https://www.tensorflow.org/datasets/overview#load_a_dataset for more
  examples.

  If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s,
  you can pass the return value to `tfds.as_numpy`.

  **Warning**: calling this function might potentially trigger the download
  of hundreds of GiB to disk. Refer to the `download` argument.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the snake case
      version of the class name). This can be either `"dataset_name"` or
      `"dataset_name/config_name"` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `"foo_bar/a=True,b=3"` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `"foo_bar/zoo/a=True,b=3"` to
      use the `"zoo"` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    split: Which split of the data to load (e.g. `'train'`, `'test'`,
      `['train', 'test']`, `'train[80%:]'`,...). See our
      [split API guide](https://www.tensorflow.org/datasets/splits).
      If `None`, will return all splits in a `Dict[Split, tf.data.Dataset]`.
    data_dir: `str`, directory to read/write data. Defaults to the value of
      the environment variable TFDS_DATA_DIR, if set, otherwise falls back to
      "~/tensorflow_datasets".
    batch_size: `int`, if set, add a batch dimension to examples. Note that
      variable length features will be 0-padded. If
      `batch_size=-1`, will return the full dataset as `tf.Tensor`s.
    shuffle_files: `bool`, whether to shuffle the input files.
      Defaults to `False`.
    download: `bool` (optional), whether to call
      `tfds.core.DatasetBuilder.download_and_prepare`
      before calling `tfds.core.DatasetBuilder.as_dataset`. If `False`, data is
      expected to be in `data_dir`. If `True` and the data is already in
      `data_dir`, `download_and_prepare` is a no-op.
    as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
      will have a 2-tuple structure `(input, label)` according to
      `builder.info.supervised_keys`. If `False`, the default,
      the returned `tf.data.Dataset` will have a dictionary with all the
      features.
    decoders: Nested dict of `Decoder` objects which allow customizing the
      decoding. The structure should match the feature structure, but only
      customized feature keys need to be present. See
      [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
      for more info.
    read_config: `tfds.ReadConfig`, Additional options to configure the
      input pipeline (e.g. seed, num parallel reads,...).
    with_info: `bool`, if True, tfds.load will return the tuple
      (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated
      with the builder.
    builder_kwargs: `dict` (optional), keyword arguments to be passed to the
      `tfds.core.DatasetBuilder` constructor. `data_dir` will be passed
      through by default.
    download_and_prepare_kwargs: `dict` (optional) keyword arguments passed to
      `tfds.core.DatasetBuilder.download_and_prepare` if `download=True`. Allows
      controlling where the data is downloaded and extracted. If not set,
      cache_dir and manual_dir will automatically be deduced from data_dir.
    as_dataset_kwargs: `dict` (optional), keyword arguments passed to
      `tfds.core.DatasetBuilder.as_dataset`.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.

  Returns:
    ds: `tf.data.Dataset`, the dataset requested, or if `split` is None, a
      `dict<key: tfds.Split, value: tf.data.Dataset>`. If `batch_size=-1`,
      these will be full datasets as `tf.Tensor`s.
    ds_info: `tfds.core.DatasetInfo`, if `with_info` is True, then `tfds.load`
      will return a tuple `(ds, ds_info)` containing dataset information
      (version, features, splits, num_examples,...). Note that the `ds_info`
      object documents the entire dataset, regardless of the `split` requested.
      Split-specific information is available in `ds_info.splits`.
  """
  # pylint: enable=line-too-long
  if builder_kwargs is None:
    builder_kwargs = {}

  # Set data_dir
  if try_gcs and gcs_utils.is_dataset_on_gcs(name):
    data_dir = gcs_utils.gcs_path("datasets")

  dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
  if download:
    download_and_prepare_kwargs = download_and_prepare_kwargs or {}
    dbuilder.download_and_prepare(**download_and_prepare_kwargs)

  if as_dataset_kwargs is None:
    as_dataset_kwargs = {}
  as_dataset_kwargs = dict(as_dataset_kwargs)
  as_dataset_kwargs.setdefault("split", split)
  as_dataset_kwargs.setdefault("as_supervised", as_supervised)
  as_dataset_kwargs.setdefault("batch_size", batch_size)
  as_dataset_kwargs.setdefault("decoders", decoders)
  as_dataset_kwargs.setdefault("shuffle_files", shuffle_files)
  as_dataset_kwargs.setdefault("read_config", read_config)

  ds = dbuilder.as_dataset(**as_dataset_kwargs)
  if with_info:
    return ds, dbuilder.info
  return ds
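With the keyword-only signature above, split slicing, `try_gcs`, and full in-memory loading via `batch_size=-1` combine naturally; a minimal sketch:

```python
import tensorflow_datasets as tfds

# Last 20% of the train split, pulled from GCS if a prepared copy exists there;
# batch_size=-1 returns the whole slice as a single batch of tf.Tensors, so
# tfds.as_numpy yields a dict of NumPy arrays.
ds = tfds.load("mnist", split="train[80%:]", batch_size=-1, try_gcs=True)
images = tfds.as_numpy(ds)["image"]
```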
Example #9
def builder(name: str,
            *,
            try_gcs: bool = False,
            **builder_kwargs: Any) -> dataset_builder.DatasetBuilder:
    """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_kwargs: `dict` of keyword arguments passed to the
      `tfds.core.DatasetBuilder`.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
    # 'kaggle:my_dataset:1.0.0' -> ('kaggle', 'my_dataset', {'version': '1.0.0'})
    ns_name, builder_name, builder_kwargs = naming.parse_builder_name_kwargs(
        name, **builder_kwargs)

    # `try_gcs` currently only supports non-community datasets.
    if try_gcs and not ns_name and gcs_utils.is_dataset_on_gcs(builder_name):
        data_dir = builder_kwargs.get('data_dir')
        if data_dir:
            raise ValueError(
                f'Cannot have both `try_gcs=True` and `data_dir={data_dir}` '
                'explicitly set')
        builder_kwargs['data_dir'] = gcs_utils.gcs_path('datasets')

    # Community datasets
    if ns_name:
        raise NotImplementedError

    # First check whether code exists or not (imported datasets)
    try:
        cls = builder_cls(builder_name)
    except registered.DatasetNotFoundError as e:
        cls = None  # Class not found
        not_found_error = e  # Save the exception to reraise later

    # Try loading from files first when applicable
    if _try_load_from_files_first(cls, **builder_kwargs):
        try:
            b = read_only_builder.builder_from_files(builder_name,
                                                     **builder_kwargs)
            return b
        except registered.DatasetNotFoundError as e:
            pass

    # If code exists and loading from files was skipped (e.g. files not found),
    # load from the source code.
    if cls:
        with py_utils.try_reraise(
                prefix=f'Failed to construct dataset {name}: '):
            return cls(**builder_kwargs)  # pytype: disable=not-instantiable

    # If neither the code nor the files are found, raise DatasetNotFoundError
    raise not_found_error
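A minimal sketch of fetching a builder through this API; the metadata in `builder.info` is available before any `as_dataset` call, and (per the check above) an explicit `data_dir` cannot be combined with `try_gcs=True`:

```python
import tensorflow_datasets as tfds

b = tfds.builder("mnist", try_gcs=True)  # data_dir resolves to the public GCS bucket
print(b.info.splits)                     # split names and example counts
```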
Example #10
File: load.py  Project: jeremyd85/datasets
def builder(
    name: str,
    *,
    data_dir: Optional[str] = None,
    try_gcs: bool = False,
    **builder_init_kwargs: Any
) -> dataset_builder.DatasetBuilder:
  """Fetches a `tfds.core.DatasetBuilder` by string name.

  Args:
    name: `str`, the registered name of the `DatasetBuilder` (the class name
      as camel or snake case: `MyDataset` or `my_dataset`).
      This can be either `'dataset_name'` or
      `'dataset_name/config_name'` for datasets with `BuilderConfig`s.
      As a convenience, this string may contain comma-separated keyword
      arguments for the builder. For example `'foo_bar/a=True,b=3'` would use
      the `FooBar` dataset passing the keyword arguments `a=True` and `b=3`
      (for builders with configs, it would be `'foo_bar/zoo/a=True,b=3'` to
      use the `'zoo'` config and pass to the builder keyword arguments `a=True`
      and `b=3`).
    data_dir: Path to the dataset(s). See `tfds.load` for more information.
    try_gcs: `bool`, if True, tfds.load will see if the dataset exists on
      the public GCS bucket before building it locally.
    **builder_init_kwargs: `dict` of keyword arguments passed to the
      `DatasetBuilder`. These will override keyword arguments passed in `name`,
      if any.

  Returns:
    A `tfds.core.DatasetBuilder`.

  Raises:
    DatasetNotFoundError: if `name` is unrecognized.
  """
  builder_name, builder_kwargs = dataset_name_and_kwargs_from_name_str(name)
  # Set data_dir.
  if try_gcs and gcs_utils.is_dataset_on_gcs(builder_name):
    data_dir = gcs_utils.gcs_path("datasets")

  # Try loading the code (if it exists)
  try:
    cls = builder_cls(builder_name)
  except DatasetNotFoundError as e:
    if e.is_abstract:
      raise  # Abstract builders can't be instantiated from code or from files.
    cls = None  # Class not found
    not_found_error = e  # Save the exception to reraise later

  version_explicitly_given = "version" in builder_kwargs

  # Try loading from files first:
  # * If code not present.
  # * If version explicitly given (backward/forward compatibility).
  # Note: If `builder_init_kwargs` are set (e.g. version='experimental_latest',
  # custom config,...), read from generation code.
  if (not cls or version_explicitly_given) and not builder_init_kwargs:
    builder_dir = find_builder_dir(name, data_dir=data_dir)
    if builder_dir is not None:  # A generated dataset was found on disk
      return read_only_builder.builder_from_directory(builder_dir)

  # If loading from files was skipped (e.g. files not found), load from the
  # source code.
  if cls:
    with py_utils.try_reraise(prefix=f"Failed to construct dataset {name}: "):
      return cls(data_dir=data_dir, **builder_kwargs, **builder_init_kwargs)  # pytype: disable=not-instantiable

  # If neither the code nor the files are found, raise DatasetNotFoundError
  raise not_found_error
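Because this variant prefers reading from files whenever a version is pinned in the name string, an explicitly versioned request can load a dataset that was generated by older TFDS code; the version below is purely illustrative:

```python
import tensorflow_datasets as tfds

# "name:version" pins the version; builder() then reads the already-generated
# files from disk (read_only_builder) rather than the current generation code.
b = tfds.builder("mnist:3.*.*")
ds = b.as_dataset(split="test")
```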