Example #1
def _log_download_bytes(self):
    # Print is intentional: we want this to always go to stdout so the user
    # has the information needed to cancel the download/preparation.
    # This comes right before the progress bar.
    termcolor.cprint(
        "Downloading and preparing dataset {} (download: {}, generated: {}, "
        "total: {}) to {}...".format(
            self.info.full_name,
            units.size_str(self.info.download_size),
            units.size_str(self.info.dataset_size),
            units.size_str(self.info.download_size + self.info.dataset_size),
            self._data_dir,
        ),
        attrs=["bold"])
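
For context, here is how the three `units.size_str` calls above compose. The byte counts are made up for illustration, and the import path assumes the TFDS-internal `units` module:

from tensorflow_datasets.core import units

download_size = 11_594_722  # hypothetical download size (~11.06 MiB)
dataset_size = 21_000_000   # hypothetical generated size (~20.03 MiB)
print(units.size_str(download_size))                 # "11.06 MiB"
print(units.size_str(dataset_size))                  # "20.03 MiB"
print(units.size_str(download_size + dataset_size))  # "31.08 MiB"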
Example #2
def _log_download_bytes(self):
    # Print is intentional: we want this to always go to stdout so the user
    # has the information needed to cancel the download/preparation.
    # This comes right before the progress bar.
    size_text = units.size_str(self.info.size_in_bytes)
    termcolor.cprint("Downloading / extracting dataset %s (%s) to %s..." %
                     (self.name, size_text, self._data_dir),
                     attrs=["bold"])
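
The coloring itself is plain PyPI `termcolor`. A standalone illustration with a made-up dataset name, size, and path:

import termcolor

# attrs=["bold"] renders the status line in bold on terminals that support it.
termcolor.cprint(
    "Downloading / extracting dataset mnist (11.06 MiB) to /tmp/tfds...",
    attrs=["bold"])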
Example #3
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

        Args:
          download_dir: `str`, directory where downloaded files are stored.
            Defaults to "~/tensorflow-datasets/downloads".
          download_config: `tfds.download.DownloadConfig`, further configuration
            for downloading and preparing the dataset.

        Raises:
          IOError: if there is not enough disk space available.
        """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                               directory=self._data_dir_root):
            raise IOError("Not enough disk space. Needed: %s" %
                          units.size_str(self.info.size_in_bytes))
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or (download_config.compute_stats
                                == download.ComputeStatsMode.AUTO
                                and bool(self.info.splits.total_num_examples))):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet.
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.size_in_bytes = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
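
As a usage sketch (assuming the public `tensorflow_datasets` package and its bundled "mnist" builder), `download_and_prepare` is typically driven through `tfds.builder`:

import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
# No-op if the prepared data already exists (REUSE_DATASET_IF_EXISTS above);
# otherwise the source data is downloaded and the dataset is generated.
builder.download_and_prepare()
ds = builder.as_dataset(split="train")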
Example #4
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

        Args:
          download_dir: `str`, directory where downloaded files are stored.
            Defaults to "~/tensorflow-datasets/downloads".
          download_config: `tfds.download.DownloadConfig`, further configuration
            for downloading and preparing the dataset.

        Raises:
          IOError: if there is not enough disk space available.
        """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        # Disable `download_and_prepare` under Python 2 (internally, we still
        # allow Py2 for `dataset_builder_tests.py` and co.).
        if _is_py2_download_and_prepare_disabled and six.PY2:
            raise NotImplementedError(
                "TFDS has dropped `builder.download_and_prepare` support for "
                "Python 2. Please update your code to Python 3.")

        if self.version.tfds_version_to_prepare:
            available_to_prepare = ", ".join(
                str(v) for v in self.versions if not v.tfds_version_to_prepare)
            raise AssertionError(
                "The version of the dataset you are trying to use ({}:{}) can only "
                "be generated using TFDS code synced @ {} or earlier. Either sync to "
                "that version of TFDS to first prepare the data or use another "
                "version of the dataset (available for `download_and_prepare`: "
                "{}).".format(self.name, self.version,
                              self.version.tfds_version_to_prepare,
                              available_to_prepare))

        # Only `cls.VERSION` or `experimental_latest` versions can be generated.
        # Otherwise, users may accidentally generate an old version using the
        # code from newer versions.
        installable_versions = {
            str(v)
            for v in (self.canonical_version, max(self.versions))
        }
        if str(self.version) not in installable_versions:
            msg = (
                "The version of the dataset you are trying to use ({}) is too "
                "old for this version of TFDS, so it cannot be generated. "
            ).format(self.info.full_name)
            if self.version.tfds_version_to_prepare:
                msg += (
                    "{} can only be generated using TFDS code synced @ {} or "
                    "earlier. Either sync to that version of TFDS to first "
                    "prepare the data or use another version of the dataset. "
                ).format(self.version, self.version.tfds_version_to_prepare)
            else:
                msg += (
                    "Either sync to a previous version of TFDS to first prepare the "
                    "data or use another version of the dataset. ")
            msg += "Available for `download_and_prepare`: {}".format(
                list(sorted(installable_versions)))
            raise ValueError(msg)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))

        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(
                self.info.dataset_size + self.info.download_size,
                directory=self._data_dir_root):
            raise IOError(
                "Not enough disk space. Needed: {} (download: {}, generated: {})"
                .format(
                    units.size_str(self.info.dataset_size +
                                   self.info.download_size),
                    units.size_str(self.info.download_size),
                    units.size_str(self.info.dataset_size),
                ))
        self._log_download_bytes()

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Skip statistics computation if tfdv isn't present
                    try:
                        import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
                        skip_stats_computation = False
                    except ImportError:
                        skip_stats_computation = True

                    splits = list(self.info.splits.values())
                    statistics_already_computed = bool(
                        splits and splits[0].statistics.num_examples)
                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (skip_stats_computation
                            or download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or (download_config.compute_stats
                                == download.ComputeStatsMode.AUTO
                                and statistics_already_computed)):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet.
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.download_size = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
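
`incomplete_dir` and `temporary_assignment` together implement a write-into-temp-then-rename pattern, so a crashed run never leaves a half-written data_dir behind. A self-contained sketch of how such helpers could look (the real `tfds.core.utils` versions differ in details, e.g. a randomized temp-dir suffix):

import contextlib
import os
import shutil


@contextlib.contextmanager
def temporary_assignment(obj, attr, value):
    """Temporarily sets `obj.attr` to `value`, restoring the original on exit."""
    original = getattr(obj, attr)
    setattr(obj, attr, value)
    try:
        yield
    finally:
        setattr(obj, attr, original)


@contextlib.contextmanager
def incomplete_dir(dirname):
    """Yields a temporary dir, renamed to `dirname` only on successful exit."""
    tmp_dir = dirname + ".incomplete"
    os.makedirs(tmp_dir)
    try:
        yield tmp_dir
        os.rename(tmp_dir, dirname)
    finally:
        if os.path.exists(tmp_dir):  # an exception occurred before the rename
            shutil.rmtree(tmp_dir)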
Example #5
def test_bytes(self):
    self.assertEqual("150 bytes", units.size_str(150))
Example #6
def test_normal_sizes(self):
    self.assertEqual("1.50 PiB", units.size_str(1.5 * units.PiB))
    self.assertEqual("1.50 TiB", units.size_str(1.5 * units.TiB))
    self.assertEqual("1.50 GiB", units.size_str(1.5 * units.GiB))
    self.assertEqual("1.50 MiB", units.size_str(1.5 * units.MiB))
    self.assertEqual("1.50 KiB", units.size_str(1.5 * units.KiB))
Example #7
def test_none(self):
    self.assertEqual("Unknown size", units.size_str(None))
Example #8
def test_none(self):
    self.assertEqual("?? GiB", units.size_str(None))