Example #1
  def test_force_stats(self):
    # Test when stats already exists but compute_stats='force'

    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
      # dataset_info is restored (via the patched GCS dir), so stats are already populated
      builder = testing.DummyMnist(data_dir=tmp_dir)
      self.assertEqual(builder.info.splits.total_num_examples, 40)
      self.assertFalse(self.compute_dynamic_property.called)

      download_config = download.DownloadConfig(
          compute_stats=download.ComputeStatsMode.FORCE,
      )
      builder.download_and_prepare(download_config=download_config)

      # Statistics should have been recomputed
      self.assertTrue(self.compute_dynamic_property.called)
Example #2
  def test_stats_not_restored_gcs_overwritten(self):
    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
      # If the splits differ from the restored ones, stats should be recomputed
      builder = testing.DummyMnist(data_dir=tmp_dir)
      self.assertEqual(builder.info.splits["train"].statistics.num_examples, 20)
      self.assertFalse(self.compute_dynamic_property.called)

      dl_config = download.DownloadConfig(
          max_examples_per_split=5,
          compute_stats=download.ComputeStatsMode.AUTO,
      )
      builder.download_and_prepare(download_config=dl_config)

      # Statistics should have been recomputed (splits differ from the
      # restored ones)
      self.assertTrue(self.compute_dynamic_property.called)
Example #3
  def _download_and_prepare_as_dataset(self, builder):
    with absltest.mock.patch.multiple(
        "tensorflow_datasets.core.download.DownloadManager",
        download_and_extract=self._get_dl_extract_result,
        download=self._get_dl_extract_result,
        manual_dir=self.example_dir,
    ):
      if isinstance(builder, dataset_builder.BeamBasedBuilder):

        # TODO(b/129148632): The current apache-beam 2.11.0 does not work with Py3
        # Update once the new version is out (around April)
        skip_beam_test = bool(six.PY3)
        if skip_beam_test:
          return

        import apache_beam as beam   # pylint: disable=g-import-not-at-top
        # For Beam datasets, set up the runner config
        beam_runner = None
        beam_options = beam.options.pipeline_options.PipelineOptions()
      else:
        beam_runner = None
        beam_options = None

      # Force stats recomputation, otherwise the number of examples restored
      # from GCS would not match the dummy data
      download_config = download.DownloadConfig(
          compute_stats=download.ComputeStatsMode.FORCE,
          beam_runner=beam_runner,
          beam_options=beam_options,
      )
      builder.download_and_prepare(download_config=download_config)

    with self._subTest("as_dataset"):
      self._assertAsDataset(builder)

    with self._subTest("num_examples"):
      self._assertNumSamples(builder)

    with self._subTest("reload"):
      # When reloading the dataset, metadata should have been reloaded too.

      builder_reloaded = self._make_builder(config=builder.builder_config)
      self._assertNumSamples(builder_reloaded)

      # After reloading, as_dataset should still be working
      with self._subTest("as_dataset"):
        self._assertAsDataset(builder_reloaded)
Example #4
    def _download_and_prepare_as_dataset(self, builder):
        # Provide the manual dir only if builder has MANUAL_DOWNLOAD_INSTRUCTIONS
        # set.

        missing_dir_mock = mock.PropertyMock(
            side_effect=Exception("Missing MANUAL_DOWNLOAD_INSTRUCTIONS"))

        manual_dir = (self.dummy_data if builder.MANUAL_DOWNLOAD_INSTRUCTIONS
                      else missing_dir_mock)
        with mock.patch.multiple(
                "tensorflow_datasets.core.download.DownloadManager",
                download_and_extract=self._get_dl_extract_result,
                download=self._get_dl_download_result,
                download_checksums=self._download_checksums,
                manual_dir=manual_dir,
                download_dir=self.dummy_data,
        ):
            # For Beam datasets, set up the runner config
            beam_runner = None

            download_config = download.DownloadConfig(
                compute_stats=download.ComputeStatsMode.SKIP,
                beam_runner=beam_runner,
            )
            with self._test_key_not_local_path(builder):
                builder.download_and_prepare(download_config=download_config)

        with self._subTest("as_dataset"):
            self._assertAsDataset(builder)

        with self._subTest("num_examples"):
            self._assertNumSamples(builder)

        with self._subTest("reload"):
            # When reloading the dataset, metadata should have been reloaded too.

            builder_reloaded = self._make_builder(
                config=builder.builder_config)
            self._assertNumSamples(builder_reloaded)

            # After reloading, as_dataset should still be working
            with self._subTest("as_dataset"):
                self._assertAsDataset(builder_reloaded)

        with self._subTest("config_description"):
            self._test_description_builder_config(builder)
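
The missing_dir_mock above is a mock.PropertyMock whose side_effect raises, so any
access to DownloadManager.manual_dir by a builder that did not declare
MANUAL_DOWNLOAD_INSTRUCTIONS fails loudly instead of silently returning a path.
A standalone sketch of the same idea (FakeBuilder is a hypothetical stand-in, not a
TFDS class):

from unittest import mock

class FakeBuilder:
    @property
    def manual_dir(self):
        return "/real/manual/dir"

# Patch the property so that reading it raises, mirroring missing_dir_mock above.
with mock.patch.object(
        FakeBuilder, "manual_dir",
        new_callable=mock.PropertyMock,
        side_effect=Exception("Missing MANUAL_DOWNLOAD_INSTRUCTIONS")):
    builder = FakeBuilder()
    try:
        _ = builder.manual_dir  # accessing the property triggers the side_effect
    except Exception as error:
        print(error)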
Example #5
    def test_gcs_not_exists(self):
        # By disabling the patch, and because DummyMnist is not on GCS, we can
        # simulate a new dataset starting from scratch
        self.patch_gcs.stop()
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            builder = testing.DummyMnist(data_dir=tmp_dir)
            # No dataset_info restored, so stats are empty
            self.assertEqual(builder.info.splits.total_num_examples, 0)
            self.assertFalse(self.compute_dynamic_property.called)

            dl_config = download.DownloadConfig(
                compute_stats=download.ComputeStatsMode.AUTO, )
            builder.download_and_prepare(download_config=dl_config)

            # Statistics should have been recomputed
            self.assertTrue(self.compute_dynamic_property.called)
        self.patch_gcs.start()
Example #6
    def test_skip_stats(self):
        # Test when stats do not exist yet and compute_stats='skip'

        # By disabling the patch, and because DummyMnist is not on GCS, we can
        # simulate a new dataset starting from scratch
        self.patch_gcs.stop()
        with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
            # No dataset_info restored, so stats are empty
            builder = testing.DummyMnist(data_dir=tmp_dir, num_shards=5)
            self.assertEqual(builder.info.splits.total_num_examples, 0)
            self.assertFalse(self.compute_dynamic_property.called)

            download_config = download.DownloadConfig(
                compute_stats=download.ComputeStatsMode.SKIP, )
            builder.download_and_prepare(download_config=download_config)

            # Statistics computation should have been skipped
            self.assertEqual(builder.info.splits.total_num_examples, 0)
            self.assertFalse(self.compute_dynamic_property.called)
        self.patch_gcs.start()
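
The stats tests above exercise the three ComputeStatsMode values (AUTO, FORCE, SKIP)
through the test harness. A hedged, user-facing sketch of the same flag, assuming
ComputeStatsMode is exposed under tfds.download as in the TFDS releases these snippets
come from (the "mnist" name and data_dir are purely illustrative):

import tensorflow_datasets as tfds

builder = tfds.builder("mnist", data_dir="/tmp/tfds")
dl_config = tfds.download.DownloadConfig(
    # SKIP / AUTO / FORCE control whether statistics are (re)computed.
    compute_stats=tfds.download.ComputeStatsMode.SKIP,
)
builder.download_and_prepare(download_config=dl_config)
ds = builder.as_dataset(split="train")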
Example #7
    def _download_and_prepare_as_dataset(self, builder):
        with absltest.mock.patch.multiple(
                "tensorflow_datasets.core.download.DownloadManager",
                download_and_extract=self._get_dl_extract_result,
                download=self._get_dl_extract_result,
                download_checksums=lambda *_: None,
                manual_dir=self.example_dir,
        ):
            if isinstance(builder, dataset_builder.BeamBasedBuilder):
                import apache_beam as beam  # pylint: disable=g-import-not-at-top
                # For Beam datasets, set up the runner config
                beam_runner = None
                beam_options = beam.options.pipeline_options.PipelineOptions()
            else:
                beam_runner = None
                beam_options = None

            download_config = download.DownloadConfig(
                compute_stats=download.ComputeStatsMode.FORCE,
                beam_runner=beam_runner,
                beam_options=beam_options,
            )
            builder.download_and_prepare(download_config=download_config)

        with self._subTest("as_dataset"):
            self._assertAsDataset(builder)

        with self._subTest("num_examples"):
            self._assertNumSamples(builder)

        with self._subTest("reload"):
            # When reloading the dataset, metadata should have been reloaded too.

            builder_reloaded = self._make_builder(
                config=builder.builder_config)
            self._assertNumSamples(builder_reloaded)

            # After reloading, as_dataset should still be working
            with self._subTest("as_dataset"):
                self._assertAsDataset(builder_reloaded)
Example #8
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if (data_exists
                and download_config.download_mode == REUSE_DATASET_IF_EXISTS):
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.info.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                self._download_and_prepare(
                    dl_manager=dl_manager,
                    max_examples_per_split=download_config.max_examples_per_split)

                # NOTE: If modifying the lines below to put additional information in
                # DatasetInfo, you'll likely also want to update
                # DatasetInfo.read_from_directory to possibly restore these attributes
                # when reading from package data.

                # Update the DatasetInfo metadata by computing statistics from the data.
                if download_config.compute_stats:
                    already_has_stats = bool(
                        self.info.splits.total_num_examples)
                    if already_has_stats:
                        logging.info(
                            "Skipping computing stats because they are already "
                            "populated.")
                    else:
                        self.info.compute_dynamic_properties()

                        # Set checksums for all files downloaded
                        self.info.download_checksums = (
                            dl_manager.recorded_download_checksums)
                        # Set size of all files downloaded
                        self.info.size_in_bytes = sum(
                            [v for _, v in dl_manager.download_sizes.items()])
                # Write DatasetInfo to disk, even if we haven't computed the statistics.
                self.info.write_to_directory(self._data_dir)
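
The implementation above, like the two variants that follow, relies on
utils.temporary_assignment to point self._data_dir at the temporary directory while
generation runs. A minimal sketch of what such a context manager typically looks like
(an illustration of the pattern, not the exact TFDS implementation):

import contextlib

@contextlib.contextmanager
def temporary_assignment(obj, attr, new_value):
    """Temporarily sets obj.attr to new_value, restoring the original value on exit."""
    original = getattr(obj, attr)
    setattr(obj, attr, new_value)
    try:
        yield
    finally:
        setattr(obj, attr, original)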
Example #9
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))
        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(self.info.size_in_bytes,
                                               directory=self._data_dir_root):
            raise IOError("Not enough disk space. Needed: %s" %
                          units.size_str(self.info.size_in_bytes))
        self._log_download_bytes()

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with file_format_adapter.incomplete_dir(
                self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or download_config.compute_stats
                            == download.ComputeStatsMode.AUTO
                            and bool(self.info.splits.total_num_examples)):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.size_in_bytes = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
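
The incomplete_dir context manager used above writes into a temporary
"<data_dir>.incomplete*" directory and renames it to the final data_dir only on
success, so a failed generation never leaves a half-written dataset behind. A rough
sketch of the idea using plain os/tempfile (the real TFDS helper is built on
tf.io.gfile and may differ in detail):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def incomplete_dir(dirname):
    """Yields a temporary '<dirname>.incomplete*' dir, renamed to dirname on success."""
    tmp_dir = tempfile.mkdtemp(
        prefix=os.path.basename(dirname) + ".incomplete.",
        dir=os.path.dirname(dirname))
    try:
        yield tmp_dir
        os.rename(tmp_dir, dirname)
    finally:
        # If generation failed before the rename, drop the partial directory.
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)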
Example #10
    def download_and_prepare(self, download_dir=None, download_config=None):
        """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored.
        Defaults to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
    """

        download_config = download_config or download.DownloadConfig()
        data_exists = tf.io.gfile.exists(self._data_dir)
        if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
            logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
            return

        # Disable `download_and_prepare` (internally, we are still
        # allowing Py2 for the `dataset_builder_tests.py` & co.)
        if _is_py2_download_and_prepare_disabled and six.PY2:
            raise NotImplementedError(
                "TFDS has dropped `builder.download_and_prepare` support for "
                "Python 2. Please update your code to Python 3.")

        if self.version.tfds_version_to_prepare:
            available_to_prepare = ", ".join(
                str(v) for v in self.versions if not v.tfds_version_to_prepare)
            raise AssertionError(
                "The version of the dataset you are trying to use ({}:{}) can only "
                "be generated using TFDS code synced @ {} or earlier. Either sync to "
                "that version of TFDS to first prepare the data or use another "
                "version of the dataset (available for `download_and_prepare`: "
                "{}).".format(self.name, self.version,
                              self.version.tfds_version_to_prepare,
                              available_to_prepare))

        # Only `cls.VERSION` or `experimental_latest` versions can be generated.
        # Otherwise, users may accidentally generate an old version using the
        # code from newer versions.
        installable_versions = {
            str(v)
            for v in (self.canonical_version, max(self.versions))
        }
        if str(self.version) not in installable_versions:
            msg = (
                "The version of the dataset you are trying to use ({}) is too "
                "old for this version of TFDS so cannot be generated.").format(
                    self.info.full_name)
            if self.version.tfds_version_to_prepare:
                msg += (
                    "{} can only be generated using TFDS code synced @ {} or earlier "
                    "Either sync to that version of TFDS to first prepare the data or "
                    "use another version of the dataset. ").format(
                        self.version, self.version.tfds_version_to_prepare)
            else:
                msg += (
                    "Either sync to a previous version of TFDS to first prepare the "
                    "data or use another version of the dataset. ")
            msg += "Available for `download_and_prepare`: {}".format(
                list(sorted(installable_versions)))
            raise ValueError(msg)

        # Currently it's not possible to overwrite the data because it would
        # conflict with versioning: If the last version has already been generated,
        # it will always be reloaded and data_dir will be set at construction.
        if data_exists:
            raise ValueError(
                "Trying to overwrite an existing dataset {} at {}. A dataset with "
                "the same version {} already exists. If the dataset has changed, "
                "please update the version number.".format(
                    self.name, self._data_dir, self.version))

        logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
        if not utils.has_sufficient_disk_space(
                self.info.dataset_size + self.info.download_size,
                directory=self._data_dir_root):
            raise IOError(
                "Not enough disk space. Needed: {} (download: {}, generated: {})"
                .format(
                    units.size_str(self.info.dataset_size +
                                   self.info.download_size),
                    units.size_str(self.info.download_size),
                    units.size_str(self.info.dataset_size),
                ))
        self._log_download_bytes()

        dl_manager = self._make_download_manager(
            download_dir=download_dir, download_config=download_config)

        # Create a tmp dir and rename to self._data_dir on successful exit.
        with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
            # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
            # it to every sub function.
            with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
                if (download_config.try_download_gcs
                        and gcs_utils.is_dataset_on_gcs(self.info.full_name)):
                    logging.warning(GCS_HOSTED_MSG, self.name)
                    gcs_utils.download_gcs_dataset(self.info.full_name,
                                                   self._data_dir)
                    self.info.read_from_directory(self._data_dir)
                else:
                    self._download_and_prepare(dl_manager=dl_manager,
                                               download_config=download_config)

                    # NOTE: If modifying the lines below to put additional information in
                    # DatasetInfo, you'll likely also want to update
                    # DatasetInfo.read_from_directory to possibly restore these attributes
                    # when reading from package data.

                    # Skip statistics computation if tfdv isn't present
                    try:
                        import tensorflow_data_validation  # pylint: disable=g-import-not-at-top,import-outside-toplevel,unused-import  # pytype: disable=import-error
                        skip_stats_computation = False
                    except ImportError:
                        skip_stats_computation = True

                    splits = list(self.info.splits.values())
                    statistics_already_computed = bool(
                        splits and splits[0].statistics.num_examples)
                    # Update DatasetInfo metadata by computing statistics from the data.
                    if (skip_stats_computation or download_config.compute_stats
                            == download.ComputeStatsMode.SKIP
                            or download_config.compute_stats
                            == download.ComputeStatsMode.AUTO
                            and statistics_already_computed):
                        logging.info("Skipping computing stats for mode %s.",
                                     download_config.compute_stats)
                    else:  # Mode is forced or stats do not exist yet
                        logging.info("Computing statistics.")
                        self.info.compute_dynamic_properties()
                    self.info.download_size = dl_manager.downloaded_size
                    # Write DatasetInfo to disk, even if we haven't computed statistics.
                    self.info.write_to_directory(self._data_dir)
        self._log_download_done()
Example #11
  def _get_dl_config_if_need_to_run(self):
    return download.DownloadConfig(
        beam_options=beam.options.pipeline_options.PipelineOptions(),
    )
Example #12
def make_default_config():
    return download.DownloadConfig()
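
The last two helpers build DownloadConfig objects: one carrying Beam pipeline options
for Beam-based builders, the other plain defaults. A hedged sketch of how a user might
wire a Beam runner into generation; the dataset name is illustrative and DirectRunner
is only one possible runner choice:

import apache_beam as beam
import tensorflow_datasets as tfds

dl_config = tfds.download.DownloadConfig(
    beam_runner=beam.runners.DirectRunner(),
    beam_options=beam.options.pipeline_options.PipelineOptions(),
)
# "wikipedia/20190301.en" stands in for any Beam-based dataset.
builder = tfds.builder("wikipedia/20190301.en")
builder.download_and_prepare(download_config=dl_config)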