def _make_download_manager(self, download_dir, download_config):
    """Build the `DownloadManager` used to fetch this dataset's files.

    Args:
        download_dir: Directory for downloaded files. When falsy, falls back
            to the "downloads" sub-directory of `self._data_dir_root`.
        download_config: Configuration object. Its `extract_dir`,
            `manual_dir`, `download_mode`, `force_checksums_validation` and
            `register_checksums` attributes are read.

    Returns:
        The object returned by `download.DownloadManager(...)`.
    """
    if not download_dir:
        download_dir = os.path.join(self._data_dir_root, "downloads")

    extract_dir = download_config.extract_dir
    if not extract_dir:
        extract_dir = os.path.join(download_dir, "extracted")

    # A manual directory is only passed on when the dataset declares
    # MANUAL_DOWNLOAD_INSTRUCTIONS; otherwise it stays None.
    manual_dir = None
    if self.MANUAL_DOWNLOAD_INSTRUCTIONS:
        manual_dir = download_config.manual_dir or os.path.join(
            download_dir, "manual")

    # The same mode flag drives both re-download and re-extraction.
    force_redownload = download_config.download_mode == FORCE_REDOWNLOAD

    return download.DownloadManager(
        dataset_name=self.name,
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        manual_dir_instructions=utils.dedent(
            self.MANUAL_DOWNLOAD_INSTRUCTIONS),
        force_download=force_redownload,
        force_extraction=force_redownload,
        force_checksums_validation=(
            download_config.force_checksums_validation),
        register_checksums=download_config.register_checksums,
    )
def _make_download_manager(self, download_dir, download_config):
    """Build the `DownloadManager` used to fetch this dataset's files.

    Args:
        download_dir: Directory for downloaded files. When falsy, falls back
            to the "downloads" sub-directory of `self._data_dir_root`.
        download_config: Configuration object. Its `extract_dir`,
            `manual_dir`, `register_checksums`, `download_mode`,
            `force_checksums_validation` and `verify_ssl` attributes are
            read.

    Returns:
        The object returned by `download.DownloadManager(...)`.
    """
    if not download_dir:
        download_dir = os.path.join(self._data_dir_root, "downloads")

    extract_dir = download_config.extract_dir
    if not extract_dir:
        extract_dir = os.path.join(download_dir, "extracted")

    manual_dir = download_config.manual_dir
    if not manual_dir:
        manual_dir = os.path.join(download_dir, "manual")

    # Note: an error will be raised here if the user tries to record
    # checksums from a `zipapp`.
    register_checksums_path = (
        utils.to_write_path(self._checksums_path)
        if download_config.register_checksums
        else None
    )

    # The same mode flag drives both re-download and re-extraction.
    force_redownload = download_config.download_mode == FORCE_REDOWNLOAD

    return download.DownloadManager(
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        url_infos=self.url_infos,
        manual_dir_instructions=self.MANUAL_DOWNLOAD_INSTRUCTIONS,
        force_download=force_redownload,
        force_extraction=force_redownload,
        force_checksums_validation=(
            download_config.force_checksums_validation),
        register_checksums=download_config.register_checksums,
        register_checksums_path=register_checksums_path,
        verify_ssl=download_config.verify_ssl,
        dataset_name=self.name,
    )
def download_and_prepare(self, cache_dir=None, dl_manager=None):
    """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
        cache_dir: (str) Cached directory where to extract the data. If None,
            a default tmp directory will be used.
        dl_manager: (`tfds.download.DownloadManager`) DownloadManager to use.
            Only one of dl_manager and cache_dir can be set.

    Raises:
        ValueError: If the user defines both cache_dir and dl_manager
    """
    # Both args are set
    if cache_dir and dl_manager is not None:
        raise ValueError(
            "Only one of dl_manager and cache_dir can be defined.")

    # None are set. Use the data_dir as cache_dir
    if not cache_dir and dl_manager is None:
        cache_dir = os.path.join(self._data_dir_root, "tmp")

    # Create the download manager
    if cache_dir:
        dl_manager = download.DownloadManager(cache_dir=cache_dir)

    # If the dataset already exists (data_dir not empty) and we do not
    # overwrite the dataset, there is nothing to do.
    if (self._data_dir and
            dl_manager.mode == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
        tf.logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
        return

    # Otherwise, create a new version in a new data_dir.
    curr_date = datetime.datetime.now()
    version_str = curr_date.strftime("v_%Y%m%d_%H%M")
    data_dir = self._get_data_dir(version=version_str)
    self._data_dir = None
    tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

    # Print is intentional: we want this to always go to stdout so user has
    # information needed to cancel download/preparation if needed.
    # This comes right before the progress bar.
    # The parentheses matter: `%` binds tighter than `or`, so without them
    # the "?" fallback could never be selected and an unset SIZE would be
    # rendered literally (e.g. "None GB").
    size_text = termcolor.colored(
        "%s GB" % (self.SIZE or "?"), attrs=["bold"])
    termcolor.cprint(
        "Downloading / extracting dataset %s (%s) to %s..."
        % (self.name, size_text, data_dir))

    # Wrap the Dataset generation in a .incomplete directory
    with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
        self._download_and_prepare(
            dl_manager=dl_manager, data_dir=data_dir_tmp)

    # Update the DatasetInfo metadata (splits info, num samples,...)
    self._data_dir = data_dir
    self.info.update_from_metadata_dir(self._data_dir)
def download_and_prepare(self, cache_dir=None, dl_manager=None):
    """Downloads and prepares dataset for reading.

    Subclasses must override _download_and_prepare.

    Args:
        cache_dir (str): Cached directory where to extract the data. If None,
            a default tmp directory will be used.
        dl_manager (DownloadManager): DownloadManager to use. Only one of
            dl_manager and cache_dir can be set.

    Raises:
        ValueError: If the user defines both cache_dir and dl_manager
    """
    # Both args are set
    if cache_dir and dl_manager is not None:
        raise ValueError(
            "Only one of dl_manager and cache_dir can be defined.")

    # None are set. Use the data_dir as cache_dir
    if not cache_dir and dl_manager is None:
        cache_dir = os.path.join(self._data_dir_root, "tmp")

    # Create the download manager
    if cache_dir:
        dl_manager = download.DownloadManager(cache_dir=cache_dir)

    # If the dataset already exists (data_dir not empty) and we do not
    # overwrite the dataset, there is nothing to do.
    if (self._data_dir and
            dl_manager.mode == download.GenerateMode.REUSE_DATASET_IF_EXISTS):
        tf.logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
        return

    # Otherwise, create a new version in a new data_dir.
    curr_date = datetime.datetime.now()
    version_str = curr_date.strftime("v_%Y%m%d_%H%M")
    data_dir = self._get_data_dir(version=version_str)
    tf.logging.info("Generating dataset %s (%s)", self.name, data_dir)

    # Remember the current value so it can be restored if generation fails.
    # Without this, a failed run would leave `_data_dir` pointing at the
    # in-progress directory, and a later call in REUSE_DATASET_IF_EXISTS
    # mode would return early above as if the dataset already existed.
    previous_data_dir = self._data_dir
    try:
        # Wrap the Dataset generation in a .incomplete directory
        with file_format_adapter.incomplete_dir(data_dir) as data_dir_tmp:
            # TODO(epot): Data_dir should be an argument of
            # download_and_prepare. Modify this once a better split API
            # exists.
            self._data_dir = data_dir_tmp
            self._download_and_prepare(dl_manager)
    except BaseException:
        # Also covers KeyboardInterrupt (user cancelling a download); the
        # exception is always re-raised, never swallowed.
        self._data_dir = previous_data_dir
        raise

    # Generation succeeded: point at the final directory.
    self._data_dir = data_dir
def _make_download_manager(self, download_dir, download_config):
    """Build the `DownloadManager` used to fetch this dataset's files.

    Args:
        download_dir: Directory for downloaded files. When falsy, falls back
            to the "downloads" sub-directory of `self._data_dir_root`.
        download_config: Configuration object. Its `extract_dir`,
            `manual_dir`, `download_mode` and `register_checksums`
            attributes are read.

    Returns:
        The object returned by `download.DownloadManager(...)`.
    """
    if not download_dir:
        download_dir = os.path.join(self._data_dir_root, "downloads")

    extract_dir = download_config.extract_dir
    if not extract_dir:
        extract_dir = os.path.join(download_dir, "extracted")

    # The manual directory is always suffixed with the dataset name, whether
    # the root came from the config or from the default location.
    manual_root = download_config.manual_dir
    if not manual_root:
        manual_root = os.path.join(download_dir, "manual")
    manual_dir = os.path.join(manual_root, self.name)

    # The same mode flag drives both re-download and re-extraction.
    force_redownload = download_config.download_mode == FORCE_REDOWNLOAD

    return download.DownloadManager(
        dataset_name=self.name,
        download_dir=download_dir,
        extract_dir=extract_dir,
        manual_dir=manual_dir,
        force_download=force_redownload,
        force_extraction=force_redownload,
        register_checksums=download_config.register_checksums,
    )