def _rename_and_get_final_dl_path(
    self,
    url: str,
    path: epath.Path,
    expected_url_info: Optional[checksums.UrlInfo],
    computed_url_info: Optional[checksums.UrlInfo],
    checksum_path: Optional[epath.Path],
    url_path: epath.Path,
) -> epath.Path:
  """Return the definitive location of a downloaded file, renaming if needed.

  `path` may currently be one of:
    * a manually downloaded file (inside `self._manual_dir`)
    * the cached `checksum_path`
    * the cached `url_path`
    * a freshly downloaded file inside a temporary directory

  Args:
    url: Source URL of the download.
    path: Current location of the file.
    expected_url_info: Registered checksums for `url`, if any.
    computed_url_info: Checksums computed during this download, if any.
    checksum_path: Checksum-based destination, if checksums are registered.
    url_path: URL-hash-based destination.

  Returns:
    The final path of the file.
  """
  # Manually provided data is never moved.
  if self._manual_dir and path.is_relative_to(self._manual_dir):
    return path

  # Already sitting at the checksum-based destination: nothing to move.
  if path == checksum_path:
    # Sanity check: cached file must match the registered checksums.
    assert computed_url_info == expected_url_info
    return checksum_path  # pytype: disable=bad-return-type

  # Sitting at the url-based destination.
  if path == url_path:
    if not checksum_path:
      # No checksums registered -> leave in place.
      return path
    # Checksums registered -> promote to the checksum-based destination.
    resource_lib.rename_info_file(path, checksum_path, overwrite=True)
    return path.replace(checksum_path)

  # Freshly downloaded into a tmp dir: move to the final destination.
  final_path = checksum_path or url_path
  resource_lib.write_info_file(
      url=url,
      path=final_path,
      dataset_name=self._dataset_name,
      original_fname=path.name,
      url_info=computed_url_info,
  )
  path.replace(final_path)
  # Cleanup tmp dir (will fail if dir not empty).
  path.parent.rmdir()
  return final_path
def _handle_download_result(self, resource, tmp_dir_path, sha256, dl_size):
  """Store dled file to definitive place, write INFO file, return path.

  Args:
    resource: Resource being downloaded (provides `.url`).
    tmp_dir_path: Temporary directory holding exactly one downloaded file.
    sha256: Checksum computed during download.
    dl_size: Size in bytes computed during download.

  Returns:
    The final path of the downloaded file.

  Raises:
    AssertionError: If `tmp_dir_path` does not contain exactly one file.
    NonMatchingChecksumError: If the download does not match the
      registered size/checksum and `register_checksums` is disabled.
  """
  fnames = tf.io.gfile.listdir(tmp_dir_path)
  # Bug fix: `len(fnames) > 1` let an empty tmp dir fall through to
  # `fnames[0]` and crash with an opaque IndexError. Require exactly one
  # file, matching the typed variant of this method.
  if len(fnames) != 1:
    raise AssertionError(
        'Expected exactly one file in %s, found %d.'
        % (tmp_dir_path, len(fnames)))
  original_fname = fnames[0]
  tmp_path = os.path.join(tmp_dir_path, original_fname)
  self._recorded_sizes_checksums[resource.url] = (dl_size, sha256)
  if self._register_checksums:
    self._record_sizes_checksums()
  elif (dl_size, sha256) != self._sizes_checksums.get(resource.url, None):
    raise NonMatchingChecksumError(resource.url, tmp_path)
  download_path = self._get_final_dl_path(resource.url, sha256)
  resource_lib.write_info_file(resource, download_path, self._dataset_name,
                               original_fname)
  # Unconditionally overwrite because either file doesn't exist or
  # FORCE_DOWNLOAD=true
  tf.io.gfile.rename(tmp_path, download_path, overwrite=True)
  tf.io.gfile.rmtree(tmp_dir_path)
  return download_path
def _handle_download_result(
    self,
    resource: resource_lib.Resource,
    tmp_dir_path: str,
    url_path: str,
    url_info: checksums.UrlInfo,
) -> str:
  """Post-processing of the downloaded file.

  * Write `.INFO` file
  * Rename `tmp_dir/file.xyz` -> `url_path`
  * Validate/record checksums
  * Eventually rename `url_path` -> `file_path` when `record_checksums=True`

  Args:
    resource: The url to download.
    tmp_dir_path: Temporary dir where the file was downloaded.
    url_path: Destination path.
    url_info: File checksums, size, computed during download.

  Returns:
    dst_path: `url_path` (or `file_path` when `register_checksums=True`)

  Raises:
    NonMatchingChecksumError: If the computed `url_info` does not match
      the registered checksums for `resource.url`.
    ValueError: If the tmp dir does not contain exactly one file, or if
      checksums are missing while `force_checksums_validation=True`.
  """
  # Extract the file name, path from the tmp_dir
  fnames = tf.io.gfile.listdir(tmp_dir_path)
  if len(fnames) != 1:
    raise ValueError(
        'Download not found for url {} in: {}. Found {} files, but expected '
        '1.'.format(resource.url, tmp_dir_path, len(fnames)))
  original_fname, = fnames  # Unpack list
  tmp_path = os.path.join(tmp_dir_path, original_fname)

  # Write `.INFO` file and rename `tmp_dir/file.xyz` -> `url_path`
  resource_lib.write_info_file(
      resource=resource,
      path=url_path,
      dataset_name=self._dataset_name,
      original_fname=original_fname,
      url_info=url_info,
  )
  # Unconditionally overwrite because either file doesn't exist or
  # FORCE_DOWNLOAD=true
  tf.io.gfile.rename(tmp_path, url_path, overwrite=True)
  tf.io.gfile.rmtree(tmp_dir_path)

  # After this checkpoint, the url file is cached, so should never be
  # downloaded again, even if there are error in registering checksums.
  # Even if `_handle_download_result` is executed asyncronously, Python
  # built-in ops are atomic in CPython (and Pypy), so it should be safe
  # to update `_recorded_url_infos`.
  self._recorded_url_infos[resource.url] = url_info

  # Validate the download checksum, or register checksums
  dst_path = url_path
  if self._register_checksums:
    # Change `dst_path` from `url_path` -> `file_path`
    dst_path = self._save_url_info_and_rename(
        url=resource.url, url_path=url_path, url_info=url_info)
  elif resource.url not in self._url_infos:
    if self._force_checksums_validation:
      raise ValueError(f'Missing checksums url: {resource.url}, yet '
                       '`force_checksums_validation=True`. '
                       'Did you forgot to register checksums ?')
    # Otherwise, missing checksums, do nothing
  elif url_info != self._url_infos.get(resource.url, None):
    # Bug fix: the original raised with `tmp_path`, but by this point the
    # file has been renamed to `url_path` and the tmp dir deleted, so the
    # error pointed at a path that no longer exists. Report `url_path`,
    # where the mismatching file actually lives.
    raise NonMatchingChecksumError(resource.url, url_path)

  return dst_path