def _sync_ftp_download(self, url, destination_path):
  """Fetch the FTP resource at `url` into `destination_path`.

  Blocking helper: retrieves the remote file, then reads it back to
  compute its checksum.

  Args:
    url: FTP url of the file to retrieve.
    destination_path: Directory in which to store the downloaded file.

  Returns:
    A `(hexdigest, size)` pair as produced by `utils.read_checksum_digest`.
  """
  filename = download_util.get_file_name(url)
  target = os.path.join(destination_path, filename)
  urllib.request.urlretrieve(url, target)
  # Re-read the file on disk to record its checksum and byte size.
  digest, num_bytes = utils.read_checksum_digest(
      target, checksum_cls=self._checksumer)
  return digest, num_bytes
def _download(self, resource):
  """Download resource, returns Promise->path to downloaded file."""
  # Accept a plain url string as a convenience and wrap it in a Resource.
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(url=resource)
  # Attach the registered checksum; None when the url is not registered.
  resource.sha256 = self._checksums.get(resource.url, None)
  if not resource.path:
    resource.path = os.path.join(self._download_dir, resource.fname)
  # Reuse an existing local copy unless the caller forced a re-download.
  if not self._force_download and resource.exists_locally():
    logging.info('URL %s already downloaded: reusing %s.',
                 resource.url, resource.path)
    if self._record_checksum_size:
      logging.info('Reading checksum and size of %s ...', resource.path)
      checksum, dl_size = utils.read_checksum_digest(resource.path)
      # NOTE(review): `existing=True` presumably tells the handler the file
      # is already in place (record stats only) — confirm against
      # _handle_download_result.
      self._handle_download_result(
          resource, None, checksum, dl_size, existing=True)
    # Resolve immediately: nothing to download.
    return promise.Promise.resolve(resource.path)
  # There is a slight difference between downloader and extractor here:
  # the extractor manages its own temp directory, while the DownloadManager
  # manages the temp directory of downloader.
  # uuid suffix makes the temp dir unique across concurrent downloads.
  tmp_dir_path = '%s.tmp.%s' % (resource.path, uuid.uuid4().hex)
  tf.io.gfile.makedirs(tmp_dir_path)
  logging.info('Downloading %s into %s...', resource.url, tmp_dir_path)

  def callback(val):
    # val is the (checksum, size) pair resolved by the downloader promise.
    checksum, dl_size = val
    return self._handle_download_result(
        resource, tmp_dir_path, checksum, dl_size)

  # Chain the result handling onto the asynchronous download.
  return self._downloader.download(resource, tmp_dir_path).then(callback)
def _sync_file_copy(self, filepath: str,
                    destination_path: str) -> checksums_lib.UrlInfo:
  """Copy the local file `filepath` into `destination_path`.

  Blocking helper: copies the file, then reads the copy back to compute
  its checksum and size.

  Args:
    filepath: Path of the file to copy.
    destination_path: Directory receiving the copy.

  Returns:
    The `UrlInfo` (checksum + size) of the copied file.
  """
  dest = os.path.join(destination_path, os.path.basename(filepath))
  tf.io.gfile.copy(filepath, dest)
  digest, num_bytes = utils.read_checksum_digest(
      dest, checksum_cls=self._checksumer_cls)
  return checksums_lib.UrlInfo(checksum=digest, size=num_bytes)
def _validate_checksums( url: str, path: ReadOnlyPath, computed_url_info: Optional[checksums.UrlInfo], expected_url_info: Optional[checksums.UrlInfo], force_checksums_validation: bool, ) -> None: """Validate computed_url_info match expected_url_info.""" # If force-checksums validations, both expected and computed url_info # should exists if force_checksums_validation: # Checksum of the downloaded file unknown (for manually downloaded file) if not computed_url_info: computed_url_info = utils.read_checksum_digest(path) # Checksums have not been registered if not expected_url_info: raise ValueError( f'Missing checksums url: {url}, yet ' '`force_checksums_validation=True`. ' 'Did you forgot to register checksums ?' ) if ( expected_url_info and computed_url_info and expected_url_info != computed_url_info ): msg = ( f'Artifact {url}, downloaded to {path}, has wrong checksum. ' f'Expected: {expected_url_info}. Got: {computed_url_info}.' 'To debug, see: ' 'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror' ) raise NonMatchingChecksumError(msg)
def _sync_file_copy(
    self,
    filepath: str,
    destination_path: str,
) -> DownloadResult:
  """Copy the local file `filepath` into `destination_path`.

  Blocking helper: performs the copy, then reads the copy back to compute
  its checksum and size.

  Args:
    filepath: Path of the file to copy.
    destination_path: Directory receiving the copy.

  Returns:
    A `DownloadResult` holding the destination path and its `UrlInfo`
    (checksum, size, filename).
  """
  name = os.path.basename(filepath)
  dest = os.path.join(destination_path, name)
  tf.io.gfile.copy(filepath, dest)
  digest, num_bytes = utils.read_checksum_digest(
      dest, checksum_cls=self._checksumer_cls)
  url_info = checksums_lib.UrlInfo(
      checksum=digest,
      size=num_bytes,
      filename=name,
  )
  return DownloadResult(path=utils.as_path(dest), url_info=url_info)
def _check_manually_downloaded(self, url: str) -> Optional[str]: """Checks if file is already downloaded in manual_dir.""" if not self._manual_dir: # Manual dir not passed return None url_info = self._url_infos.get(url) if not url_info or not url_info.filename: # Filename unknown. return None manual_path = self._manual_dir / url_info.filename if not manual_path.exists(): # File not manually downloaded return None # Eventually check the checksums if self._force_checksums_validation: checksum, _ = utils.read_checksum_digest(manual_path) if checksum != url_info.checksum: raise NonMatchingChecksumError(url, manual_path) return os.fspath(manual_path)
def _validate_checksums( url: str, path: ReadOnlyPath, computed_url_info: Optional[checksums.UrlInfo], expected_url_info: Optional[checksums.UrlInfo], force_checksums_validation: bool, ) -> None: """Validate computed_url_info match expected_url_info.""" # If force-checksums validations, both expected and computed url_info # should exists if force_checksums_validation: # Checksum of the downloaded file unknown (for manually downloaded file) if not computed_url_info: computed_url_info = utils.read_checksum_digest(path) # Checksums have not been registered if not expected_url_info: raise ValueError(f'Missing checksums url: {url}, yet ' '`force_checksums_validation=True`. ' 'Did you forgot to register checksums ?') if (expected_url_info and computed_url_info and expected_url_info != computed_url_info): raise NonMatchingChecksumError(url, path)
def _sync_file_copy(self, filepath, destination_path):
  """Copy the local file `filepath` into `destination_path`.

  Blocking helper: performs the copy, then reads the copy back to compute
  its checksum and size.

  Args:
    filepath: Path of the file to copy.
    destination_path: Directory receiving the copy.

  Returns:
    A `(hexdigest, size)` pair as produced by `utils.read_checksum_digest`.
  """
  dest = os.path.join(destination_path, os.path.basename(filepath))
  tf.io.gfile.copy(filepath, dest)
  # read_checksum_digest already returns the (hexdigest, size) pair.
  return utils.read_checksum_digest(dest, checksum_cls=self._checksumer)