def _sync_file_copy(self, filepath: str,
                    destination_path: str) -> checksums_lib.UrlInfo:
  """Copies `filepath` into `destination_path` and returns its `UrlInfo`.

  Args:
    filepath: source path (any filesystem `tf.io.gfile` understands).
    destination_path: directory in which to place the copy.

  Returns:
    The `UrlInfo` (checksum and size) computed from the copied file.
  """
  target = os.path.join(destination_path, os.path.basename(filepath))
  tf.io.gfile.copy(filepath, target)
  digest, num_bytes = utils.read_checksum_digest(
      target, checksum_cls=self._checksumer_cls)
  return checksums_lib.UrlInfo(checksum=digest, size=num_bytes)
def test_download_url_info_in_info_file_missmatch(self):
  """Tests failure when downloaded checksums and `.INFO` mismatch."""
  a = Artifact('x')
  self.dl_results[a.url] = a.url_info
  # Download the url once
  dl_manager = self._get_manager(register_checksums=False)
  dl_manager.download(a.url)
  # The second time, download the url with a different checksum
  self.dl_results[a.url] = checksums_lib.UrlInfo(
      size=a.url_info.size,
      checksum=_sha256('Other content'),
      filename=a.url_info.filename,
  )
  dl_manager = self._get_manager(
      register_checksums=False,
      force_download=True,
  )
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias
  # (removed in Python 3.12).
  with self.assertRaisesRegex(ValueError, 'contains a different checksum'):
    dl_manager.download(a.url)
  # If the url is re-downloaded with the same hash, no error is raised
  self.dl_results[a.url] = a.url_info
  dl_manager = self._get_manager(
      register_checksums=False,
      force_download=True,
  )
  dl_manager.download(a.url)
def _sync_download(self,
                   url: str,
                   destination_path: str,
                   verify: bool = True) -> checksums_lib.UrlInfo:
  """Synchronous version of `download` method.

  To download through a proxy, the `HTTP_PROXY`, `HTTPS_PROXY`,
  `REQUESTS_CA_BUNDLE`,... environment variables can be exported, as
  described in:
  https://requests.readthedocs.io/en/master/user/advanced/#proxies

  Args:
    url: url to download
    destination_path: path where to write it
    verify: whether to verify ssl certificates

  Returns:
    The `UrlInfo` (checksum, size, filename) of the downloaded file.

  Raises:
    DownloadError: when download fails.
  """
  try:
    # If url is on a filesystem that gfile understands, use copy. Otherwise,
    # use requests (http) or urllib (ftp).
    if not url.startswith('http'):
      return self._sync_file_copy(url, destination_path)
  except tf.errors.UnimplementedError:
    pass

  with _open_url(url, verify=verify) as (response, iter_content):
    fname = _get_filename(response)
    path = os.path.join(destination_path, fname)
    size = 0

    # Initialize the download size progress bar
    size_mb = 0
    unit_mb = units.MiB
    total_size = int(response.headers.get('Content-length', 0)) // unit_mb
    self._pbar_dl_size.update_total(total_size)
    with tf.io.gfile.GFile(path, 'wb') as file_:
      checksum = self._checksumer_cls()
      for block in iter_content:
        size += len(block)
        checksum.update(block)
        file_.write(block)

        # Update the download size progress bar
        size_mb += len(block)
        if size_mb > unit_mb:
          self._pbar_dl_size.update(size_mb // unit_mb)
          size_mb %= unit_mb
  self._pbar_url.update(1)
  return checksums_lib.UrlInfo(
      checksum=checksum.hexdigest(),
      size=size,
      filename=fname,
  )
def _read_url_info(url_path: str) -> checksums.UrlInfo:
  """Loads the `UrlInfo` from the `.INFO` file.

  Args:
    url_path: path of the downloaded url (its `.INFO` sidecar is read).

  Returns:
    The `UrlInfo` recorded in the `.INFO` file.

  Raises:
    ValueError: if the `.INFO` file has no `url_info` entry (files downloaded
      with TFDS <= 3.1.0).
  """
  file_info = resource_lib.read_info_file(url_path)
  if 'url_info' not in file_info:
    # Bug fix: the original message contained an unformatted '{}' placeholder;
    # interpolate the path (and fix typos) so the error is actionable.
    raise ValueError(
        f'Could not find `url_info` in {url_path}. This likely indicates '
        'that the files were downloaded with a previous version of TFDS '
        '(<=3.1.0). '
    )
  return checksums.UrlInfo(**file_info['url_info'])
def test_checksums(tmp_path: pathlib.Path):
  """Round-trips `UrlInfo`s through save/load on a checksums file."""
  checksums_path = tmp_path / 'checksums.tsv'
  expected = {
      'http://abc.org/data': checksums.UrlInfo(
          checksum='abcd',
          size=1234,
          filename='a.zip',
      ),
      'http://edf.org/data': checksums.UrlInfo(
          checksum='abcd',
          size=1234,
          filename='b.zip',
      ),
  }
  checksums.save_url_infos(checksums_path, expected)
  assert checksums.load_url_infos(checksums_path) == expected
def _read_url_info(url_path: type_utils.PathLike) -> checksums.UrlInfo:
  """Loads the `UrlInfo` from the `.INFO` file.

  Args:
    url_path: path of the downloaded url (its `.INFO` sidecar is read).

  Returns:
    The `UrlInfo` recorded in the `.INFO` file.

  Raises:
    ValueError: if the `.INFO` file has no `url_info` entry (files downloaded
      with TFDS <= 3.1.0).
  """
  file_info = resource_lib.read_info_file(url_path)
  if 'url_info' not in file_info:
    # Bug fix: the original message contained an unformatted '{}' placeholder;
    # interpolate the path (and fix typos) so the error is actionable.
    raise ValueError(
        f'Could not find `url_info` in {url_path}. This likely indicates '
        'that the files were downloaded with a previous version of TFDS '
        '(<=3.1.0). '
    )
  url_info = file_info['url_info']
  # `.INFO` files written before `filename` was recorded lack the key;
  # default it so `UrlInfo(**url_info)` still constructs.
  url_info.setdefault('filename', None)
  return checksums.UrlInfo(**url_info)
def __init__(self, name, url=None):
  """Fake artifact: derives url, checksum info and cache paths from `name`."""
  url = url or f'http://foo-bar.ch/{name}'
  body = f'content of {name}'
  self.url = url
  self.url_info = checksums_lib.UrlInfo(
      size=len(body),
      checksum=_sha256(body),
  )
  # Cached-download filenames are derived from the url plus a hash: the
  # content checksum for the final file, the url hash for the `.INFO` entry.
  self.file_name = resource_lib.get_dl_fname(url, self.url_info.checksum)
  self.url_name = resource_lib.get_dl_fname(url, _sha256(url))
  self.file_path = f'/dl_dir/{self.file_name}'
  self.url_path = f'/dl_dir/{self.url_name}'
def test_compute_url_info():
  """`compute_url_info` returns checksum, size and filename of a file."""
  png_path = utils.tfds_path() / 'testing/test_data/6pixels.png'
  expected = checksums.UrlInfo(
      checksum=(
          '04f38ebed34d3b027d2683193766155912fba647158c583c3bdb4597ad8af34c'),
      size=utils.Size(102),
      filename='6pixels.png',
  )
  actual = checksums.compute_url_info(png_path, checksum_cls=hashlib.sha256)
  assert actual == expected
  assert actual.filename == expected.filename
def test_wrong_checksum(self):
  """Download fails when the registered checksum does not match the file."""
  artifact = Artifact('a.tar.gz')
  wrong_sha = _sha256('content of another file')
  self.dl_results[artifact.url] = artifact.url_info
  manager = self._get_manager(
      register_checksums=False,
      url_infos={
          artifact.url: checksums_lib.UrlInfo(
              size=artifact.url_info.size,
              checksum=wrong_sha,
          ),
      },
  )
  with self.assertRaises(dm.NonMatchingChecksumError):
    manager.download(artifact.url)
def _sync_file_copy(
    self,
    filepath: str,
    destination_path: str,
) -> DownloadResult:
  """Copies `filepath` into `destination_path`.

  Args:
    filepath: source path (any filesystem `tf.io.gfile` understands).
    destination_path: directory in which to place the copy.

  Returns:
    A `DownloadResult` holding the copy's path and its computed `UrlInfo`.
  """
  basename = os.path.basename(filepath)
  target = os.path.join(destination_path, basename)
  tf.io.gfile.copy(filepath, target)
  digest, num_bytes = utils.read_checksum_digest(
      target, checksum_cls=self._checksumer_cls)
  url_info = checksums_lib.UrlInfo(
      checksum=digest,
      size=num_bytes,
      filename=basename,
  )
  return DownloadResult(path=utils.as_path(target), url_info=url_info)
def _sync_kaggle_download(self, kaggle_url, destination_path):
  """Download with Kaggle API."""
  kaggle_file = kaggle.KaggleFile.from_url(kaggle_url)
  downloader = self.kaggle_downloader(kaggle_file.competition)
  filepath = downloader.download_file(kaggle_file.filename, destination_path)

  dl_size = tf.io.gfile.stat(filepath).length
  checksum = self._checksumer_cls()
  with tf.io.gfile.GFile(filepath, 'rb') as f:
    # Read until EOF: `iter` with a sentinel stops on the empty bytes chunk.
    for block in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
      checksum.update(block)
  return checksums_lib.UrlInfo(
      checksum=checksum.hexdigest(),
      size=dl_size,
  )