def _read_url_info(url_path: epath.PathLike) -> checksums.UrlInfo:
  """Loads the `UrlInfo` from the `.INFO` file.

  Args:
    url_path: Path of the downloaded file whose companion `.INFO` file holds
      the recorded `url_info` metadata.

  Returns:
    The `checksums.UrlInfo` reconstructed from the `.INFO` file content.

  Raises:
    ValueError: If the `.INFO` file has no `url_info` field, which likely
      means the file was downloaded with TFDS <= 3.1.0.
  """
  file_info = resource_lib.read_info_file(url_path)
  if 'url_info' not in file_info:
    # Bug fix: the message previously contained a literal, never-filled `{}`
    # placeholder (no `.format()` call) and a "where"/"were" typo.
    raise ValueError(
        f'Could not find `url_info` in {url_path}. This likely indicates that '
        'the files were downloaded with a previous version of TFDS (<=3.1.0). '
    )
  url_info = file_info['url_info']
  # Older `.INFO` files may lack the filename field; normalize to `None`.
  url_info.setdefault('filename', None)
  # Size is serialized as a plain int in the `.INFO` file; wrap it back.
  url_info['size'] = utils.Size(url_info['size'])
  return checksums.UrlInfo(**url_info)
def test_compute_url_info():
  """Checks that `compute_url_info` yields the expected checksum/size/name."""
  png_path = utils.tfds_path() / 'testing/test_data/6pixels.png'
  actual = checksums.compute_url_info(png_path, checksum_cls=hashlib.sha256)
  expected = checksums.UrlInfo(
      checksum='04f38ebed34d3b027d2683193766155912fba647158c583c3bdb4597ad8af34c',
      size=utils.Size(102),
      filename='6pixels.png',
  )
  assert actual == expected
  assert actual.filename == expected.filename
def compute_url_info(
    path: utils.PathLike,
    checksum_cls=hashlib.sha256,
) -> UrlInfo:
  """Locally compute size, checksums of the given file."""
  path = utils.as_path(path)
  digest = checksum_cls()
  total_bytes = 0
  with path.open('rb') as f:
    # Read fixed-size chunks until EOF; `iter` stops on the empty-bytes
    # sentinel, so the final empty read is never hashed (same as before).
    for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
      total_bytes += len(chunk)
      digest.update(chunk)
  return UrlInfo(
      checksum=digest.hexdigest(),  # base64 digest would have been better.
      size=utils.Size(total_bytes),
      filename=path.name,
  )
def _parse_url_infos(checksums_file: Iterable[str]) -> Dict[str, UrlInfo]:
  """Returns {URL: (size, checksum)}s stored within given file."""
  url_infos = {}
  for raw_line in checksums_file:
    entry = raw_line.strip()  # Remove the trailing '\r' on Windows OS.
    if not entry or entry.startswith('#'):
      continue  # Skip blank lines and comments.
    fields = entry.split('\t')
    if len(fields) == 1:  # not enough values to unpack (legacy files)
      # URL might have spaces inside, but size and checksum will not.
      fields = entry.rsplit(' ', 2)
    if len(fields) == 4:
      url, size, checksum, filename = fields
    elif len(fields) == 3:
      url, size, checksum = fields
      filename = None
    else:
      raise AssertionError(f'Error parsing checksums: {fields}')
    url_infos[url] = UrlInfo(
        size=utils.Size(size),
        checksum=checksum,
        filename=filename,
    )
  return url_infos
def download_size(self) -> utils.Size:
  """Downloaded files size, in bytes."""
  # Fallback to deprecated `size_in_bytes` if `download_size` is empty.
  num_bytes = self.as_proto.download_size
  if not num_bytes:
    num_bytes = self.as_proto.size_in_bytes
  return utils.Size(num_bytes)
def dataset_size(self) -> utils.Size:
  """Generated dataset files size, in bytes."""
  # For old datasets, maybe empty.
  total_bytes = sum(s.num_bytes for s in self.splits.values())
  return utils.Size(total_bytes)
def _sync_download(self, url: str, destination_path: str, verify: bool = True) -> DownloadResult:
  """Synchronous version of `download` method.

  To download through a proxy, the `HTTP_PROXY`, `HTTPS_PROXY`,
  `REQUESTS_CA_BUNDLE`,... environment variables can be exported, as
  described in:
  https://requests.readthedocs.io/en/master/user/advanced/#proxies

  Args:
    url: url to download
    destination_path: path where to write it
    verify: whether to verify ssl certificates

  Returns:
    The `DownloadResult` with the local path of the written file and its
    computed `UrlInfo` (checksum, size, filename).

  Raises:
    DownloadError: when download fails.
  """
  try:
    # If url is on a filesystem that gfile understands, use copy. Otherwise,
    # use requests (http) or urllib (ftp).
    if not url.startswith('http'):
      return self._sync_file_copy(url, destination_path)
  except tf.errors.UnimplementedError:
    # gfile cannot handle this scheme; fall through to the HTTP path below.
    pass
  with _open_url(url, verify=verify) as (response, iter_content):
    # The server-provided filename decides where the file lands inside
    # `destination_path`.
    fname = _get_filename(response)
    path = os.path.join(destination_path, fname)
    size = 0
    # Initialize the download size progress bar
    size_mb = 0
    unit_mb = units.MiB
    # Content-length may be absent; 0 then means "unknown total".
    total_size = int(response.headers.get('Content-length', 0)) // unit_mb
    self._pbar_dl_size.update_total(total_size)
    with tf.io.gfile.GFile(path, 'wb') as file_:
      # Checksum is computed incrementally while streaming, so the file is
      # never re-read after download.
      checksum = self._checksumer_cls()
      for block in iter_content:
        size += len(block)
        checksum.update(block)
        file_.write(block)
        # Update the download size progress bar
        size_mb += len(block)
        if size_mb > unit_mb:
          # Only report whole MiB increments; keep the remainder for the
          # next block so no bytes are dropped from the progress count.
          self._pbar_dl_size.update(size_mb // unit_mb)
          size_mb %= unit_mb
  self._pbar_url.update(1)
  return DownloadResult(
      path=utils.as_path(path),
      url_info=checksums_lib.UrlInfo(
          checksum=checksum.hexdigest(),
          size=utils.Size(size),
          filename=fname,
      ),
  )