def test_extract(self):
  """One file already extracted, one file with NO_EXTRACT, one to extract."""
  resource_cached = resource_lib.Resource(path='/dl_dir/cached',
                                          extract_method=ZIP)
  resource_new = resource_lib.Resource(path='/dl_dir/new', extract_method=TAR)
  resource_noextract = resource_lib.Resource(path='/dl_dir/noextract',
                                             extract_method=NO_EXTRACT)
  files = {
      'cached': resource_cached,
      'new': resource_new,
      'noextract': resource_noextract,
  }
  self.existing_paths.append('/extract_dir/ZIP.%s' % resource_cached.fname)
  extracted_new, self.extract_results['/dl_dir/%s' % resource_new.fname] = (
      _get_promise_on_event('/extract_dir/TAR.new'))
  manager = self._get_manager()
  extracted_new.set()
  res = manager.extract(files)
  expected = {
      'cached': '/extract_dir/ZIP.%s' % resource_cached.fname,
      'new': '/extract_dir/TAR.%s' % resource_new.fname,
      'noextract': '/dl_dir/%s' % resource_noextract.fname,
  }
  self.assertEqual(res, expected)
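# These tests rely on a `_get_promise_on_event` helper that is not shown in
# this section. A minimal sketch of what such a helper could look like,
# assuming the `promise` library and the usage pattern seen here: it returns a
# threading.Event plus a Promise that resolves to `result` once the event is
# set.
import threading

import promise


def _get_promise_on_event(result=None, error=None):
  """Returns (event, Promise). The Promise is fulfilled when `event.set()`."""
  event = threading.Event()

  def callback(resolve, reject):
    def inside():
      # Block until the test flips the event, then settle the promise.
      event.wait()
      if error is not None:
        reject(error)
      else:
        resolve(result)

    thread = threading.Thread(target=inside)
    thread.daemon = True
    thread.start()

  return event, promise.Promise(callback)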
def test_download_and_extract(self):
  url_a = 'http://a/a.zip'
  url_b = 'http://b/b'
  sha_contenta = _sha256('content from a.zip')
  sha_contentb = _sha256('content from b')
  resource_a = resource_lib.Resource(url=url_a)
  resource_a.sha256 = sha_contenta
  resource_b = resource_lib.Resource(url=url_b)
  resource_b.sha256 = sha_contentb
  self.file_names[resource_a.fname] = 'a.zip'
  dl_a, self.dl_results[url_a] = _get_promise_on_event((sha_contenta, 10))
  dl_b, self.dl_results[url_b] = _get_promise_on_event((sha_contentb, 10))
  ext_a, self.extract_results['/dl_dir/%s' % resource_a.fname] = (
      _get_promise_on_event('/extract_dir/ZIP.%s' % resource_a.fname))
  # url_b doesn't need any extraction.
  for event in [dl_a, dl_b, ext_a]:
    event.set()
  manager = self._get_manager()
  manager._checksums[url_a] = sha_contenta
  manager._checksums[url_b] = sha_contentb
  res = manager.download_and_extract({'a': url_a, 'b': url_b})
  expected = {
      'a': '/extract_dir/ZIP.%s' % resource_a.fname,
      'b': '/dl_dir/%s' % resource_b.fname,
  }
  self.assertEqual(res, expected)
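# `_sha256` above is another small test helper not defined in this section; a
# minimal sketch, assuming it simply hashes a text string:
import hashlib


def _sha256(str_):
  return hashlib.sha256(str_.encode('utf8')).hexdigest()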
def test_download(self):
  """One file in cache, one not."""
  urls = {
      'cached': resource_lib.Resource(url='http://a.ch/a'),
      'new': resource_lib.Resource(url='https://a.ch/b'),
      # INFO file of c has been deleted:
      'info_deleted': resource_lib.Resource(url='https://a.ch/c'),
  }
  urla_sha256 = _sha256('http://a.ch/a')
  urlb_sha256 = _sha256('https://a.ch/b')
  urlc_sha256 = _sha256('https://a.ch/c')
  _ = [
      self._add_file(path) for path in [
          '/dl_dir/%s' % urla_sha256,
          '/dl_dir/%s.INFO' % urla_sha256,
          '/dl_dir/%s' % urlc_sha256,
      ]
  ]
  downloaded_b, self.dl_results['https://a.ch/b'] = _get_promise_on_event(
      ('sha_b', 10))
  downloaded_c, self.dl_results['https://a.ch/c'] = _get_promise_on_event(
      ('sha_c', 10))
  manager = self._get_manager()
  res = manager.download(urls, async_=True)
  self.assertFalse(res.is_fulfilled)
  downloaded_b.set()
  downloaded_c.set()
  downloads = res.get()
  self.assertEqual(downloads, {
      'cached': '/dl_dir/%s' % urla_sha256,
      'new': '/dl_dir/%s' % urlb_sha256,
      'info_deleted': '/dl_dir/%s' % urlc_sha256,
  })
def __init__(self, name, url=None):
  url = url or 'http://foo-bar.ch/%s' % name
  content = 'content of %s' % name
  self.url = url
  self.content = content
  self.size = len(content)
  self.sha = _sha256(content)
  self.size_checksum = (self.size, self.sha)
  self.checksum_size = (self.sha, self.size)
  self.resource = resource_lib.Resource(url=url)
  self.resource_sha = resource_lib.Resource(url=url)
  self.resource_sha.sha256 = self.sha
def _test_extract(self, method, archive_name, expected_files):
  from_path = os.path.join(self.test_data, 'archives', archive_name)
  resource = resource_lib.Resource(path=from_path, extract_method=method)
  self.extractor.extract(resource, self.to_path).get()
  for name, content in expected_files.items():
    path = os.path.join(self.to_path, name)
    self.assertEqual(_read(path), content,
                     'File %s has bad content.' % path)
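# The extractor tests use a `_read` helper that is not shown here. A plausible
# sketch, assuming it simply returns the content of a file (text mode assumed;
# the real helper may read bytes instead):
import tensorflow as tf


def _read(path):
  with tf.io.gfile.GFile(path) as f:
    return f.read()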
def test_gzip(self):
  from_path = os.path.join(self.test_data, 'archives', 'arch1.tar.gz')
  resource = resource_lib.Resource(
      path=from_path, extract_method=resource_lib.ExtractMethod.GZIP)
  self.extractor.extract(resource, self.to_path).get()
  arch1_path = os.path.join(self.test_data, 'archives', 'arch1.tar')
  self.assertEqual(_read(self.to_path), _read(arch1_path))
def test_bzip2(self):
  from_path = os.path.join(self.test_data, 'archives', 'foo.csv.bz2')
  resource = resource_lib.Resource(
      path=from_path, extract_method=resource_lib.ExtractMethod.BZIP2)
  self.extractor.extract(resource, self.to_path).get()
  foo_csv_path = os.path.join(self.test_data, 'foo.csv')
  self.assertEqual(_read(self.to_path), _read(foo_csv_path))
def test_force_download_and_extract(self):
  url = 'http://a/b.tar.gz'
  resource_ = resource_lib.Resource(url=url)
  resource_.sha256 = _sha256('content of file')
  # resource was already downloaded / extracted:
  self.existing_paths = ['/dl_dir/%s' % resource_.fname,
                         '/extract_dir/TAR_GZ.%s' % resource_.fname]
  self.file_names[resource_.fname] = 'b.tar.gz'
  self._write_info('/dl_dir/%s.INFO' % resource_.fname,
                   {'original_fname': 'b.tar.gz'})
  dl_a, self.dl_results[url] = _get_promise_on_event((resource_.sha256, 10))
  ext_a, self.extract_results['/dl_dir/%s' % resource_.fname] = (
      _get_promise_on_event('/extract_dir/TAR_GZ.%s' % resource_.fname))
  dl_a.set()
  ext_a.set()
  manager = self._get_manager(force_download=True, force_extraction=True,
                              checksums={url: resource_.sha256})
  res = manager.download_and_extract(url)
  expected = '/extract_dir/TAR_GZ.%s' % resource_.fname
  self.assertEqual(expected, res)
  # Rename after download:
  (from_, to), kwargs = self.gfile.rename.call_args
  self.assertTrue(re.match(
      r'/dl_dir/%s\.tmp\.[a-h0-9]{32}/b.tar.gz' % resource_.fname, from_))
  self.assertEqual('/dl_dir/%s' % resource_.fname, to)
  self.assertEqual(kwargs, {'overwrite': True})
  self.assertEqual(1, self.downloader_download.call_count)
  self.assertEqual(1, self.extractor_extract.call_count)
def setUp(self):
  super(DownloaderTest, self).setUp()
  self.addCleanup(mock.patch.stopall)
  self.downloader = downloader.get_downloader(10, hashlib.sha256)
  self.tmp_dir = tempfile.mkdtemp(dir=tf.compat.v1.test.get_temp_dir())
  self.url = 'http://example.com/foo.tar.gz'
  self.resource = resource_lib.Resource(url=self.url)
  self.path = os.path.join(self.tmp_dir, 'foo.tar.gz')
  self.incomplete_path = '%s.incomplete' % self.path
  self.response = b'This \nis an \nawesome\n response!'
  self.resp_checksum = hashlib.sha256(self.response).hexdigest()
  self.cookies = {}
  mock.patch.object(
      downloader.requests.Session, 'get',
      lambda *a, **kw: _FakeResponse(self.url, self.response, self.cookies),
  ).start()
  self.downloader._pbar_url = mock.MagicMock()
  self.downloader._pbar_dl_size = mock.MagicMock()
  mock.patch.object(
      downloader.urllib.request, 'urlopen',
      lambda *a, **kw: _FakeResponse(self.url, self.response, self.cookies),
  ).start()
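# `_FakeResponse` stands in for both `requests` and `urllib` responses in
# these downloader tests, but it is not defined in this section. A rough
# sketch, assuming the attributes a downloader typically touches (url,
# cookies, streamed content); attribute names beyond those visible above are
# assumptions:
import io


class _FakeResponse(object):

  def __init__(self, url, content, cookies=None, headers=None,
               status_code=200):
    self.url = url
    self.raw = io.BytesIO(content)
    self.cookies = cookies or {}
    self.headers = headers or {'Content-length': 12345}
    self.status_code = status_code
    # Let the same object double as a urllib-style response.
    self.read = self.raw.read

  def iter_content(self, chunk_size):
    # Stream the canned payload, ignoring the requested chunk size.
    del chunk_size
    for line in self.raw:
      yield line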
def _download(self, resource):
  """Download resource, returns Promise->path to downloaded file."""
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(url=resource)
  url = resource.url
  if url in self._sizes_checksums:
    expected_sha256 = self._sizes_checksums[url][1]
    download_path = self._get_final_dl_path(url, expected_sha256)
    if not self._force_download and resource.exists_locally(download_path):
      logging.info('URL %s already downloaded: reusing %s.', url,
                   download_path)
      self._recorded_sizes_checksums[url] = self._sizes_checksums[url]
      return promise.Promise.resolve(download_path)
  # There is a slight difference between downloader and extractor here:
  # the extractor manages its own temp directory, while the DownloadManager
  # manages the temp directory of downloader.
  download_dir_path = os.path.join(
      self._download_dir,
      '%s.tmp.%s' % (resource_lib.get_dl_dirname(url), uuid.uuid4().hex))
  tf.io.gfile.makedirs(download_dir_path)
  logging.info('Downloading %s into %s...', url, download_dir_path)

  def callback(val):
    checksum, dl_size = val
    return self._handle_download_result(resource, download_dir_path, checksum,
                                        dl_size)

  return self._downloader.download(url, download_dir_path).then(callback)
def setUp(self):
  self.addCleanup(absltest.mock.patch.stopall)
  self.downloader = downloader.get_downloader(10, hashlib.sha256)
  self.tmp_dir = tempfile.mkdtemp(dir=tf.compat.v1.test.get_temp_dir())
  self.url = 'http://example.com/foo.tar.gz'
  self.resource = resource_lib.Resource(url=self.url)
  self.path = os.path.join(self.tmp_dir, 'foo.tar.gz')
  self.incomplete_path = '%s.incomplete' % self.path
  self.response = b'This \nis an \nawesome\n response!'
  self.resp_checksum = hashlib.sha256(self.response).hexdigest()
  self.cookies = {}
  absltest.mock.patch.object(
      downloader.requests.Session, 'get',
      lambda *a, **kw: _FakeResponse(self.url, self.response, self.cookies),
  ).start()
  self.downloader._pbar_url = absltest.mock.MagicMock()
  self.downloader._pbar_dl_size = absltest.mock.MagicMock()
  absltest.mock.patch.object(
      downloader.urllib.request, 'urlopen',
      lambda *a, **kw: _FakeResponse(self.url, self.response, self.cookies),
  ).start()
  if not hasattr(downloader.ssl, '_create_unverified_context'):
    # Avoid errors on Python <= 2.7.8, which lacks these SSLContext functions,
    # by injecting placeholders before mocking them.
    downloader.ssl.__dict__['_create_unverified_context'] = None
    downloader.ssl.__dict__['create_default_context'] = None
  # Return dummy SSL contexts for testing.
  absltest.mock.patch.object(downloader.ssl, '_create_unverified_context',
                             lambda *a, **kw: 'skip_ssl').start()
  absltest.mock.patch.object(downloader.ssl, 'create_default_context',
                             lambda *a, **kw: 'use_ssl').start()
def _download(self, resource):
  """Download resource, returns Promise->path to downloaded file."""
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(url=resource)
  resource.sha256 = self._checksums.get(resource.url, None)
  if not resource.path:
    resource.path = os.path.join(self._download_dir, resource.fname)
  if not self._force_download and resource.exists_locally():
    logging.info('URL %s already downloaded: reusing %s.', resource.url,
                 resource.path)
    if self._record_checksum_size:
      logging.info('Reading checksum and size of %s ...', resource.path)
      checksum, dl_size = utils.read_checksum_digest(resource.path)
      self._handle_download_result(resource, None, checksum, dl_size,
                                   existing=True)
    return promise.Promise.resolve(resource.path)
  # There is a slight difference between downloader and extractor here:
  # the extractor manages its own temp directory, while the DownloadManager
  # manages the temp directory of downloader.
  tmp_dir_path = '%s.tmp.%s' % (resource.path, uuid.uuid4().hex)
  tf.io.gfile.makedirs(tmp_dir_path)
  logging.info('Downloading %s into %s...', resource.url, tmp_dir_path)

  def callback(val):
    checksum, dl_size = val
    return self._handle_download_result(resource, tmp_dir_path, checksum,
                                        dl_size)

  return self._downloader.download(resource, tmp_dir_path).then(callback)
def setUp(self):
  self.addCleanup(absltest.mock.patch.stopall)
  self.downloader = downloader.get_downloader(10, hashlib.sha256)
  self.tmp_dir = tempfile.mkdtemp(dir=tf.compat.v1.test.get_temp_dir())
  self.url = 'http://example.com/foo.tar.gz'
  self.resource = resource_lib.Resource(url=self.url)
  self.path = os.path.join(self.tmp_dir, 'foo.tar.gz')
  self.incomplete_path = '%s.incomplete' % self.path
  self.response = b'This \nis an \nawesome\n response!'
  self.resp_checksum = hashlib.sha256(self.response).hexdigest()
  self.cookies = {}
  absltest.mock.patch.object(
      downloader.requests.Session, 'get',
      lambda *a, **kw: _FakeResponse(self.url, self.response, self.cookies),
  ).start()
  self.downloader._pbar_url = absltest.mock.MagicMock()
  self.downloader._pbar_dl_size = absltest.mock.MagicMock()

  def write_fake_ftp_result(_, filename):
    with open(filename, 'wb') as result:
      result.write(self.response)

  absltest.mock.patch.object(
      downloader.urllib.request, 'urlretrieve',
      write_fake_ftp_result,
  ).start()
def test_absolute_path(self):
  from_path = os.path.join(self.test_data, 'archives', 'absolute_path.tar')
  resource = resource_lib.Resource(
      path=from_path, extract_method=resource_lib.ExtractMethod.TAR)
  promise = self.extractor.extract(resource, self.to_path)
  with self.assertRaisesRegex(extractor.ExtractError,
                              'Archive at .* is not safe'):
    promise.get()
def test_gzip2(self):
  # Same as previous test, except it is not a .tar.gz, but a .gz.
  from_path = os.path.join(self.test_data, 'archives', 'foo.csv.gz')
  resource = resource_lib.Resource(
      path=from_path, extract_method=resource_lib.ExtractMethod.GZIP)
  self.extractor.extract(resource, self.to_path).get()
  foo_csv_path = os.path.join(self.test_data, 'foo.csv')
  self.assertEqual(_read(self.to_path), _read(foo_csv_path))
def _download_extract(self, resource):
  """Download-extract `Resource` or url, returns Promise->path."""
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(url=resource)

  def callback(path):
    resource.path = path
    return self._extract(resource)

  return self._download(resource).then(callback)
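# For context, the download-then-extract chaining above is what backs the
# public `download_and_extract` API. A short usage sketch from the user side,
# assuming `tfds.download.DownloadManager`; the directory and URL are
# hypothetical:
import tensorflow_datasets as tfds

dl_manager = tfds.download.DownloadManager(download_dir='/tmp/dl_dir')
# Downloads the archive, then extracts it, returning the extraction path.
extracted_path = dl_manager.download_and_extract('https://example.com/data.zip')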
def test_ftp(self):
  resource = resource_lib.Resource(
      url='ftp://*****:*****@example.com/foo.tar.gz')
  promise = self.downloader.download(resource, self.tmp_dir)
  checksum, _ = promise.get()
  self.assertEqual(checksum, self.resp_checksum)
  with open(self.path, 'rb') as result:
    self.assertEqual(result.read(), self.response)
  self.assertFalse(tf.io.gfile.exists(self.incomplete_path))
def test_drive_no_cookies(self):
  resource = resource_lib.Resource(
      url='https://drive.google.com/uc?export=download&id=a1b2bc3')
  promise = self.downloader.download(resource, self.tmp_dir)
  checksum, _ = promise.get()
  self.assertEqual(checksum, self.resp_checksum)
  with open(self.path, 'rb') as result:
    self.assertEqual(result.read(), self.response)
  self.assertFalse(tf.io.gfile.exists(self.incomplete_path))
def get_modelnet40_aligned_data_dir(dl_manager=None):
  dl_manager = dl_manager or get_dl_manager(dataset_name="modelnet40_aligned")
  path = dl_manager.download(
      "https://lmb.informatik.uni-freiburg.de/resources/datasets/ORION/modelnet40_manually_aligned.tar"
  )
  folder = dl_manager.extract(
      resource_lib.Resource(
          path=path, extract_method=resource_lib.ExtractMethod.TAR_GZ))
  return folder
def test_kaggle_api(self):
  fname = 'a.csv'
  with testing.mock_kaggle_api(filenames=[fname, 'b.txt']):
    resource = resource_lib.Resource(url='kaggle://some-competition/a.csv')
    promise = self.downloader.download(resource, self.tmp_dir)
    _, dl_size = promise.get()
    self.assertEqual(dl_size, len(fname))
    with tf.io.gfile.GFile(os.path.join(self.tmp_dir, fname)) as f:
      self.assertEqual(fname, f.read())
def test_wrong_method(self):
  from_path = os.path.join(self.test_data, 'archives', 'foo.csv.gz')
  resource = resource_lib.Resource(
      path=from_path, extract_method=resource_lib.ExtractMethod.ZIP,
      url='http://example.com/foo.zip')
  promise = self.extractor.extract(resource, self.to_path)
  expected_msg = re.escape(
      'foo.csv.gz (http://example.com/foo.zip): File is not a zip file.')
  with self.assertRaisesRegex(extractor.ExtractError, expected_msg):
    promise.get()
def test_ftp_error(self):
  error = downloader.urllib.error.URLError('Problem serving file.')
  absltest.mock.patch.object(
      downloader.urllib.request, 'urlretrieve',
      side_effect=error,
  ).start()
  resource = resource_lib.Resource(url='ftp://example.com/foo.tar.gz')
  promise = self.downloader.download(resource, self.tmp_dir)
  with self.assertRaises(downloader.urllib.error.URLError):
    promise.get()
def test_extract(self):
  """One file already extracted, one file with NO_EXTRACT, one to extract."""
  cached = resource_lib.Resource(path='/dl_dir/cached', extract_method=ZIP)
  new_ = resource_lib.Resource(path='/dl_dir/new', extract_method=TAR)
  no_extract = resource_lib.Resource(path='/dl_dir/noextract',
                                     extract_method=NO_EXTRACT)
  self.fs.add_file('/extract_dir/ZIP.cached')
  self.extract_results['/dl_dir/new'] = '/extract_dir/TAR.new'
  manager = self._get_manager()
  res = manager.extract({
      'cached': cached,
      'new': new_,
      'noextract': no_extract,
  })
  expected = _as_path({
      'cached': '/extract_dir/ZIP.cached',
      'new': '/extract_dir/TAR.new',
      'noextract': '/dl_dir/noextract',
  })
  self.assertEqual(res, expected)
  self.assertCountEqual(self.extracted_paths, [_as_path('/dl_dir/new')])
def test_download(self):
  """One file in cache, one not."""
  urls = {
      'cached': resource_lib.Resource(url='http://a.ch/a'),
      'new': resource_lib.Resource(url='https://a.ch/b'),
      # INFO file of c has been deleted:
      'info_deleted': resource_lib.Resource(url='https://a.ch/c'),
  }
  afname = resource_lib.Resource(url='http://a.ch/a').fname
  bfname = resource_lib.Resource(url='https://a.ch/b').fname
  cfname = resource_lib.Resource(url='https://a.ch/c').fname
  _ = [
      self._add_file(path) for path in [  # pylint: disable=g-complex-comprehension
          '/dl_dir/%s' % afname,
          '/dl_dir/%s.INFO' % afname,
          '/dl_dir/%s' % cfname,
      ]
  ]
  downloaded_b, self.dl_results['https://a.ch/b'] = _get_promise_on_event(
      ('sha_b', 10))
  downloaded_c, self.dl_results['https://a.ch/c'] = _get_promise_on_event(
      ('sha_c', 10))
  manager = self._get_manager()
  downloaded_b.set()
  downloaded_c.set()
  downloads = manager.download(urls)
  expected = {
      'cached': '/dl_dir/%s' % afname,
      'new': '/dl_dir/%s' % bfname,
      'info_deleted': '/dl_dir/%s' % cfname,
  }
  self.assertEqual(downloads, expected)
def _get_base_resource(manual_dir):
  # dl_manager doesn't like dropbox apparently...
  path = os.path.join(manual_dir, "h36m.zip")
  if not tf.io.gfile.exists(path):
    if not tf.io.gfile.exists(manual_dir):
      tf.io.gfile.makedirs(manual_dir)
    url = "https://www.dropbox.com/s/e35qv3n6zlkouki/h36m.zip"
    ex = "wget -O %s %s" % (path, url)
    msg = ("Please manually download the file from %s and place it at %s, "
           "e.g.\n%s" % (url, path, ex))
    raise AssertionError(msg)
  return resource_lib.Resource(path=path,
                               extract_method=resource_lib.ExtractMethod.ZIP)
def _extract(self, resource):
  """Extract a single archive, returns Promise->path to extraction result."""
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(path=resource)
  if resource.extract_method == resource_lib.ExtractMethod.NO_EXTRACT:
    logging.info('Skipping extraction for %s (method=NO_EXTRACT).',
                 resource.path)
    return promise.Promise.resolve(resource.path)
  extract_path = os.path.join(self._extract_dir, resource.extract_fname)
  if not self._force_extraction and tf.io.gfile.exists(extract_path):
    logging.info('Reusing extraction of %s at %s.', resource.path,
                 extract_path)
    return promise.Promise.resolve(extract_path)
  return self._extractor.extract(resource, extract_path)
def test_download_and_extract_already_downloaded(self):
  url_a = 'http://a/a.zip'
  resource_a = resource_lib.Resource(url=url_a)
  self.file_names[resource_a.fname] = 'a.zip'
  # File was already downloaded:
  self._add_file('/dl_dir/%s' % resource_a.fname)
  self._write_info('/dl_dir/%s.INFO' % resource_a.fname,
                   {'original_fname': 'a.zip'})
  ext_a, self.extract_results['/dl_dir/%s' % resource_a.fname] = (
      _get_promise_on_event('/extract_dir/ZIP.%s' % resource_a.fname))
  ext_a.set()
  manager = self._get_manager()
  res = manager.download_and_extract(url_a)
  self.assertEqual(res, '/extract_dir/ZIP.%s' % resource_a.fname)
def test_download(self):
  """One file in cache, one not."""
  resource_a = resource_lib.Resource(url='http://a.ch/a')
  resource_a.sha256 = _sha256('some content')
  resource_b = resource_lib.Resource(url='http://a.ch/b')
  resource_b.sha256 = _sha256('content of b')
  resource_c = resource_lib.Resource(url='http://a.ch/c')
  resource_c.sha256 = _sha256('content of c')
  urls = {
      'cached': resource_a,
      'new': resource_lib.Resource(url='https://a.ch/b'),
      # INFO file of c has been deleted:
      'info_deleted': resource_lib.Resource(url='https://a.ch/c'),
  }
  _ = [self._add_file(path, content) for path, content in [  # pylint: disable=g-complex-comprehension
      ('/dl_dir/%s' % resource_a.fname, 'content of a'),
      ('/dl_dir/%s.INFO' % resource_a.fname, 'content of info file for a'),
      ('/dl_dir/%s' % resource_c.fname, 'content of c'),
  ]]
  downloaded_b, self.dl_results['https://a.ch/b'] = _get_promise_on_event(
      (_sha256('content of b'), 10))
  downloaded_c, self.dl_results['https://a.ch/c'] = _get_promise_on_event(
      (_sha256('content of c'), 10))
  manager = self._get_manager(checksums={
      resource_a.url: resource_a.sha256,
      'https://a.ch/b': resource_b.sha256,
      'https://a.ch/c': resource_c.sha256,
  })
  downloaded_b.set()
  downloaded_c.set()
  downloads = manager.download(urls)
  expected = {
      'cached': '/dl_dir/%s' % resource_a.fname,
      'new': '/dl_dir/%s' % resource_b.fname,
      'info_deleted': '/dl_dir/%s' % resource_c.fname,
  }
  self.assertEqual(downloads, expected)
def iter_archive(self, resource):
  """Returns iterator over files within archive.

  **Important Note**: caller should read files as they are yielded.
  Reading out of order is slow.

  Args:
    resource: path to archive or `tfds.download.Resource`.

  Returns:
    Generator yielding tuple (path_within_archive, file_obj).
  """
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(path=resource)
  return extractor.iter_archive(resource.path, resource.extract_method)
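# A short usage sketch for `iter_archive`, following its docstring above.
# `dl_manager` is assumed to be a DownloadManager instance and the archive
# path is hypothetical:
for fname, fobj in dl_manager.iter_archive('/dl_dir/images.tar'):
  if fname.endswith('.jpg'):
    image_bytes = fobj.read()  # Read each file as it is yielded.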
def _extract(self, resource: ExtractPath) -> promise.Promise[ReadOnlyPath]:
  """Extract a single archive, returns Promise->path to extraction result."""
  if isinstance(resource, type_utils.PathLikeCls):
    resource = resource_lib.Resource(path=resource)
  path = resource.path
  extract_method = resource.extract_method
  if extract_method == resource_lib.ExtractMethod.NO_EXTRACT:
    logging.info('Skipping extraction for %s (method=NO_EXTRACT).', path)
    return promise.Promise.resolve(path)
  method_name = resource_lib.ExtractMethod(extract_method).name
  extract_path = self._extract_dir / f'{method_name}.{path.name}'
  if not self._force_extraction and extract_path.exists():
    logging.info('Reusing extraction of %s at %s.', path, extract_path)
    return promise.Promise.resolve(extract_path)
  return self._extractor.extract(path, extract_method, extract_path)