def test_get_downloader_no_protocol_defined(self):
    """A URL with no scheme must make ``get_downloader`` raise ValueError."""
    schemeless_url = 'files.fast.ai/data/cifar10.tgz'
    target_dir = 'output_dir'
    credentials = {'ftp_username': '******', 'ftp_password': '******'}
    # The error message is expected to echo the offending URL.
    expected_message = f"no protocol supported for {schemeless_url}"

    with self.assertRaisesRegex(ValueError, expected_message):
        get_downloader(schemeless_url, target_dir, 8192, 60, **credentials)
def test_get_downloader_not_supported_protocol(self):
    """An ``sftp://`` URL must make ``get_downloader`` raise ValueError."""
    sftp_url = 'sftp://ftp.ensemblgenomes.org/pub/plants/release-44/summary.txt'
    target_dir = 'output_dir'
    credentials = {'ftp_username': '******', 'ftp_password': '******'}
    # The error message is expected to echo the offending URL.
    expected_message = f"no protocol supported for {sftp_url}"

    with self.assertRaisesRegex(ValueError, expected_message):
        get_downloader(sftp_url, target_dir, 8192, 60, **credentials)
def test_http_file_downloader_download(self, get, info):
    """Check that an HTTP download streams the response into the output file.

    ``get`` and ``info`` are mocks supplied by the caller (presumably
    ``patch`` decorators outside this view -- confirm their targets).
    """
    url = 'http://files.fast.ai/data/cifar10.tgz'
    output_dir = 'output_dir'

    # The object yielded by ``with get(...) as r`` produces a single chunk.
    response = MagicMock()
    response.iter_content.return_value = ['xxx']
    get.return_value.__enter__.return_value = response

    opener = mock_open()
    downloader = get_downloader(url, output_dir, chunk_size=8192, timeout=60)
    with patch('data_downloader.downloader.open', opener):
        downloader.download()

    expected_log_calls = [
        call(
            f"downloading file from {url} and saving into {downloader.output_file}."
        ),
        call(
            f"successfully downloaded file {url}. {downloader.output_file} saved."
        ),
    ]
    info.assert_has_calls(expected_log_calls)

    # Must be checked before ``opener()`` below, which records another call.
    opener.assert_called_once_with(downloader.output_file, 'wb')
    file_handle = opener()
    file_handle.write.assert_called_once_with('xxx')

    get.assert_called_once_with(url, timeout=60, stream=True)
    response.raise_for_status.assert_called_once()
    response.iter_content.assert_called_once_with(chunk_size=8192)
def test_ftp_file_downloader_download(self, FTP, info):
    """Check that an FTP download logs in, changes directory and retrieves the file.

    ``FTP`` and ``info`` are mocks supplied by the caller (presumably
    ``patch`` decorators outside this view -- confirm their targets).
    """
    m = mock_open()
    mock_ftp = MagicMock()
    # The object yielded by ``with FTP(...) as ftp`` is the connection mock.
    FTP.return_value.__enter__.return_value = mock_ftp

    url = 'ftp://files.fast.ai/data/cifar10.tgz'
    output_dir = 'output_dir'
    # One pair of credentials drives both the downloader and the login
    # assertion, so the two can never drift apart.
    username, password = 'xxx', 'yyy'
    downloader = get_downloader(url, output_dir, chunk_size=8192, timeout=60,
                                ftp_username=username, ftp_password=password)

    with patch('data_downloader.downloader.open', m):
        downloader.download()

    info.assert_has_calls([
        call(
            f"downloading file from {url} and saving into {downloader.output_file}."
        ),
        call(
            f"successfully downloaded file {url}. {downloader.output_file} saved."
        )
    ])

    # Must be checked before ``m()`` below, which records another call.
    m.assert_called_once_with(downloader.output_file, 'wb')
    handle = m()

    FTP.assert_called_once_with('files.fast.ai', timeout=60)
    mock_ftp.login.assert_called_once_with(username, password)
    mock_ftp.cwd.assert_called_once_with(downloader.path)
    mock_ftp.retrbinary.assert_called_once_with(f'RETR {downloader.file}',
                                                handle.write,
                                                blocksize=8192)
def test_ftp_downloader_delete(self, _logger, exists, remove):
    """Check that ``delete`` removes the downloader's output file.

    NOTE(review): despite the "ftp" in the test name, the downloader is
    built from an ``http://`` URL.  ``_logger``, ``exists`` and ``remove``
    are mocks supplied by the caller (presumably ``patch`` decorators
    outside this view -- confirm their targets).
    """
    # Report the output file as present.
    exists.return_value = True

    downloader = get_downloader(
        'http://files.fast.ai/data/cifar10.tgz',
        'output_dir',
        chunk_size=8192,
        timeout=60,
    )
    downloader.delete()

    remove.assert_called_once_with(downloader.output_file)
def test_get_downloader_http(self):
    """An ``http://`` URL yields an ``HttpFileDownloader`` with parsed URL parts."""
    url = 'http://files.fast.ai/data/cifar10.tgz'
    output_dir = 'output_dir'

    actual = get_downloader(url, output_dir, chunk_size=8192, timeout=60)

    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)) which only says "False is not true".
    self.assertIsInstance(actual, HttpFileDownloader)
    self.assertEqual(actual.url, url)
    self.assertEqual(actual.chunk_size, 8192)
    self.assertEqual(actual.timeout, 60)
    self.assertEqual(actual.output_file, 'output_dir/cifar10.tgz')
    self.assertEqual(actual.host, 'files.fast.ai')
    self.assertEqual(actual.file, 'cifar10.tgz')
    self.assertEqual(actual.path, '/data')
def mock_get_downloader(url, output_dir, chunk_size, timeout, **kwargs):
    """Build a downloader whose HTTP variant fails after downloading.

    Arguments are forwarded unchanged to ``get_downloader``.  When the
    result is an ``HttpFileDownloader``, its ``download`` attribute is
    replaced by a wrapper that runs the real download and then raises
    ``Exception("Mock Exception")``.  Other downloader types are returned
    untouched.
    """
    downloader = get_downloader(url, output_dir, chunk_size, timeout, **kwargs)

    # Capture the real bound method *before* rebinding the attribute.
    # Looking up ``downloader.download`` inside the wrapper would find the
    # wrapper itself and recurse until RecursionError, so the real download
    # would never run.
    real_download = downloader.download

    def mock_download():
        real_download()
        raise Exception("Mock Exception")

    if isinstance(downloader, HttpFileDownloader):
        downloader.download = mock_download
    return downloader
def test_get_downloader_ftp(self):
    """An ``ftp://`` URL yields an ``FtpFileDownloader`` with URL parts and credentials."""
    url = 'ftp://ftp.ensemblgenomes.org/pub/plants/release-44/summary.txt'
    output_dir = 'output_dir'
    # One pair of credentials drives both the call and the assertions, so
    # the expected values always match what was passed in.
    username, password = 'xxx', 'yyy'
    kwargs = {'ftp_username': username, 'ftp_password': password}

    actual = get_downloader(url, output_dir, 8192, 60, **kwargs)

    # assertIsInstance reports the actual type on failure.
    self.assertIsInstance(actual, FtpFileDownloader)
    self.assertEqual(actual.url, url)
    self.assertEqual(actual.chunk_size, 8192)
    self.assertEqual(actual.timeout, 60)
    self.assertEqual(actual.output_file, 'output_dir/summary.txt')
    self.assertEqual(actual.host, 'ftp.ensemblgenomes.org')
    self.assertEqual(actual.file, 'summary.txt')
    self.assertEqual(actual.path, '/pub/plants/release-44')
    self.assertEqual(actual.username, username)
    self.assertEqual(actual.password, password)
def main(args):
    """Download every requested URL concurrently and clean up failures.

    Ensures ``args.output`` exists, then submits one download per entry of
    ``args.url`` (the URL is taken from index 0 of each entry) to a thread
    pool of ``args.threads`` workers.  As each download finishes, any
    exception it raised is treated as a failed download: the failure is
    logged and the downloader's ``delete`` is called to discard the
    partially downloaded file.
    """
    os.makedirs(args.output, exist_ok=True)

    with ThreadPoolExecutor(args.threads) as pool:
        # Maps each submitted future back to the downloader that produced it.
        pending = {}
        for entry in args.url:
            job = downloader.get_downloader(
                entry[0],
                args.output,
                args.chunk_size,
                args.timeout,
                ftp_username=args.ftp_username,
                ftp_password=args.ftp_password,
            )
            pending[pool.submit(job.download)] = job

        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                # A failed download may leave a partial file behind.
                failed_job = pending[finished]
                _logger.error("failed to download file from {}, {}.".format(failed_job.url, e))
                failed_job.delete()