def test_downloaded_dataset_duplicate_sources(monkeypatch):
    """Two source files sharing a local filename must be rejected.

    `fetch_and_convert_dataset` should raise `ValueError` before the
    converter function ever runs, since both sources would be written to
    the same temp file ('test.txt').
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)

    # The patched urlretrieve writes the URL itself as the file content,
    # so the expected checksum is the SHA-256 of the URL bytes.
    url_a = 'http://someplace.com/other.txt'
    url_b = 'http://someplace.com/somethingelse.txt'
    src_a = dataset.DownloadSourceFile(
        'test.txt', url=url_a,
        sha256=hashlib.sha256(url_a.encode()).hexdigest())
    # Deliberately re-use the filename 'test.txt' to trigger the clash.
    src_b = dataset.DownloadSourceFile(
        'test.txt', url=url_b,
        sha256=hashlib.sha256(url_b.encode()).hexdigest())

    @dataset.fetch_and_convert_dataset([src_a, src_b], 'ds.txt')
    def downloaded_dataset(source_paths, target_path):
        raise RuntimeError('Should not get here')

    # Duplicate target filenames -> ValueError on invocation
    with pytest.raises(ValueError):
        downloaded_dataset()

    test_config._teardown_batchup_temp(tdir)
def test_downloaded_dataset(monkeypatch):
    """End-to-end check of `fetch_and_convert_dataset`.

    Verifies argument validation (bad target filename, bad source entry),
    the download-convert-cleanup pipeline, and that a second invocation
    re-uses the cached converted file.
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)

    # Patched urlretrieve stores the URL string as the file body, so the
    # expected SHA-256 is computed over the URL bytes.
    url_a = 'http://someplace.com/other.txt'
    url_b = 'http://someplace.com/somethingelse.txt'
    src_a = dataset.DownloadSourceFile(
        'test.txt', url=url_a,
        sha256=hashlib.sha256(url_a.encode()).hexdigest())
    src_b = dataset.DownloadSourceFile(
        'test2.txt', url=url_b,
        sha256=hashlib.sha256(url_b.encode()).hexdigest())

    # Target filename (last arg) must be a string or a callable
    with pytest.raises(TypeError):
        @dataset.fetch_and_convert_dataset([src_a, src_b], 2)
        def downloaded_dataset(source_paths, target_path):
            raise RuntimeError('Should not get here')

    # Source files must contain `AbstractSourceFile` instances
    with pytest.raises(TypeError):
        @dataset.fetch_and_convert_dataset([src_a, 'test2.txt'], 'ds.txt')
        def downloaded_dataset(source_paths, target_path):
            raise RuntimeError('Should not get here')

    @dataset.fetch_and_convert_dataset([src_a, src_b], 'ds.txt')
    def downloaded_dataset(source_paths, target_path):
        # Concatenate the two downloaded files into the target.
        path_a, path_b = source_paths
        with open(target_path, 'w') as f_out:
            for src_path in (path_a, path_b):
                with open(src_path, 'r') as f_in:
                    f_out.write(f_in.read())
        return target_path

    dest = downloaded_dataset()

    # The converted file exists and holds both URLs (the fake downloads)
    assert os.path.exists(dest)
    with open(dest, 'r') as f:
        assert f.read() == (src_a.url + src_b.url)

    # The temporary 'downloaded' files were cleaned up afterwards
    assert not os.path.exists(src_a.path)
    assert not os.path.exists(src_b.path)

    # Invoking a second time should re-use the existing file
    assert downloaded_dataset() == dest

    test_config._teardown_batchup_temp(tdir)
def test_DownloadSourceFile_acquire(monkeypatch):
    """`DownloadSourceFile.acquire` downloads into the temp dir; `clean_up`
    removes the downloaded file."""
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)

    # Patched urlretrieve writes the URL as content, so the checksum is
    # the SHA-256 of the URL bytes.
    url = 'http://someplace.com/other.txt'
    src = dataset.DownloadSourceFile(
        'test.txt', url=url,
        sha256=hashlib.sha256(url.encode()).hexdigest())

    # Attribute / path layout checks
    assert src.filename == 'test.txt'
    assert src.temp_filename == os.path.join('temp', 'test.txt')
    assert src.path == os.path.join(tdir, 'data', 'temp', 'test.txt')
    assert src.url == url
    assert str(src) == ('downloadable file test.txt from '
                        'http://someplace.com/other.txt')

    # Acquire downloads to the temp path
    dest = src.acquire()
    assert dest == os.path.join(tdir, 'data', 'temp', 'test.txt')
    assert os.path.exists(dest)

    # clean_up deletes the temp download
    src.clean_up()
    assert not os.path.exists(dest)

    test_config._teardown_batchup_temp(tdir)
def test_delete_dataset_cache(monkeypatch):
    """Build a cached dataset file, then delete it via
    `dataset.delete_dataset_cache` by filename."""
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)

    # Patched urlretrieve writes the URL as content -> checksum over URL
    url_a = 'http://someplace.com/other.txt'
    url_b = 'http://someplace.com/somethingelse.txt'
    src_a = dataset.DownloadSourceFile(
        'test.txt', url=url_a,
        sha256=hashlib.sha256(url_a.encode()).hexdigest())
    src_b = dataset.DownloadSourceFile(
        'test2.txt', url=url_b,
        sha256=hashlib.sha256(url_b.encode()).hexdigest())

    @dataset.fetch_and_convert_dataset([src_a, src_b], 'ds.txt')
    def downloaded_dataset(source_paths, target_path):
        # Concatenate both downloads into the cached dataset file.
        path_a, path_b = source_paths
        with open(target_path, 'w') as f_out:
            for src_path in (path_a, path_b):
                with open(src_path, 'r') as f_in:
                    f_out.write(f_in.read())
        return target_path

    dest = downloaded_dataset()

    # Converted file exists with the expected contents
    assert os.path.exists(dest)
    with open(dest, 'r') as f:
        assert f.read() == (src_a.url + src_b.url)

    # Temporary downloads were cleaned up
    assert not os.path.exists(src_a.path)
    assert not os.path.exists(src_b.path)

    # Delete the dataset cache; provide the filename
    dataset.delete_dataset_cache('ds.txt')
    assert not os.path.exists(dest)

    test_config._teardown_batchup_temp(tdir)
def test_DownloadSourceFile_constructor(monkeypatch):
    """Constructor accepts either `url` or `base_url`; omitting both is a
    `TypeError`."""
    from batchup.datasets import dataset

    _patch_config_datadir(monkeypatch)

    # Explicit full URL
    src_a = dataset.DownloadSourceFile(
        'test.txt', url='http://someplace.com/other.txt')
    assert src_a.filename == 'test.txt'
    assert src_a.temp_filename == os.path.join('temp', 'test.txt')
    assert src_a.path == os.path.join(_get_data_dir(), src_a.temp_filename)
    assert src_a.url == 'http://someplace.com/other.txt'
    assert str(src_a) == ('downloadable file test.txt from '
                          'http://someplace.com/other.txt')

    # URL assembled from base_url + filename
    src_b = dataset.DownloadSourceFile(
        'test.txt', base_url='http://someplace.com')
    assert src_b.filename == 'test.txt'
    assert src_b.temp_filename == os.path.join('temp', 'test.txt')
    # NOTE(review): compares against the FIRST file's temp_filename, as
    # the original did; both files share 'temp/test.txt' so the value is
    # identical.
    assert src_b.path == os.path.join(_get_data_dir(), src_a.temp_filename)
    assert src_b.url == 'http://someplace.com/test.txt'

    # Neither url nor base_url supplied -> TypeError
    with pytest.raises(TypeError):
        dataset.DownloadSourceFile('test.txt')