def test_compression_extractors(self, compressed_file, dataset_base_url, dataset_dir, gmb_schema, tmp_path):
    "Test compression extractors (gzip, bzip2, and lzma) to make sure datasets are properly extracted and verified."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/' + compressed_file
    compressed_fp = dataset_dir / ('extractables/' + compressed_file)
    fake_schema['sha512sum'] = hashlib.sha512(compressed_fp.read_bytes()).hexdigest()
    dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert dataset.is_downloaded() is True

    # Content of the file list
    with open(dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)

    def test_incorrect_file_list(change: dict):
        "Test a single case where something in the file list is wrong."
        wrong_file_list = copy.deepcopy(file_list)
        wrong_file_list['contents'].update(change)
        with open(dataset._file_list_file, mode='w') as f:
            json.dump(wrong_file_list, f)
        assert dataset.is_downloaded() is False

    # Can't find the file
    test_incorrect_file_list({'filename': 'non-existing-file'})

    # Size incorrect
    changed = copy.deepcopy(file_list['contents'])
    changed['size'] += 100
    test_incorrect_file_list(changed)

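# For reference, the file list written by a single-file compression extractor is assumed (inferred from the keys
# exercised above; the authoritative schema lives in the extractor implementation) to look roughly like:
#
#     {
#         "type": "gzip",
#         "contents": {"filename": "<extracted-file-name>", "size": <size-in-bytes>}
#     }
#
# i.e., ``contents`` describes the single extracted file directly, which is why the test mutates ``filename`` and
# ``size`` on it without an intermediate per-file mapping.
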
def test_zip_extractor(self, dataset_base_url, dataset_dir, gmb_schema, tmp_path):
    "Test _ZipExtractor to make sure zip datasets are properly extracted and verified."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/test.zip'
    fake_schema['sha512sum'] = hashlib.sha512((dataset_dir / 'extractables/test.zip').read_bytes()).hexdigest()
    zip_dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert zip_dataset.is_downloaded() is True

    # Content of the file list
    with open(zip_dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)

    def test_incorrect_file_list(change: dict):
        "Test a single case where something in the file list is wrong."
        wrong_file_list = copy.deepcopy(file_list)
        wrong_file_list['contents'].update(change)
        with open(zip_dataset._file_list_file, mode='w') as f:
            json.dump(wrong_file_list, f)
        assert zip_dataset.is_downloaded() is False

    # Can't find a file
    test_incorrect_file_list({'non-existing-file': {'isdir': False}})

    # File type incorrect
    test_incorrect_file_list({'test-dir/test.csv': {'isdir': True}})

    # Size incorrect
    changed = copy.deepcopy(file_list['contents']['test-dir/test.txt'])
    changed['size'] += 100
    test_incorrect_file_list({'test-dir/test.txt': changed})

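# Similarly, the file list for an archive extractor such as _ZipExtractor is assumed (again inferred from the
# assertions above) to map each archive member to its metadata:
#
#     {
#         "type": "zip",
#         "contents": {
#             "test-dir/test.csv": {"isdir": false, "size": <size-in-bytes>},
#             "test-dir/test.txt": {"isdir": false, "size": <size-in-bytes>}
#         }
#     }
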
def test_is_downloaded(self, tmp_path, gmb_schema):
    "Test the is_downloaded method using a ``.tar.gz`` archive."
    data_dir = tmp_path / 'non-existing-dir'
    assert not data_dir.exists()  # Sanity check: data_dir must not exist
    gmb = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.LAZY)
    assert gmb.is_downloaded() is False
    gmb.download()
    assert gmb.is_downloaded() is True

    # JSON decoding error
    gmb._file_list_file.write_text("nonsense\n", encoding='utf-8')
    with pytest.raises(JSONDecodeError):
        # We don't check the value of the exception because we are only interested in ensuring that the file
        # isn't decodable
        gmb.is_downloaded()

def test_supported_file_extensions(self, dataset_base_url, dataset_dir, extractable, extractable_type,
                                   gmb_schema, tmp_path):
    "Test extract_data_files and verify_data_files to make sure proper extractors are used for various datasets."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/' + extractable
    fake_schema['sha512sum'] = hashlib.sha512((dataset_dir / 'extractables' / extractable).read_bytes()).hexdigest()
    dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert dataset.is_downloaded() is True

    with open(dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)
    assert file_list['type'] == extractable_type

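# A hypothetical sketch of how the (extractable, extractable_type) pair might be parametrized, e.g. in
# conftest.py. The actual fixture definitions live elsewhere and may differ; the file names and type strings
# below are illustrative only:
#
#     @pytest.mark.parametrize(('extractable', 'extractable_type'), [
#         ('test.tar.gz', 'tar'),
#         ('test.zip', 'zip'),
#         ('test.txt.gz', 'gzip'),
#     ])
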
def test_deleting_data_dir(self, tmp_path, gmb_schema):
    "Test ``Dataset.delete()``."
    # Note we don't use the tmp_sub_dir fixture because we want data_dir to be non-existing at the beginning of
    # the test.
    data_dir = tmp_path / 'data-dir'
    dataset = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.LAZY)
    assert not data_dir.exists()  # Sanity check: data_dir doesn't exist
    dataset.delete()  # No exception should be raised here
    assert not data_dir.exists()  # Sanity check: data_dir still doesn't exist
    dataset.download()

    # Sanity check: files are in place
    assert dataset.is_downloaded()
    assert len(os.listdir(data_dir)) > 0

    # Delete the dir
    dataset.delete()
    assert not data_dir.exists()