def test_data_dir(self, tmp_path, gmb_schema):
    """Check ``Dataset._data_dir`` for a fresh path and for a path that is not a directory."""

    # Fresh location: the attribute must equal the path that was passed in.
    target_dir = tmp_path / 'data_dir'
    lazy_dataset = Dataset(gmb_schema, data_dir=target_dir, mode=Dataset.InitializationMode.LAZY)
    assert lazy_dataset._data_dir == target_dir

    # Non-directory present. NOTE(review): relies on ``setup.py`` being a regular file in the current working
    # directory -- confirm the tests are always run from the repository root.
    lazy_dataset = Dataset(gmb_schema, data_dir='setup.py', mode=Dataset.InitializationMode.LAZY)
    with pytest.raises(NotADirectoryError) as exc_info:
        lazy_dataset._data_dir
    offending_path = pathlib.Path.cwd() / "setup.py"
    assert str(exc_info.value) == f'"{offending_path}" exists and is not a directory.'
def test_unloaded_access_to_data(self, tmp_path, gmb_schema):
    """Verify that reading ``Dataset.data`` raises ``RuntimeError`` as long as no data has been loaded."""

    not_loaded_message = ('Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
                          'data, call Dataset.load() to load data.')
    dataset = Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)

    def assert_data_unavailable():
        # Accessing the attribute is what triggers the error, hence the bare expression statement.
        with pytest.raises(RuntimeError) as exc_info:
            dataset.data
        assert str(exc_info.value) == not_loaded_message

    # Nothing has been downloaded or loaded yet.
    assert_data_unavailable()

    # Downloading alone does not load the data, so the very same error is expected.
    dataset.download()
    assert_data_unavailable()
def test_download_false(self, tmp_path, gmb_schema):
    """Check that ``load_dataset(..., download=False)`` returns the same data as an earlier download-and-load."""

    init(DATADIR=tmp_path)

    # First download and load the dataset directly through ``Dataset`` ...
    reference = Dataset(gmb_schema,
                        data_dir=tmp_path / 'gmb' / '1.0.2',
                        mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)

    # ... then load it again without downloading and compare the results.
    reloaded = load_dataset('gmb', version='1.0.2', download=False)
    assert reference.data == reloaded
def test_invalid_sha512(self, tmp_path, gmb_schema):
    """Ensure ``Dataset`` raises ``IOError`` when the schema carries a bogus ``sha512sum``."""

    gmb_schema['sha512sum'] = 'invalid hash example'
    download_only = Dataset.InitializationMode.DOWNLOAD_ONLY

    with pytest.raises(IOError) as exc_info:
        Dataset(gmb_schema, data_dir=tmp_path, mode=download_only)

    # The substring below is compared verbatim against the raised message.
    message = str(exc_info.value)
    assert 'the file may by corrupted' in message
def test_default_dataset_schema_name(self, tmp_path, gmb_schema):
    """Check that ``load_dataset`` still finds the data after "name" is removed from the datasets schemata."""

    init(DATADIR=tmp_path)
    reference = Dataset(gmb_schema,
                        data_dir=tmp_path / 'default' / 'gmb' / '1.0.2',
                        mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)

    # Remove the "name" key so that the default schemata name has to be used.
    datasets_schemata = _get_schemata().schemata['datasets']
    datasets_schemata._schema.pop('name')

    reloaded = load_dataset('gmb', version='1.0.2', download=False)
    assert reference.data == reloaded
def test_invalid_tarball(self, tmp_path, gmb_schema, schema_file_https_url, schema_file_relative_dir):
    """Ensure ``Dataset`` raises ``tarfile.ReadError`` when the downloaded file is not a tar archive."""

    yaml_name = 'datasets.yaml'

    # Redirect the download to a YAML file and record the SHA-512 of the local copy of that file.
    # NOTE(review): presumably the local and remote copies are identical so the hash check passes -- confirm.
    gmb_schema['download_url'] = schema_file_https_url + '/' + yaml_name
    yaml_bytes = (schema_file_relative_dir / yaml_name).read_bytes()
    gmb_schema['sha512sum'] = hashlib.sha512(yaml_bytes).hexdigest()

    with pytest.raises(tarfile.ReadError) as exc_info:
        Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)

    assert 'Failed to unarchive' in str(exc_info.value)
def test_deleting_data_dir(self, tmp_path, gmb_schema):
    """Exercise ``Dataset.delete()`` both before and after the data directory has been populated."""

    # Note we don't use tmp_sub_dir fixture because we want data_dir to be non-existing at the beginning of the
    # test.
    missing_dir = tmp_path / 'data-dir'
    dataset = Dataset(gmb_schema, data_dir=missing_dir, mode=Dataset.InitializationMode.LAZY)

    # Sanity check: nothing is on disk yet, and deleting must neither raise nor create the directory.
    assert not missing_dir.exists()
    dataset.delete()
    assert not missing_dir.exists()

    # Populate the directory and make sure files actually landed in it.
    dataset.download()
    assert dataset.is_downloaded()
    downloaded_entries = os.listdir(missing_dir)
    assert downloaded_entries

    # Deleting now has to remove the whole directory.
    dataset.delete()
    assert not missing_dir.exists()
def test_download_data_dir_is_not_a_dir(self, gmb_schema):
    """Check the error raised by a download when ``data_dir`` exists and is not a directory."""

    # These are raised by pathlib.Path.mkdir
    # Also see https://bugs.python.org/issue42872
    if os.name == 'nt':
        expected_error = FileExistsError
    else:
        expected_error = NotADirectoryError

    with pytest.raises(expected_error) as exc_info:
        Dataset(gmb_schema, data_dir='./setup.py', mode=Dataset.InitializationMode.DOWNLOAD_ONLY)

    # This error message may be generated by pathlib.Path.mkdir() (as in DirectoryLock.lock()). We only make sure
    # the path is in the string.
    # On Windows, backslashes in the error message are doubled:
    #
    # "[WinError 183] Cannot create a file when that file already exists: 'D:\\\\a\\\\pydax\\\\pydax\\\\setup.py'"
    escaped_path = str(pathlib.Path.cwd() / "setup.py").replace('\\', '\\\\')
    assert escaped_path in str(exc_info.value)
def test_constructor_download_and_load(self, tmp_path, wikitext103_schema):
    """Check ``Dataset.__init__()`` with ``InitializationMode.DOWNLOAD_AND_LOAD`` against known SHA-512 digests."""

    # Expected SHA-512 hex digest of each split's encoded text, in the order the splits are checked.
    expected_digests = {
        'train': ('df7615f77cb9dd19975881f271e3e3525bee38c08a67fea36a51c96be69a3ecabc9e05c02cbaf'
                  '6fc63a0082efb44156f61c81061d3b0272bbccd7657c682e791'),
        'valid': ('e4834d365d5f8313503895fd8304d29a566ff4a2df77efb32457fdc353304fb61460511f89bb9'
                  '0f14a47132c1539aaa324d3e71f5f56045a61a7292ad25a3c02'),
        'test': ('6fe665d33c0f788eba76da50539f0ca02432c70c94b788a493da491215e86043fc732dbeef9bb'
                 '49a72341c7283ea55f59d10941ac41f7ac58aea3bdcd72f5cd8'),
    }

    dataset = Dataset(wikitext103_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)

    for split, expected in expected_digests.items():
        actual = hashlib.sha512(dataset.data[split].encode()).hexdigest()
        assert actual == expected
def downloaded_tensorflow_speech_commands_dataset(tensorflow_speech_commands_schema) -> Dataset:
    """Yield a ``Dataset`` built in ``DOWNLOAD_ONLY`` mode whose ``data_dir`` is a temporary directory.

    The ``TemporaryDirectory`` context stays open while the consumer uses the yielded object and is exited once
    the generator is resumed.

    NOTE(review): presumably registered as a pytest fixture by a decorator outside this view -- confirm.
    """
    with TemporaryDirectory() as tmp_data_dir:
        dataset = Dataset(tensorflow_speech_commands_schema,
                          data_dir=tmp_data_dir,
                          mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
        yield dataset