def setUp(self): self.hf_modules_cache = tempfile.mkdtemp() self.cache_dir = tempfile.mkdtemp() self.download_config = DownloadConfig(cache_dir=self.cache_dir) self.dynamic_modules_path = datasets.load.init_dynamic_modules( name="test_datasets_modules_" + os.path.basename(self.hf_modules_cache), hf_modules_cache=self.hf_modules_cache, )
def load_builder_class(self, dataset_name, is_local=False): # Download/copy dataset script if is_local is True: module_path, _ = prepare_module("./datasets/" + dataset_name) else: module_path, _ = prepare_module(dataset_name, download_config=DownloadConfig(force_download=True)) # Get dataset builder class builder_cls = import_main_class(module_path) return builder_cls
def _autogenerate_dummy_data(self, dataset_builder, mock_dl_manager, keep_uncompressed) -> Optional[bool]: dl_cache_dir = ( os.path.join(self._cache_dir, config.DOWNLOADED_DATASETS_DIR) if self._cache_dir else config.DOWNLOADED_DATASETS_PATH ) download_config = DownloadConfig(cache_dir=dl_cache_dir) dl_manager = DummyDataGeneratorDownloadManager( dataset_name=self._dataset_name, mock_download_manager=mock_dl_manager, download_config=download_config ) dataset_builder._split_generators(dl_manager) mock_dl_manager.load_existing_dummy_data = False # don't use real dummy data dl_manager.auto_generate_dummy_data_folder( n_lines=self._n_lines, json_field=self._json_field, xml_tag=self._xml_tag, match_text_files=self._match_text_files, encoding=self._encoding, ) if not keep_uncompressed: path_do_dataset = os.path.join(mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name) dl_manager.compress_autogenerated_dummy_data(path_do_dataset) # now test that the dummy_data.zip file actually works mock_dl_manager.load_existing_dummy_data = True # use real dummy data n_examples_per_split = {} os.makedirs(dataset_builder._cache_dir, exist_ok=True) try: split_generators = dataset_builder._split_generators(mock_dl_manager) for split_generator in split_generators: dataset_builder._prepare_split(split_generator, check_duplicate_keys=False) n_examples_per_split[split_generator.name] = split_generator.split_info.num_examples except OSError as e: logger.error( f"Failed to load dummy data for config '{dataset_builder.config.name}''.\nOriginal error:\n" + str(e) ) return False else: if all(n_examples > 0 for n_examples in n_examples_per_split.values()): logger.warning( f"Dummy data generation done and dummy data test succeeded for config '{dataset_builder.config.name}''." ) return True else: empty_splits = [ split_name for split_name in n_examples_per_split if n_examples_per_split[split_name] == 0 ] logger.warning( f"Dummy data generation done but dummy data test failed since splits {empty_splits} have 0 examples for config '{dataset_builder.config.name}''." ) return False else: generated_dummy_data_dir = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder) logger.info( f"Dummy data generated in directory '{generated_dummy_data_dir}' but kept uncompressed. " "Please compress this directory into a zip file to use it for dummy data tests." )
def test_cached_path_extract(xz_file, tmp_path, text_file): filename = xz_file cache_dir = tmp_path / "cache" download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True) extracted_filename = cached_path(filename, download_config=download_config) with open(extracted_filename) as f: extracted_file_content = f.read() with open(text_file) as f: expected_file_content = f.read() assert extracted_file_content == expected_file_content
def test_load_real_dataset(self, dataset_name): path = "./datasets/" + dataset_name module_path, hash = prepare_module(path, download_config=DownloadConfig(local_files_only=True), dataset=True) builder_cls = import_main_class(module_path, dataset=True) name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None with tempfile.TemporaryDirectory() as temp_cache_dir: dataset = load_dataset( path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD ) for split in dataset.keys(): self.assertTrue(len(dataset[split]) > 0) del dataset
def test_load_dataset_deletes_extracted_files(deleted, jsonl_gz_path, tmp_path): data_files = jsonl_gz_path cache_dir = tmp_path / "cache" if deleted: download_config = DownloadConfig(delete_extracted=True, cache_dir=cache_dir / "downloads") ds = load_dataset( "json", split="train", data_files=data_files, cache_dir=cache_dir, download_config=download_config ) else: # default ds = load_dataset("json", split="train", data_files=data_files, cache_dir=cache_dir) assert ds[0] == {"col_1": "0", "col_2": 0, "col_3": 0.0} assert (sorted((cache_dir / "downloads" / "extracted").iterdir()) == []) is deleted
def load_builder_class(self, dataset_name, is_local=False): # Download/copy dataset script if is_local is True: dataset_module = dataset_module_factory( os.path.join("datasets", dataset_name)) else: dataset_module = dataset_module_factory( dataset_name, download_config=DownloadConfig(force_download=True)) # Get dataset builder class builder_cls = import_main_class(dataset_module.module_path) return builder_cls
def test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path, tmp_path, text_file): input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_path} input_path = str(input_paths[compression_format]) cache_dir = tmp_path / "cache" download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True) extracted_path = cached_path(input_path, download_config=download_config) with open(extracted_path) as f: extracted_file_content = f.read() with open(text_file) as f: expected_file_content = f.read() assert extracted_file_content == expected_file_content
def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch): custom_cache_dir = "custom_cache" custom_extracted_dir = "custom_extracted_dir" custom_extracted_path = tmp_path / "custom_extracted_path" if default_extracted: expected = ("downloads" if default_cache_dir else custom_cache_dir, "extracted") else: monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR", custom_extracted_dir) monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(custom_extracted_path)) expected = custom_extracted_path.parts[-2:] if default_cache_dir else ( custom_cache_dir, custom_extracted_dir) filename = xz_file download_config = (DownloadConfig( extract_compressed_file=True) if default_cache_dir else DownloadConfig( cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True)) extracted_file_path = cached_path(filename, download_config=download_config) assert Path(extracted_file_path).parent.parts[-2:] == expected
def test_download_manager_download(urls_type, tmp_path, monkeypatch): import requests monkeypatch.setattr(requests, "request", mock_request) url = URL if issubclass(urls_type, str): urls = url elif issubclass(urls_type, list): urls = [url] elif issubclass(urls_type, dict): urls = {"train": url} dataset_name = "dummy" cache_subdir = "downloads" cache_dir_root = str(tmp_path) download_config = DownloadConfig( cache_dir=os.path.join(cache_dir_root, cache_subdir), use_etag=False, ) dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config) downloaded_paths = dl_manager.download(urls) input_urls = urls for downloaded_paths in [downloaded_paths]: if isinstance(urls, str): downloaded_paths = [downloaded_paths] input_urls = [urls] elif isinstance(urls, dict): assert "train" in downloaded_paths.keys() downloaded_paths = downloaded_paths.values() input_urls = urls.values() assert downloaded_paths for downloaded_path, input_url in zip(downloaded_paths, input_urls): assert downloaded_path == dl_manager.downloaded_paths[input_url] downloaded_path = Path(downloaded_path) parts = downloaded_path.parts assert parts[-1] == HASH assert parts[-2] == cache_subdir assert downloaded_path.exists() content = downloaded_path.read_text() assert content == CONTENT metadata_downloaded_path = downloaded_path.with_suffix(".json") assert metadata_downloaded_path.exists() metadata_content = json.loads(metadata_downloaded_path.read_text()) assert metadata_content == {"url": URL, "etag": None}
def test_load_real_dataset_all_configs(self, dataset_name): path = "./datasets/" + dataset_name dataset_module = dataset_module_factory( path, download_config=DownloadConfig(local_files_only=True)) builder_cls = import_main_class(dataset_module.module_path) config_names = ([ config.name for config in builder_cls.BUILDER_CONFIGS ] if len(builder_cls.BUILDER_CONFIGS) > 0 else [None]) for name in config_names: with tempfile.TemporaryDirectory() as temp_cache_dir: dataset = load_dataset( path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD) for split in dataset.keys(): self.assertTrue(len(dataset[split]) > 0) del dataset
def test_download_manager_extract(paths_type, xz_file, text_file): filename = str(xz_file) if issubclass(paths_type, str): paths = filename elif issubclass(paths_type, list): paths = [filename] elif issubclass(paths_type, dict): paths = {"train": filename} dataset_name = "dummy" cache_dir = xz_file.parent extracted_subdir = "extracted" download_config = DownloadConfig( cache_dir=cache_dir, use_etag=False, ) dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config) extracted_paths = dl_manager.extract(paths) input_paths = paths for extracted_paths in [extracted_paths]: if isinstance(paths, str): extracted_paths = [extracted_paths] input_paths = [paths] elif isinstance(paths, dict): assert "train" in extracted_paths.keys() extracted_paths = extracted_paths.values() input_paths = paths.values() assert extracted_paths for extracted_path, input_path in zip(extracted_paths, input_paths): assert extracted_path == dl_manager.extracted_paths[input_path] extracted_path = Path(extracted_path) parts = extracted_path.parts assert parts[-1] == hash_url_to_filename(input_path, etag=None) assert parts[-2] == extracted_subdir assert extracted_path.exists() extracted_file_content = extracted_path.read_text() expected_file_content = text_file.read_text() assert extracted_file_content == expected_file_content