def setUp(self):
    self.hf_modules_cache = tempfile.mkdtemp()
    self.cache_dir = tempfile.mkdtemp()
    self.download_config = DownloadConfig(cache_dir=self.cache_dir)
    self.dynamic_modules_path = datasets.load.init_dynamic_modules(
        name="test_datasets_modules_" + os.path.basename(self.hf_modules_cache),
        hf_modules_cache=self.hf_modules_cache,
    )
@pytest.mark.parametrize("default_extracted", [True, False])  # parametrization inferred from the body (assumed, not in the original snippet)
@pytest.mark.parametrize("default_cache_dir", [True, False])
def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch):
    custom_cache_dir = "custom_cache"
    custom_extracted_dir = "custom_extracted_dir"
    custom_extracted_path = tmp_path / "custom_extracted_path"
    if default_extracted:
        expected = ("downloads" if default_cache_dir else custom_cache_dir, "extracted")
    else:
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR", custom_extracted_dir)
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(custom_extracted_path))
        expected = custom_extracted_path.parts[-2:] if default_cache_dir else (custom_cache_dir, custom_extracted_dir)
    filename = xz_file
    download_config = (
        DownloadConfig(extract_compressed_file=True)
        if default_cache_dir
        else DownloadConfig(cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True)
    )
    extracted_file_path = cached_path(filename, download_config=download_config)
    assert Path(extracted_file_path).parent.parts[-2:] == expected
@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"])  # parametrization inferred from the body (assumed, not in the original snippet)
def test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path, tmp_path, text_file):
    input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_path}
    input_path = str(input_paths[compression_format])
    cache_dir = tmp_path / "cache"
    download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
    extracted_path = cached_path(input_path, download_config=download_config)
    with open(extracted_path) as f:
        extracted_file_content = f.read()
    with open(text_file) as f:
        expected_file_content = f.read()
    assert extracted_file_content == expected_file_content
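# The compressed-file fixtures used above (`gz_file`, `xz_file`, `zstd_path`, `text_file`) are
# defined elsewhere in the test suite's conftest. A minimal sketch of what one of them could
# look like (an assumption for illustration, not the project's actual fixture):
import lzma

import pytest


@pytest.fixture(scope="session")
def xz_file(tmp_path_factory, text_file):
    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with open(text_file, encoding="utf-8") as f, lzma.open(path, "wt", encoding="utf-8") as xz_f:
        xz_f.write(f.read())  # compressed content must match the plain `text_file` fixture
    return path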
def load_builder_class(self, dataset_name, is_local=False):
    # Download/copy dataset script
    if is_local is True:
        dataset_module = dataset_module_factory(os.path.join("datasets", dataset_name))
    else:
        dataset_module = dataset_module_factory(dataset_name, download_config=DownloadConfig(force_download=True))
    # Get dataset builder class
    builder_cls = import_main_class(dataset_module.module_path)
    return builder_cls
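# Hypothetical usage of the helper above (sketch only, dataset name is made up): load a local
# dataset script's builder class and prepare it in a temporary cache directory.
#
#   builder_cls = self.load_builder_class("some_dataset", is_local=True)
#   with tempfile.TemporaryDirectory() as tmp_cache:
#       builder = builder_cls(cache_dir=tmp_cache)
#       builder.download_and_prepare()
#       ds = builder.as_dataset()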
def test_dummy_data_autogenerate(self):
    n_lines = 5
    with TemporaryDirectory() as tmp_dir:
        with open(os.path.join(tmp_dir, "train.txt"), "w", encoding="utf-8") as f:
            f.write("foo\nbar\n" * 10)
        with open(os.path.join(tmp_dir, "test.txt"), "w", encoding="utf-8") as f:
            f.write("foo\nbar\n" * 10)

        class MockDownloadManagerWithCustomDatasetsScriptsDir(MockDownloadManager):
            datasets_scripts_dir = os.path.join(tmp_dir, "datasets")

        cache_dir = os.path.join(tmp_dir, "cache")
        os.makedirs(cache_dir, exist_ok=True)
        dataset_builder = DummyBuilder(tmp_test_dir=tmp_dir, cache_dir=cache_dir)
        mock_dl_manager = MockDownloadManagerWithCustomDatasetsScriptsDir(
            dataset_name=dataset_builder.name,
            config=None,
            version=Version("0.0.0"),
            use_local_dummy_data=True,
            cache_dir=cache_dir,
            load_existing_dummy_data=False,  # dummy data don't exist yet
        )
        download_config = DownloadConfig(cache_dir=os.path.join(tmp_dir, datasets.config.DOWNLOADED_DATASETS_DIR))
        dl_manager = DummyDataGeneratorDownloadManager(
            dataset_name=dataset_builder.name,
            mock_download_manager=mock_dl_manager,
            download_config=download_config,
        )
        dataset_builder.download_and_prepare(dl_manager=dl_manager, try_from_hf_gcs=False)
        shutil.rmtree(dataset_builder._cache_dir)

        dl_manager.auto_generate_dummy_data_folder(n_lines=n_lines)
        path_to_dataset = os.path.join(mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name)
        dl_manager.compress_autogenerated_dummy_data(path_to_dataset)

        mock_dl_manager.load_existing_dummy_data = True
        dataset_builder.download_and_prepare(
            dl_manager=mock_dl_manager, ignore_verifications=True, try_from_hf_gcs=False
        )
        dataset = dataset_builder.as_dataset(split="train")
        self.assertEqual(len(dataset), n_lines)
        del dataset
def test_load_real_dataset(self, dataset_name):
    path = "./datasets/" + dataset_name
    dataset_module = dataset_module_factory(path, download_config=DownloadConfig(local_files_only=True))
    builder_cls = import_main_class(dataset_module.module_path)
    name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
    with tempfile.TemporaryDirectory() as temp_cache_dir:
        dataset = load_dataset(path, name=name, cache_dir=temp_cache_dir, download_mode=DownloadMode.FORCE_REDOWNLOAD)
        for split in dataset.keys():
            self.assertTrue(len(dataset[split]) > 0)
        del dataset
@pytest.mark.parametrize("urls_type", [str, list, dict])  # parametrization inferred from the body (assumed, not in the original snippet)
def test_download_manager_download(urls_type, tmp_path, monkeypatch):
    import requests

    monkeypatch.setattr(requests, "request", mock_request)
    url = URL
    if issubclass(urls_type, str):
        urls = url
    elif issubclass(urls_type, list):
        urls = [url]
    elif issubclass(urls_type, dict):
        urls = {"train": url}
    dataset_name = "dummy"
    cache_subdir = "downloads"
    cache_dir_root = str(tmp_path)
    download_config = DownloadConfig(
        cache_dir=os.path.join(cache_dir_root, cache_subdir),
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    downloaded_paths = dl_manager.download(urls)
    # Normalize the str/list/dict return types to flat sequences so a single loop can check them.
    input_urls = urls
    if isinstance(urls, str):
        downloaded_paths = [downloaded_paths]
        input_urls = [urls]
    elif isinstance(urls, dict):
        assert "train" in downloaded_paths.keys()
        downloaded_paths = downloaded_paths.values()
        input_urls = urls.values()
    assert downloaded_paths
    for downloaded_path, input_url in zip(downloaded_paths, input_urls):
        assert downloaded_path == dl_manager.downloaded_paths[input_url]
        downloaded_path = Path(downloaded_path)
        parts = downloaded_path.parts
        assert parts[-1] == HASH
        assert parts[-2] == cache_subdir
        assert downloaded_path.exists()
        content = downloaded_path.read_text()
        assert content == CONTENT
        metadata_downloaded_path = downloaded_path.with_suffix(".json")
        assert metadata_downloaded_path.exists()
        metadata_content = json.loads(metadata_downloaded_path.read_text())
        assert metadata_content == {"url": URL, "etag": None}
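# The test above patches `requests.request` and relies on module-level URL / CONTENT / HASH
# constants plus a `mock_request` helper defined elsewhere in the test file. A minimal sketch
# of such a mock (assumed values and response shape, not the project's actual code):
URL = "https://example.com/file.txt"  # hypothetical URL
CONTENT = "some dummy content"  # hypothetical payload served by the mocked request
HASH = hash_url_to_filename(URL, etag=None)  # cached filename expected for URL (use_etag=False)


class _MockResponse:
    status_code = 200
    headers = {"Content-Length": str(len(CONTENT))}
    cookies = {}

    def iter_content(self, **kwargs):
        return [CONTENT.encode("utf-8")]


def mock_request(*args, **kwargs):
    return _MockResponse()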
@pytest.mark.parametrize("deleted", [True, False])  # parametrization inferred from the body (assumed, not in the original snippet)
def test_load_dataset_deletes_extracted_files(deleted, jsonl_gz_path, tmp_path):
    data_files = jsonl_gz_path
    cache_dir = tmp_path / "cache"
    if deleted:
        download_config = DownloadConfig(delete_extracted=True, cache_dir=cache_dir / "downloads")
        ds = load_dataset(
            "json", split="train", data_files=data_files, cache_dir=cache_dir, download_config=download_config
        )
    else:  # default
        ds = load_dataset("json", split="train", data_files=data_files, cache_dir=cache_dir)
    assert ds[0] == {"col_1": "0", "col_2": 0, "col_3": 0.0}
    # The "extracted" folder is empty only when `delete_extracted=True` was passed.
    assert (sorted((cache_dir / "downloads" / "extracted").iterdir()) == []) is deleted
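# A sketch of what the `jsonl_gz_path` fixture could produce (an assumption for illustration,
# not the project's actual fixture), consistent with the first row checked in the assertion above:
import gzip
import json

import pytest


@pytest.fixture
def jsonl_gz_path(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "dataset.jsonl.gz"
    rows = [{"col_1": str(i), "col_2": i, "col_3": float(i)} for i in range(4)]
    with gzip.open(path, "wt", encoding="utf-8") as f:
        f.write("\n".join(json.dumps(row) for row in rows))
    return str(path)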
@pytest.mark.parametrize("paths_type", [str, list, dict])  # parametrization inferred from the body (assumed, not in the original snippet)
def test_download_manager_extract(paths_type, xz_file, text_file):
    filename = str(xz_file)
    if issubclass(paths_type, str):
        paths = filename
    elif issubclass(paths_type, list):
        paths = [filename]
    elif issubclass(paths_type, dict):
        paths = {"train": filename}
    dataset_name = "dummy"
    cache_dir = xz_file.parent
    extracted_subdir = "extracted"
    download_config = DownloadConfig(
        cache_dir=cache_dir,
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    extracted_paths = dl_manager.extract(paths)
    # Normalize the str/list/dict return types to flat sequences so a single loop can check them.
    input_paths = paths
    if isinstance(paths, str):
        extracted_paths = [extracted_paths]
        input_paths = [paths]
    elif isinstance(paths, dict):
        assert "train" in extracted_paths.keys()
        extracted_paths = extracted_paths.values()
        input_paths = paths.values()
    assert extracted_paths
    for extracted_path, input_path in zip(extracted_paths, input_paths):
        assert extracted_path == dl_manager.extracted_paths[input_path]
        extracted_path = Path(extracted_path)
        parts = extracted_path.parts
        assert parts[-1] == hash_url_to_filename(input_path, etag=None)
        assert parts[-2] == extracted_subdir
        assert extracted_path.exists()
        extracted_file_content = extracted_path.read_text()
        expected_file_content = text_file.read_text()
        assert extracted_file_content == expected_file_content
def _autogenerate_dummy_data(self, dataset_builder, mock_dl_manager, keep_uncompressed) -> Optional[bool]:
    dl_cache_dir = (
        os.path.join(self._cache_dir, config.DOWNLOADED_DATASETS_DIR)
        if self._cache_dir
        else config.DOWNLOADED_DATASETS_PATH
    )
    download_config = DownloadConfig(cache_dir=dl_cache_dir)
    dl_manager = DummyDataGeneratorDownloadManager(
        dataset_name=self._dataset_name, mock_download_manager=mock_dl_manager, download_config=download_config
    )
    dataset_builder._split_generators(dl_manager)
    mock_dl_manager.load_existing_dummy_data = False  # don't use real dummy data
    dl_manager.auto_generate_dummy_data_folder(
        n_lines=self._n_lines,
        json_field=self._json_field,
        xml_tag=self._xml_tag,
        match_text_files=self._match_text_files,
        encoding=self._encoding,
    )
    if not keep_uncompressed:
        path_to_dataset = os.path.join(mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name)
        dl_manager.compress_autogenerated_dummy_data(path_to_dataset)
        # now test that the dummy_data.zip file actually works
        mock_dl_manager.load_existing_dummy_data = True  # use real dummy data
        n_examples_per_split = {}
        os.makedirs(dataset_builder._cache_dir, exist_ok=True)
        try:
            split_generators = dataset_builder._split_generators(mock_dl_manager)
            for split_generator in split_generators:
                dataset_builder._prepare_split(split_generator, check_duplicate_keys=False)
                n_examples_per_split[split_generator.name] = split_generator.split_info.num_examples
        except OSError as e:
            logger.error(
                f"Failed to load dummy data for config '{dataset_builder.config.name}'.\nOriginal error:\n" + str(e)
            )
            return False
        else:
            if all(n_examples > 0 for n_examples in n_examples_per_split.values()):
                logger.warning(
                    f"Dummy data generation done and dummy data test succeeded for config '{dataset_builder.config.name}'."
                )
                return True
            else:
                empty_splits = [
                    split_name for split_name in n_examples_per_split if n_examples_per_split[split_name] == 0
                ]
                logger.warning(
                    f"Dummy data generation done but dummy data test failed since splits {empty_splits} "
                    f"have 0 examples for config '{dataset_builder.config.name}'."
                )
                return False
    else:
        generated_dummy_data_dir = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
        logger.info(
            f"Dummy data generated in directory '{generated_dummy_data_dir}' but kept uncompressed. "
            "Please compress this directory into a zip file to use it for dummy data tests."
        )
def run(self):
    import apache_beam as beam

    if self._name is not None and self._all_configs:
        print("Both parameters `name` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    dataset_module = dataset_module_factory(path)
    builder_cls = import_main_class(dataset_module.module_path)
    builders: List[DatasetBuilder] = []
    if self._beam_pipeline_options:
        beam_options = beam.options.pipeline_options.PipelineOptions(
            flags=[f"--{opt.strip()}" for opt in self._beam_pipeline_options.split(",") if opt]
        )
    else:
        beam_options = None
    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        for builder_config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(
                    name=builder_config.name,
                    data_dir=self._data_dir,
                    hash=dataset_module.hash,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                    base_path=dataset_module.builder_kwargs.get("base_path"),
                )
            )
    else:
        builders.append(
            builder_cls(
                name=name,
                data_dir=self._data_dir,
                beam_options=beam_options,
                cache_dir=self._cache_dir,
                base_path=dataset_module.builder_kwargs.get("base_path"),
                **self._config_kwargs,
            )
        )

    for builder in builders:
        builder.download_and_prepare(
            download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
            if not self._force_redownload
            else DownloadMode.FORCE_REDOWNLOAD,
            download_config=DownloadConfig(cache_dir=config.DOWNLOADED_DATASETS_PATH),
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        if self._save_infos:
            builder._save_infos()

    print("Apache beam run successful.")

    # If save_infos=True, the dataset infos file is created next to the loaded module file.
    # Let's move it to the original directory of the dataset script, to allow the user to
    # upload them on S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME)
        name = Path(path).name + ".py"
        combined_path = os.path.join(path, name)
        if os.path.isfile(path):
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            dataset_dir = path
        else:  # in case of a remote dataset
            print(f"Dataset Infos file saved at {dataset_infos_path}")
            exit(1)

        # Move the dataset infos file back to the user's dataset directory
        user_dataset_infos_path = os.path.join(dataset_dir, config.DATASETDICT_INFOS_FILENAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print(f"Dataset Infos file saved at {user_dataset_infos_path}")