Example #1
 def setUp(self):
     self.hf_modules_cache = tempfile.mkdtemp()
     self.cache_dir = tempfile.mkdtemp()
     self.download_config = DownloadConfig(cache_dir=self.cache_dir)
     self.dynamic_modules_path = datasets.load.init_dynamic_modules(
         name="test_datasets_modules_" +
         os.path.basename(self.hf_modules_cache),
         hf_modules_cache=self.hf_modules_cache,
     )
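The matching tearDown is not shown in the snippet; a minimal sketch, assuming the temporary directories created in setUp just need to be removed:

import shutil

def tearDown(self):
    # Remove the temp caches created in setUp (illustrative; the original
    # test class may perform additional cleanup, e.g. for dynamic modules).
    shutil.rmtree(self.hf_modules_cache, ignore_errors=True)
    shutil.rmtree(self.cache_dir, ignore_errors=True)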
Example #2
 def load_builder_class(self, dataset_name, is_local=False):
     # Download/copy dataset script
     if is_local:
         module_path, _ = prepare_module("./datasets/" + dataset_name)
     else:
         module_path, _ = prepare_module(dataset_name, download_config=DownloadConfig(force_download=True))
     # Get dataset builder class
     builder_cls = import_main_class(module_path)
     return builder_cls
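For reference, the same flow outside the test class: a minimal sketch assuming the pre-1.13 datasets API used above ("squad" is only an illustrative dataset name):

from datasets.load import import_main_class, prepare_module
from datasets.utils.file_utils import DownloadConfig

# force_download=True re-fetches the script even if it is already cached.
module_path, _ = prepare_module("squad", download_config=DownloadConfig(force_download=True))
builder_cls = import_main_class(module_path)
print(builder_cls.__name__)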
Example #3
 def _autogenerate_dummy_data(self, dataset_builder, mock_dl_manager, keep_uncompressed) -> Optional[bool]:
     dl_cache_dir = (
         os.path.join(self._cache_dir, config.DOWNLOADED_DATASETS_DIR)
         if self._cache_dir
         else config.DOWNLOADED_DATASETS_PATH
     )
     download_config = DownloadConfig(cache_dir=dl_cache_dir)
     dl_manager = DummyDataGeneratorDownloadManager(
         dataset_name=self._dataset_name, mock_download_manager=mock_dl_manager, download_config=download_config
     )
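     # Running _split_generators with the generator-backed manager records the
     # URLs/paths the dataset script requests, so a dummy file can be created for each.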
     dataset_builder._split_generators(dl_manager)
     mock_dl_manager.load_existing_dummy_data = False  # don't use real dummy data
     dl_manager.auto_generate_dummy_data_folder(
         n_lines=self._n_lines,
         json_field=self._json_field,
         xml_tag=self._xml_tag,
         match_text_files=self._match_text_files,
         encoding=self._encoding,
     )
     if not keep_uncompressed:
         path_to_dataset = os.path.join(mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name)
         dl_manager.compress_autogenerated_dummy_data(path_to_dataset)
         # now test that the dummy_data.zip file actually works
         mock_dl_manager.load_existing_dummy_data = True  # use real dummy data
         n_examples_per_split = {}
         os.makedirs(dataset_builder._cache_dir, exist_ok=True)
         try:
             split_generators = dataset_builder._split_generators(mock_dl_manager)
             for split_generator in split_generators:
                 dataset_builder._prepare_split(split_generator, check_duplicate_keys=False)
                 n_examples_per_split[split_generator.name] = split_generator.split_info.num_examples
         except OSError as e:
             logger.error(
                 f"Failed to load dummy data for config '{dataset_builder.config.name}'.\nOriginal error:\n"
                 + str(e)
             )
             return False
         else:
             if all(n_examples > 0 for n_examples in n_examples_per_split.values()):
                 logger.warning(
                     f"Dummy data generation done and dummy data test succeeded for config '{dataset_builder.config.name}'."
                 )
                 return True
             else:
                 empty_splits = [
                     split_name for split_name in n_examples_per_split if n_examples_per_split[split_name] == 0
                 ]
                 logger.warning(
                     f"Dummy data generation done but dummy data test failed since splits {empty_splits} have 0 examples for config '{dataset_builder.config.name}'."
                 )
                 return False
     else:
         generated_dummy_data_dir = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
         logger.info(
             f"Dummy data generated in directory '{generated_dummy_data_dir}' but kept uncompressed. "
             "Please compress this directory into a zip file to use it for dummy data tests."
         )
Example #4
def test_cached_path_extract(xz_file, tmp_path, text_file):
    filename = xz_file
    cache_dir = tmp_path / "cache"
    download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
    extracted_filename = cached_path(filename, download_config=download_config)
    with open(extracted_filename) as f:
        extracted_file_content = f.read()
    with open(text_file) as f:
        expected_file_content = f.read()
    assert extracted_file_content == expected_file_content
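These tests lean on small pytest fixtures that the snippet omits; an illustrative sketch (only the fixture names come from the signatures, the bodies are assumptions):

import lzma
import pytest

@pytest.fixture
def text_file(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.txt"
    path.write_text("hello world")  # illustrative content
    return path

@pytest.fixture
def xz_file(text_file, tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with lzma.open(path, "wt") as f:
        f.write(text_file.read_text())
    return path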
Example #5
 def test_load_real_dataset(self, dataset_name):
     path = "./datasets/" + dataset_name
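     # local_files_only=True makes prepare_module resolve the script from the
     # local checkout only, without touching the network.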
     module_path, _ = prepare_module(path, download_config=DownloadConfig(local_files_only=True), dataset=True)
     builder_cls = import_main_class(module_path, dataset=True)
     name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
     with tempfile.TemporaryDirectory() as temp_cache_dir:
         dataset = load_dataset(
             path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD
         )
         for split in dataset.keys():
             self.assertTrue(len(dataset[split]) > 0)
         del dataset
Example #6
# Parametrization inferred from the if/else below (assumes pytest is imported):
@pytest.mark.parametrize("deleted", [True, False])
def test_load_dataset_deletes_extracted_files(deleted, jsonl_gz_path, tmp_path):
    data_files = jsonl_gz_path
    cache_dir = tmp_path / "cache"
    if deleted:
        download_config = DownloadConfig(delete_extracted=True, cache_dir=cache_dir / "downloads")
        ds = load_dataset(
            "json", split="train", data_files=data_files, cache_dir=cache_dir, download_config=download_config
        )
    else:  # default
        ds = load_dataset("json", split="train", data_files=data_files, cache_dir=cache_dir)
    assert ds[0] == {"col_1": "0", "col_2": 0, "col_3": 0.0}
    assert (sorted((cache_dir / "downloads" / "extracted").iterdir()) == []) is deleted
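The jsonl_gz_path fixture is not shown; an illustrative sketch that matches the row asserted above (the real fixture lives in the suite's conftest):

import gzip
import json
import pytest

@pytest.fixture
def jsonl_gz_path(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.jsonl.gz"
    rows = [{"col_1": str(i), "col_2": i, "col_3": float(i)} for i in range(4)]
    with gzip.open(path, "wt", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")
    return str(path)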
Example #7
 def load_builder_class(self, dataset_name, is_local=False):
     # Download/copy dataset script
     if is_local:
         dataset_module = dataset_module_factory(
             os.path.join("datasets", dataset_name))
     else:
         dataset_module = dataset_module_factory(
             dataset_name,
             download_config=DownloadConfig(force_download=True))
     # Get dataset builder class
     builder_cls = import_main_class(dataset_module.module_path)
     return builder_cls
Example #8
# Parametrization inferred from the input_paths mapping below (assumes pytest is imported):
@pytest.mark.parametrize("compression_format", ["gzip", "xz", "zstd"])
def test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path,
                             tmp_path, text_file):
    input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_path}
    input_path = str(input_paths[compression_format])
    cache_dir = tmp_path / "cache"
    download_config = DownloadConfig(cache_dir=cache_dir,
                                     extract_compressed_file=True)
    extracted_path = cached_path(input_path, download_config=download_config)
    with open(extracted_path) as f:
        extracted_file_content = f.read()
    with open(text_file) as f:
        expected_file_content = f.read()
    assert extracted_file_content == expected_file_content
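gz_file mirrors the xz_file sketch above; zstd additionally needs the zstandard package. An illustrative zstd_path fixture (an assumption, not the suite's real conftest code):

import pytest
import zstandard

@pytest.fixture
def zstd_path(tmp_path_factory, text_file):
    path = tmp_path_factory.mktemp("data") / "file.txt.zst"
    with open(path, "wb") as f:
        f.write(zstandard.ZstdCompressor().compress(text_file.read_bytes()))
    return path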
Example #9
# Parametrization inferred from the branches below (assumes pytest is imported):
@pytest.mark.parametrize("default_extracted", [True, False])
@pytest.mark.parametrize("default_cache_dir", [True, False])
def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file,
                                 tmp_path, monkeypatch):
    custom_cache_dir = "custom_cache"
    custom_extracted_dir = "custom_extracted_dir"
    custom_extracted_path = tmp_path / "custom_extracted_path"
    if default_extracted:
        expected = ("downloads" if default_cache_dir else custom_cache_dir,
                    "extracted")
    else:
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR",
                            custom_extracted_dir)
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH",
                            str(custom_extracted_path))
        expected = custom_extracted_path.parts[-2:] if default_cache_dir else (
            custom_cache_dir, custom_extracted_dir)

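    # `expected` now holds the two trailing directory names that the parent of
    # the extracted file should have for this parametrization.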
    filename = xz_file
    download_config = (DownloadConfig(
        extract_compressed_file=True) if default_cache_dir else DownloadConfig(
            cache_dir=tmp_path / custom_cache_dir,
            extract_compressed_file=True))
    extracted_file_path = cached_path(filename,
                                      download_config=download_config)
    assert Path(extracted_file_path).parent.parts[-2:] == expected
Example #10
# Parametrization inferred from the issubclass branches below (assumes pytest is imported):
@pytest.mark.parametrize("urls_type", [str, list, dict])
def test_download_manager_download(urls_type, tmp_path, monkeypatch):
    import requests

    monkeypatch.setattr(requests, "request", mock_request)

    url = URL
    if issubclass(urls_type, str):
        urls = url
    elif issubclass(urls_type, list):
        urls = [url]
    elif issubclass(urls_type, dict):
        urls = {"train": url}
    dataset_name = "dummy"
    cache_subdir = "downloads"
    cache_dir_root = str(tmp_path)
    download_config = DownloadConfig(
        cache_dir=os.path.join(cache_dir_root, cache_subdir),
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name,
                                 download_config=download_config)
    downloaded_paths = dl_manager.download(urls)
    input_urls = urls
    # Normalize str/list/dict results into parallel lists of paths and URLs.
    if isinstance(urls, str):
        downloaded_paths = [downloaded_paths]
        input_urls = [urls]
    elif isinstance(urls, dict):
        assert "train" in downloaded_paths.keys()
        downloaded_paths = downloaded_paths.values()
        input_urls = urls.values()
    assert downloaded_paths
    for downloaded_path, input_url in zip(downloaded_paths, input_urls):
        assert downloaded_path == dl_manager.downloaded_paths[input_url]
        downloaded_path = Path(downloaded_path)
        parts = downloaded_path.parts
        assert parts[-1] == HASH
        assert parts[-2] == cache_subdir
        assert downloaded_path.exists()
        content = downloaded_path.read_text()
        assert content == CONTENT
        # A sidecar .json file stores the download metadata next to the file.
        metadata_downloaded_path = downloaded_path.with_suffix(".json")
        assert metadata_downloaded_path.exists()
        metadata_content = json.loads(metadata_downloaded_path.read_text())
        assert metadata_content == {"url": URL, "etag": None}
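Example #10 also references module-level helpers (URL, CONTENT, HASH, mock_request) that were stripped from the snippet; a minimal sketch (values illustrative; HASH is derived so the parts[-1] assertion holds):

from datasets.utils.file_utils import hash_url_to_filename

URL = "https://example.com/file.txt"  # illustrative
CONTENT = "hello world"               # illustrative
HASH = hash_url_to_filename(URL, etag=None)

class MockResponse:
    status_code = 200
    headers = {}
    cookies = {}

    def iter_content(self, **kwargs):
        yield CONTENT.encode("utf-8")

def mock_request(*args, **kwargs):
    # Stands in for requests.request via monkeypatch in the test above.
    return MockResponse()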
Example #11
 def test_load_real_dataset_all_configs(self, dataset_name):
     path = "./datasets/" + dataset_name
     dataset_module = dataset_module_factory(
         path, download_config=DownloadConfig(local_files_only=True))
     builder_cls = import_main_class(dataset_module.module_path)
     config_names = ([
         config.name for config in builder_cls.BUILDER_CONFIGS
     ] if len(builder_cls.BUILDER_CONFIGS) > 0 else [None])
     for name in config_names:
         with tempfile.TemporaryDirectory() as temp_cache_dir:
             dataset = load_dataset(
                 path,
                 name=name,
                 cache_dir=temp_cache_dir,
                 download_mode=GenerateMode.FORCE_REDOWNLOAD)
             for split in dataset.keys():
                 self.assertTrue(len(dataset[split]) > 0)
             del dataset
Example #12
# Parametrization inferred from the issubclass branches below (assumes pytest is imported):
@pytest.mark.parametrize("paths_type", [str, list, dict])
def test_download_manager_extract(paths_type, xz_file, text_file):
    filename = str(xz_file)
    if issubclass(paths_type, str):
        paths = filename
    elif issubclass(paths_type, list):
        paths = [filename]
    elif issubclass(paths_type, dict):
        paths = {"train": filename}
    dataset_name = "dummy"
    cache_dir = xz_file.parent
    extracted_subdir = "extracted"
    download_config = DownloadConfig(
        cache_dir=cache_dir,
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name,
                                 download_config=download_config)
    extracted_paths = dl_manager.extract(paths)
    input_paths = paths
    # Normalize str/list/dict results into parallel lists of extracted and input paths.
    if isinstance(paths, str):
        extracted_paths = [extracted_paths]
        input_paths = [paths]
    elif isinstance(paths, dict):
        assert "train" in extracted_paths.keys()
        extracted_paths = extracted_paths.values()
        input_paths = paths.values()
    assert extracted_paths
    for extracted_path, input_path in zip(extracted_paths, input_paths):
        assert extracted_path == dl_manager.extracted_paths[input_path]
        extracted_path = Path(extracted_path)
        parts = extracted_path.parts
        # Extracted files are named after the hash of their source path.
        assert parts[-1] == hash_url_to_filename(input_path, etag=None)
        assert parts[-2] == extracted_subdir
        assert extracted_path.exists()
        extracted_file_content = extracted_path.read_text()
        expected_file_content = text_file.read_text()
        assert extracted_file_content == expected_file_content
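Pulling the recurring pattern together, a minimal standalone sketch (file name illustrative; imports follow the datasets 1.x layout used in these examples):

import lzma
from datasets.utils.file_utils import DownloadConfig, cached_path

# Write a small .xz file, then let cached_path extract it into the cache.
with lzma.open("example.txt.xz", "wt") as f:  # illustrative file name
    f.write("hello world")

download_config = DownloadConfig(cache_dir="./cache", extract_compressed_file=True)
print(cached_path("example.txt.xz", download_config=download_config))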