Example #1
    def test_dummy_data_autogenerate(self):
        n_lines = 5

        with TemporaryDirectory() as tmp_dir:
            with open(os.path.join(tmp_dir, "train.txt"),
                      "w",
                      encoding="utf-8") as f:
                f.write("foo\nbar\n" * 10)
            with open(os.path.join(tmp_dir, "test.txt"), "w",
                      encoding="utf-8") as f:
                f.write("foo\nbar\n" * 10)

            class MockDownloadManagerWithCustomDatasetsScriptsDir(
                    MockDownloadManager):
                datasets_scripts_dir = os.path.join(tmp_dir, "datasets")

            cache_dir = os.path.join(tmp_dir, "cache")
            os.makedirs(cache_dir, exist_ok=True)
            dataset_builder = DummyBuilder(tmp_test_dir=tmp_dir,
                                           cache_dir=cache_dir)
            mock_dl_manager = MockDownloadManagerWithCustomDatasetsScriptsDir(
                dataset_name=dataset_builder.name,
                config=None,
                version=Version("0.0.0"),
                use_local_dummy_data=True,
                cache_dir=cache_dir,
                load_existing_dummy_data=False,  # dummy data don't exist yet
            )
            download_config = DownloadConfig(cache_dir=os.path.join(
                tmp_dir, datasets.config.DOWNLOADED_DATASETS_DIR))
            dl_manager = DummyDataGeneratorDownloadManager(
                dataset_name=dataset_builder.name,
                mock_download_manager=mock_dl_manager,
                download_config=download_config,
            )
            # First pass: run the builder with the dummy-data-generating manager so that
            # every download call is recorded, then drop the resulting cache.
            dataset_builder.download_and_prepare(dl_manager=dl_manager,
                                                 try_from_hf_gcs=False)
            shutil.rmtree(dataset_builder._cache_dir)

            # Auto-generate dummy data files with n_lines lines each and compress
            # them into the dummy data archive under the dataset's scripts directory.
            dl_manager.auto_generate_dummy_data_folder(n_lines=n_lines)
            path_to_dataset = os.path.join(
                mock_dl_manager.datasets_scripts_dir,
                mock_dl_manager.dataset_name)
            dl_manager.compress_autogenerated_dummy_data(path_to_dataset)

            # Second pass: build the dataset again, this time reading the freshly
            # generated dummy data through the mock download manager.
            mock_dl_manager.load_existing_dummy_data = True
            dataset_builder.download_and_prepare(dl_manager=mock_dl_manager,
                                                 ignore_verifications=True,
                                                 try_from_hf_gcs=False)
            dataset = dataset_builder.as_dataset(split="train")
            self.assertEqual(len(dataset), n_lines)
            del dataset
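
DummyBuilder is a helper defined elsewhere in the test suite and is not shown on this page. As a rough idea of the shape it has to have for the assertions above to hold, here is a minimal sketch, assuming it reads the two text files written to tmp_test_dir through the download manager and yields one example per line; the real helper may differ.

import os

import datasets


class DummyBuilder(datasets.GeneratorBasedBuilder):
    """Hypothetical minimal builder: one {"text": ...} example per line of train.txt / test.txt."""

    def __init__(self, *args, tmp_test_dir=None, **kwargs):
        self.tmp_test_dir = tmp_test_dir
        super().__init__(*args, **kwargs)

    def _info(self):
        return datasets.DatasetInfo(features=datasets.Features({"text": datasets.Value("string")}))

    def _split_generators(self, dl_manager):
        # These download calls are what DummyDataGeneratorDownloadManager records and
        # later replaces with auto-generated dummy files of n_lines each.
        train_path = dl_manager.download(os.path.join(self.tmp_test_dir, "train.txt"))
        test_path = dl_manager.download(os.path.join(self.tmp_test_dir, "test.txt"))
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": test_path}),
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"text": line.rstrip("\n")}

With a builder of this shape, the auto-generated dummy train file contains exactly n_lines lines, which is why the final assertion expects a train split of n_lines examples.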
Example #2
def test_download_manager_download(urls_type, tmp_path, monkeypatch):
    import requests

    # Patch requests so the test never performs a real HTTP request
    monkeypatch.setattr(requests, "request", mock_request)

    url = URL
    if issubclass(urls_type, str):
        urls = url
    elif issubclass(urls_type, list):
        urls = [url]
    elif issubclass(urls_type, dict):
        urls = {"train": url}
    dataset_name = "dummy"
    cache_subdir = "downloads"
    cache_dir_root = str(tmp_path)
    download_config = DownloadConfig(
        cache_dir=os.path.join(cache_dir_root, cache_subdir),
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name,
                                 download_config=download_config)
    downloaded_paths = dl_manager.download(urls)
    input_urls = urls
    # Normalize to iterables so the same assertions cover str, list and dict inputs
    if isinstance(urls, str):
        downloaded_paths = [downloaded_paths]
        input_urls = [urls]
    elif isinstance(urls, dict):
        assert "train" in downloaded_paths.keys()
        downloaded_paths = downloaded_paths.values()
        input_urls = urls.values()
    assert downloaded_paths
    for downloaded_path, input_url in zip(downloaded_paths, input_urls):
        assert downloaded_path == dl_manager.downloaded_paths[input_url]
        downloaded_path = Path(downloaded_path)
        parts = downloaded_path.parts
        # The cached file is named after the hash of its URL and lives under "downloads"
        assert parts[-1] == HASH
        assert parts[-2] == cache_subdir
        assert downloaded_path.exists()
        content = downloaded_path.read_text()
        assert content == CONTENT
        # A sidecar .json file stores the original URL and ETag of the download
        metadata_downloaded_path = downloaded_path.with_suffix(".json")
        assert metadata_downloaded_path.exists()
        metadata_content = json.loads(metadata_downloaded_path.read_text())
        assert metadata_content == {"url": URL, "etag": None}
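
The parametrization and helpers for this test are not shown: urls_type is presumably parametrized over str, list and dict (e.g. pytest.mark.parametrize("urls_type", [str, list, dict])), and URL, CONTENT, HASH and mock_request are module-level helpers. A rough sketch of what they could look like, with placeholder URL and content values and an import path that may vary between datasets versions:

from unittest.mock import MagicMock

from datasets.utils.file_utils import hash_url_to_filename

URL = "https://huggingface.co/dummy/file.txt"  # placeholder value, not the real constant
CONTENT = "foo bar"                            # placeholder value, not the real constant
HASH = hash_url_to_filename(URL, etag=None)    # expected name of the cached file


def mock_request(method, url, **kwargs):
    # Stand-in for requests.request: always answers 200 with CONTENT,
    # so the test never touches the network.
    response = MagicMock()
    response.status_code = 200
    response.headers = {}
    response.iter_content = lambda chunk_size=None: iter([CONTENT.encode("utf-8")])
    return response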
Example #3
def test_download_manager_extract(paths_type, xz_file, text_file):
    filename = str(xz_file)
    if issubclass(paths_type, str):
        paths = filename
    elif issubclass(paths_type, list):
        paths = [filename]
    elif issubclass(paths_type, dict):
        paths = {"train": filename}
    dataset_name = "dummy"
    cache_dir = xz_file.parent
    extracted_subdir = "extracted"
    download_config = DownloadConfig(
        cache_dir=cache_dir,
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name,
                                 download_config=download_config)
    extracted_paths = dl_manager.extract(paths)
    input_paths = paths
    # Normalize to iterables so the same assertions cover str, list and dict inputs
    if isinstance(paths, str):
        extracted_paths = [extracted_paths]
        input_paths = [paths]
    elif isinstance(paths, dict):
        assert "train" in extracted_paths.keys()
        extracted_paths = extracted_paths.values()
        input_paths = paths.values()
    assert extracted_paths
    for extracted_path, input_path in zip(extracted_paths, input_paths):
        assert extracted_path == dl_manager.extracted_paths[input_path]
        extracted_path = Path(extracted_path)
        parts = extracted_path.parts
        # Extracted files are named after the hash of the archive path and live under "extracted"
        assert parts[-1] == hash_url_to_filename(input_path, etag=None)
        assert parts[-2] == extracted_subdir
        assert extracted_path.exists()
        extracted_file_content = extracted_path.read_text()
        expected_file_content = text_file.read_text()
        assert extracted_file_content == expected_file_content
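
As in the previous example, the fixtures are defined elsewhere. xz_file and text_file could plausibly be built as below; the file names and contents are assumptions, only the relationship between the two files (same text, one xz-compressed) matters for the test.

import lzma

import pytest


@pytest.fixture
def text_file(tmp_path_factory):
    # Plain-text reference file; extracting the .xz file must reproduce this content
    path = tmp_path_factory.mktemp("data") / "file.txt"
    path.write_text("foo bar\n", encoding="utf-8")
    return path


@pytest.fixture
def xz_file(tmp_path_factory, text_file):
    # The same content compressed with xz; this is what dl_manager.extract() receives
    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with lzma.open(path, "wb") as f:
        f.write(text_file.read_text(encoding="utf-8").encode("utf-8"))
    return path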
Example #4
    def run(self):
        import apache_beam as beam

        if self._name is not None and self._all_configs:
            print("The parameters `name` and `all_configs` can't be used at the same time.")
            exit(1)
        path, name = self._dataset, self._name
        # Load the dataset script and import its DatasetBuilder subclass
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)
        builders: List[DatasetBuilder] = []
        if self._beam_pipeline_options:
            beam_options = beam.options.pipeline_options.PipelineOptions(
                flags=["--%s" % opt.strip() for opt in self._beam_pipeline_options.split(",") if opt]
            )
        else:
            beam_options = None
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for builder_config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(
                        name=builder_config.name,
                        data_dir=self._data_dir,
                        hash=hash,
                        beam_options=beam_options,
                        cache_dir=self._cache_dir,
                    )
                )
        else:
            builders.append(
                builder_cls(name=name, data_dir=self._data_dir, beam_options=beam_options, cache_dir=self._cache_dir)
            )

        for builder in builders:
            builder.download_and_prepare(
                download_mode=GenerateMode.REUSE_CACHE_IF_EXISTS
                if not self._force_redownload
                else GenerateMode.FORCE_REDOWNLOAD,
                download_config=DownloadConfig(cache_dir=os.path.join(config.HF_DATASETS_CACHE, "downloads")),
                save_infos=self._save_infos,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )

        print("Apache beam run successful.")

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME)

            name = Path(path).name + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print("Dataset Infos file saved at {}".format(dataset_infos_path))
                exit(1)

            # Move the dataset infos file back to the user's dataset directory
            user_dataset_infos_path = os.path.join(dataset_dir, config.DATASETDICT_INFOS_FILENAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print("Dataset Infos file saved at {}".format(user_dataset_infos_path))