def run(self):
        set_verbosity_warning()
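        # resolve the dataset script and import its DatasetBuilder subclass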
        dataset_module = dataset_module_factory(self._path_to_dataset)
        builder_cls = import_main_class(dataset_module.module_path)

        # fall back to a single `None` config if the builder declares no configs
        builder_configs = builder_cls.BUILDER_CONFIGS or [None]
        auto_generate_results = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            for builder_config in builder_configs:
                if builder_config is None:
                    name = None
                    version = builder_cls.VERSION
                else:
                    version = builder_config.version
                    name = builder_config.name

                dataset_builder = builder_cls(name=name, hash=dataset_module.hash, cache_dir=tmp_dir)
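                # mock download manager that resolves local dummy data files instead of downloading real data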
                mock_dl_manager = MockDownloadManager(
                    dataset_name=self._dataset_name,
                    config=builder_config,
                    version=version,
                    use_local_dummy_data=True,
                    load_existing_dummy_data=False,
                )

                if self._auto_generate:
                    auto_generate_results.append(
                        self._autogenerate_dummy_data(
                            dataset_builder=dataset_builder,
                            mock_dl_manager=mock_dl_manager,
                            keep_uncompressed=self._keep_uncompressed,
                        )
                    )
                else:
                    self._print_dummy_data_instructions(
                        dataset_builder=dataset_builder, mock_dl_manager=mock_dl_manager
                    )
            if self._auto_generate and not self._keep_uncompressed:
                if all(auto_generate_results):
                    print(f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'")
                else:
                    print(f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'")
Example #2
    def check_load_dataset(self,
                           dataset_name,
                           configs,
                           is_local=False,
                           use_local_dummy_data=False):
        for config in configs:
            with tempfile.TemporaryDirectory() as processed_temp_dir, \
                    tempfile.TemporaryDirectory() as raw_temp_dir:

                # create config and dataset
                dataset_builder_cls = self.load_builder_class(
                    dataset_name, is_local=is_local)
                name = config.name if config is not None else None
                dataset_builder = dataset_builder_cls(
                    name=name, cache_dir=processed_temp_dir)

                # TODO: skip Beam datasets and datasets that lack dummy data for now
                if not dataset_builder.test_dummy_data:
                    logger.info("Skip tests for this dataset for now")
                    return

                if config is not None:
                    version = config.version
                else:
                    version = dataset_builder.VERSION

                def check_if_url_is_valid(url):
                    if is_remote_url(url) and "\\" in url:
                        raise ValueError(
                            f"Bad remote url '{url} since it contains a backslash"
                        )

                # create mock data loader manager that has a special download_and_extract() method to download dummy data instead of real data
                mock_dl_manager = MockDownloadManager(
                    dataset_name=dataset_name,
                    config=config,
                    version=version,
                    cache_dir=raw_temp_dir,
                    use_local_dummy_data=use_local_dummy_data,
                    download_callbacks=[check_if_url_is_valid],
                )

                # packaged datasets like csv, text, json or pandas require some data files
                if dataset_builder.__class__.__name__.lower() in _PACKAGED_DATASETS_MODULES:
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = get_packaged_dataset_dummy_data_files(
                        dataset_builder.__class__.__name__.lower(),
                        path_to_dummy_data)

                # mock size needed for dummy data instead of actual dataset
                if dataset_builder.info is not None:
                    # approximate upper bound on the order of magnitude of the dummy data files
                    one_mega_byte = 2 << 19  # 2 << 19 == 2**20 bytes == 1 MiB
                    dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                    dataset_builder.info.download_size = one_mega_byte
                    dataset_builder.info.dataset_size = one_mega_byte

                # generate examples from dummy data
                dataset_builder.download_and_prepare(
                    dl_manager=mock_dl_manager,
                    download_mode=GenerateMode.FORCE_REDOWNLOAD,
                    ignore_verifications=True,
                    try_from_hf_gcs=False,
                )

                # get dataset
                dataset = dataset_builder.as_dataset(ignore_verifications=True)

                # check that all expected splits are present in the loaded dataset
                self.parent.assertListEqual(
                    sorted(dataset_builder.info.splits.keys()),
                    sorted(dataset))
                for split in dataset_builder.info.splits.keys():
                    # check that each loaded split is not empty
                    self.parent.assertTrue(len(dataset[split]) > 0)
                del dataset
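
The download_callbacks hook above lets the test validate every URL the builder requests before it is fetched. A minimal sketch of the same pattern in isolation; fetch() is a hypothetical stand-in for the real download manager, not a datasets API.

# Hypothetical illustration of the download-callback pattern used above.
def reject_backslash_urls(url):
    # mirror check_if_url_is_valid: remote URLs must not contain backslashes
    if url.startswith(("http://", "https://")) and "\\" in url:
        raise ValueError(f"Bad remote url '{url}' since it contains a backslash")

def fetch(url, download_callbacks=()):
    # `fetch` stands in for the download manager: run every callback first
    for callback in download_callbacks:
        callback(url)  # a callback may raise to abort the download
    # ...the actual download would happen here

fetch("https://example.com/dummy_data.zip", download_callbacks=[reject_backslash_urls])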