def run(self):
    set_verbosity_warning()
    dataset_module = dataset_module_factory(self._path_to_dataset)
    builder_cls = import_main_class(dataset_module.module_path)

    # use `None` as config if no configs
    builder_configs = builder_cls.BUILDER_CONFIGS or [None]
    auto_generate_results = []
    with tempfile.TemporaryDirectory() as tmp_dir:
        for builder_config in builder_configs:
            if builder_config is None:
                name = None
                version = builder_cls.VERSION
            else:
                version = builder_config.version
                name = builder_config.name
            dataset_builder = builder_cls(name=name, hash=dataset_module.hash, cache_dir=tmp_dir)
            mock_dl_manager = MockDownloadManager(
                dataset_name=self._dataset_name,
                config=builder_config,
                version=version,
                use_local_dummy_data=True,
                load_existing_dummy_data=False,
            )

            if self._auto_generate:
                auto_generate_results.append(
                    self._autogenerate_dummy_data(
                        dataset_builder=dataset_builder,
                        mock_dl_manager=mock_dl_manager,
                        keep_uncompressed=self._keep_uncompressed,
                    )
                )
            else:
                self._print_dummy_data_instructions(
                    dataset_builder=dataset_builder, mock_dl_manager=mock_dl_manager
                )
    if self._auto_generate and not self._keep_uncompressed:
        if all(auto_generate_results):
            print(f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'")
        else:
            print(f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'")
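# Hedged usage sketch (not from the source): `run` reads `self._path_to_dataset`,
# `self._dataset_name`, `self._auto_generate`, and `self._keep_uncompressed`, which the
# real CLI wires up from argparse. One plausible way to drive it from a script, assuming
# the enclosing class is named `DummyDataCommand` and its constructor accepts these
# values directly (both are assumptions, not confirmed API):
#
#     command = DummyDataCommand(
#         path_to_dataset="./datasets/my_dataset",  # hypothetical local dataset script
#         auto_generate=True,                       # generate dummy data automatically
#         keep_uncompressed=False,                  # zip the generated dummy files
#     )
#     command.run()
#
# With `auto_generate=False`, `run` instead prints manual instructions for creating the
# dummy data per config via `_print_dummy_data_instructions`.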
def check_load_dataset(self, dataset_name, configs, is_local=False, use_local_dummy_data=False):
    for config in configs:
        with tempfile.TemporaryDirectory() as processed_temp_dir, tempfile.TemporaryDirectory() as raw_temp_dir:
            # create config and dataset
            dataset_builder_cls = self.load_builder_class(dataset_name, is_local=is_local)
            name = config.name if config is not None else None
            dataset_builder = dataset_builder_cls(name=name, cache_dir=processed_temp_dir)

            # TODO: skip Beam datasets and datasets that lack dummy data for now
            if not dataset_builder.test_dummy_data:
                logger.info("Skip tests for this dataset for now")
                return

            if config is not None:
                version = config.version
            else:
                version = dataset_builder.VERSION

            def check_if_url_is_valid(url):
                if is_remote_url(url) and "\\" in url:
                    raise ValueError(f"Bad remote url '{url}' since it contains a backslash")

            # create mock data loader manager that has a special download_and_extract() method
            # to download dummy data instead of real data
            mock_dl_manager = MockDownloadManager(
                dataset_name=dataset_name,
                config=config,
                version=version,
                cache_dir=raw_temp_dir,
                use_local_dummy_data=use_local_dummy_data,
                download_callbacks=[check_if_url_is_valid],
            )

            # packaged datasets like csv, text, json or pandas require some data files
            if dataset_builder.__class__.__name__.lower() in _PACKAGED_DATASETS_MODULES:
                mock_dl_manager.download_dummy_data()
                path_to_dummy_data = mock_dl_manager.dummy_file
                dataset_builder.config.data_files = get_packaged_dataset_dummy_data_files(
                    dataset_builder.__class__.__name__.lower(), path_to_dummy_data
                )

            # mock size needed for dummy data instead of actual dataset
            if dataset_builder.info is not None:
                # approximate upper bound of order of magnitude of dummy data files
                one_mega_byte = 2 << 19
                dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                dataset_builder.info.download_size = one_mega_byte
                dataset_builder.info.dataset_size = one_mega_byte

            # generate examples from dummy data
            dataset_builder.download_and_prepare(
                dl_manager=mock_dl_manager,
                download_mode=GenerateMode.FORCE_REDOWNLOAD,
                ignore_verifications=True,
                try_from_hf_gcs=False,
            )

            # get dataset
            dataset = dataset_builder.as_dataset(ignore_verifications=True)

            # check that dataset is not empty
            self.parent.assertListEqual(sorted(dataset_builder.info.splits.keys()), sorted(dataset))
            for split in dataset_builder.info.splits.keys():
                # check that loaded dataset is not empty
                self.parent.assertTrue(len(dataset[split]) > 0)
            del dataset
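# Hedged usage sketch (assumption, not from the source): the `self.parent.assert*` calls
# suggest this method lives on a helper object that holds a reference to a unittest
# TestCase. A wrapping test might call it roughly as below; `DatasetTester` and
# `load_all_configs` are illustrative names for that helper and its config discovery,
# not confirmed API.
#
#     from unittest import TestCase
#
#     class LocalDatasetTest(TestCase):
#         def setUp(self):
#             self.dataset_tester = DatasetTester(parent=self)  # hypothetical helper
#
#         def test_load_dataset_all_configs(self):
#             configs = self.dataset_tester.load_all_configs("my_dataset", is_local=True)
#             self.dataset_tester.check_load_dataset(
#                 "my_dataset", configs, is_local=True, use_local_dummy_data=True
#             )
#
# Keeping the assertions on a parent TestCase lets one helper serve many generated test
# classes (e.g. parameterized per dataset) without subclassing TestCase itself.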