def run(self):
    """Load a dataset script, build the requested config(s), and optionally
    copy the generated dataset infos file back next to the user's script.

    Exits with status 1 when `--name` and `--all_configs` are combined, or
    when the infos file cannot be relocated (remote dataset script).
    """
    if self._name is not None and self._all_configs:
        # `config` selects one config while `all_configs` requests every
        # config — the two flags are mutually exclusive.
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    # `module_hash` pins the exact version of the loaded dataset script.
    # (Renamed from `hash` to avoid shadowing the builtin; the keyword
    # argument passed to the builder must stay `hash`.)
    module_path, module_hash = prepare_module(path)
    builder_cls = import_main_class(module_path)
    builders: List[DatasetBuilder] = []
    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        # One builder per declared config.
        for config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(name=config.name, hash=module_hash, cache_dir=self._cache_dir, data_dir=self._data_dir)
            )
    else:
        # Single builder for the requested (or default, when None) config.
        builders.append(builder_cls(name=name, hash=module_hash, cache_dir=self._cache_dir, data_dir=self._data_dir))
    for builder in builders:
        builder.download_and_prepare(
            download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
            save_infos=self._save_infos,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
    print("Test successful.")
    # If save_infos=True, the dataset infos file is created next to the loaded module file.
    # Let's move it to the original directory of the dataset script, to allow the user to
    # upload them on S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
        # Derive "<dataset>.py" from the last non-empty "/"-separated path
        # component (dataset scripts use "/" even on Windows).
        script_name = list(filter(lambda x: x, path.split("/")))[-1] + ".py"
        combined_path = os.path.join(path, script_name)
        if os.path.isfile(path):
            # `path` points directly at the script file.
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            # `path` is the directory containing "<dataset>.py".
            dataset_dir = path
        else:  # in case of a remote dataset
            # The script does not live on local disk, so the infos file stays
            # in the cached module dir; report its location and stop.
            print("Dataset Infos file saved at {}".format(dataset_infos_path))
            exit(1)
        # Move dataset infos back to the user's dataset directory.
        user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print("Dataset Infos file saved at {}".format(user_dataset_infos_path))
def test_dataset_info_available(self, dataset, config_name):
    """Check that the dataset info file for (dataset, config_name) can be
    downloaded from the HF GCS mirror."""
    with TemporaryDirectory() as tmp_dir:
        # Resolve the local dataset script without hitting the network.
        local_module_path, local_hash = prepare_module(
            os.path.join("datasets", dataset), dataset=True, cache_dir=tmp_dir, local_files_only=True
        )
        builder_cls = import_main_class(local_module_path, dataset=True)
        builder_instance: DatasetBuilder = builder_cls(
            cache_dir=tmp_dir,
            name=config_name,
            hash=local_hash,
        )
        # NOTE(review): os.path.join builds a URL here — fine on POSIX where
        # the separator is "/", but posixpath.join would be more portable.
        dataset_info_url = os.path.join(
            HF_GCP_BASE_URL, builder_instance._relative_data_dir(with_hash=False), DATASET_INFO_FILENAME
        )
        # Fixed typo: `datset_info_path` -> `dataset_info_path`.
        dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
        self.assertTrue(os.path.exists(dataset_info_path))
def run(self):
    """Create the dummy-data folder skeleton for every config of a dataset
    script and print step-by-step instructions for filling it in.

    A MockDownloadManager stands in for real downloads so that running
    `_split_generators` / `_generate_examples` raises FileNotFoundError for
    each dummy file the user still has to create; those filenames drive the
    printed guidance.
    """
    # Renamed `hash` -> `module_hash` to avoid shadowing the builtin.
    module_path, module_hash = prepare_module(self._path_to_dataset)
    builder_cls = import_main_class(module_path)
    # use `None` as config if no configs
    configs = builder_cls.BUILDER_CONFIGS or [None]
    for config in configs:
        if config is None:
            name = None
            version = builder_cls.VERSION
        else:
            version = config.version
            name = config.name
        dataset_builder = builder_cls(name=name, hash=module_hash)
        mock_dl_manager = MockDownloadManager(
            dataset_name=self._dataset_name, config=config, version=version, is_local=True
        )
        dummy_data_folder = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
        logger.info(f"Creating dummy folder structure for {dummy_data_folder}... ")
        os.makedirs(dummy_data_folder, exist_ok=True)
        try:
            generator_splits = dataset_builder._split_generators(mock_dl_manager)
        except FileNotFoundError as e:
            print(
                f"Dataset {self._dataset_name} with config {config} seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file {e.filename}."
            )
            # BUGFIX: without the splits no guidance can be generated for this
            # config; previously execution fell through and crashed on the
            # unbound `generator_splits`.
            continue

        files_to_create = set()
        split_names = []
        dummy_file_name = mock_dl_manager.dummy_file_name

        for split in generator_splits:
            logger.info(f"Collecting dummy data file paths to create for {split.name}")
            split_names.append(split.name)
            gen_kwargs = split.gen_kwargs
            generator = dataset_builder._generate_examples(**gen_kwargs)

            try:
                dummy_data_guidance_print = "\n" + 30 * "=" + "DUMMY DATA INSTRUCTIONS" + 30 * "=" + "\n"
                config_string = f"config {config.name} of " if config is not None else ""
                dummy_data_guidance_print += (
                    "- In order to create the dummy data for "
                    + config_string
                    + f"{self._dataset_name}, please go into the folder '{dummy_data_folder}' with `cd {dummy_data_folder}` . \n\n"
                )

                # trigger generate function
                for key, record in generator:
                    pass

                # No FileNotFoundError was raised, so the generator most
                # likely discovers its files via glob rather than open().
                dummy_data_guidance_print += f"- It appears that the function `_generate_examples(...)` expects one or more files in the folder {dummy_file_name} using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. \n\n"
            except FileNotFoundError as e:
                # Each missing file is one dummy file the user must create.
                files_to_create.add(e.filename)

        split_names = ", ".join(split_names)
        # BUGFIX: `files_string` was previously unbound (NameError) in the
        # glob case (`files_to_create` empty) although it is referenced in the
        # "multiple `open()` statements" message below.
        files_string = dummy_file_name
        if len(files_to_create) > 0:
            # no glob.glob(...) in `_generate_examples(...)`
            if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
                dummy_data_guidance_print += f"- Please create a single dummy data file called '{next(iter(files_to_create))}' from the folder '{dummy_data_folder}'. Make sure that the dummy data file provides at least one example for the split(s) '{split_names}' \n\n"
                files_string = dummy_file_name
            else:
                files_string = ", ".join(files_to_create)
                dummy_data_guidance_print += f"- Please create the following dummy data files '{files_string}' from the folder '{dummy_data_folder}'\n\n"
                dummy_data_guidance_print += f"- For each of the splits '{split_names}', make sure that one or more of the dummy data files provide at least one example \n\n"

        dummy_data_guidance_print += f"- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to '{files_string}'. In this case please refer to the `_generate_examples(...)` method \n\n"

        if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
            # Single-file case: zip the one dummy file.
            dummy_data_guidance_print += f"-After the dummy data file is created, it should be zipped to '{dummy_file_name}.zip' with the command `zip {dummy_file_name}.zip {dummy_file_name}` \n\n"
            dummy_data_guidance_print += (
                f"-You can now delete the file '{dummy_file_name}' with the command `rm {dummy_file_name}` \n\n"
            )
            dummy_data_guidance_print += f"- To get the file '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"
        else:
            # Multi-file case: zip the whole dummy folder recursively.
            dummy_data_guidance_print += f"-After all dummy data files are created, they should be zipped recursively to '{dummy_file_name}.zip' with the command `zip -r {dummy_file_name}.zip {dummy_file_name}/` \n\n"
            dummy_data_guidance_print += (
                f"-You can now delete the folder '{dummy_file_name}' with the command `rm -r {dummy_file_name}` \n\n"
            )
            dummy_data_guidance_print += f"- To get the folder '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"

        dummy_data_guidance_print += (
            f"- Make sure you have created the file '{dummy_file_name}.zip' in '{dummy_data_folder}' \n"
        )

        dummy_data_guidance_print += 83 * "=" + "\n"

        print(dummy_data_guidance_print)