def run(self):
    fl_logger().setLevel(ERROR)
    if self._name is not None and self._all_configs:
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module_path, hash = prepare_module(path)
    builder_cls = import_main_class(module_path)

    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
        n_builders += (len(builder_cls.BUILDER_CONFIGS) % self._num_proc) > self._proc_rank
    else:
        n_builders = 1 if self._proc_rank == 0 else 0

    def get_builders() -> Generator[DatasetBuilder, None, None]:
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                if i % self._num_proc == self._proc_rank:
                    yield builder_cls(name=config.name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir)
        else:
            if self._proc_rank == 0:
                yield builder_cls(name=name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir)

    for j, builder in enumerate(get_builders()):
        print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
        builder.download_and_prepare(
            download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        builder.as_dataset()
        if self._save_infos:
            builder._save_infos()

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
            name = Path(path).name + ".py"
            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                dataset_dir = None
                print("Dataset Infos file saved at {}".format(dataset_infos_path))

            # Move dataset_info back to the user
            if dataset_dir is not None:
                user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
                copyfile(dataset_infos_path, user_dataset_infos_path)
                print("Dataset Infos file saved at {}".format(user_dataset_infos_path))

        # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
        if self._clear_cache:
            if os.path.isdir(builder._cache_dir):
                logger.warning(f"Clearing cache at {builder._cache_dir}")
                rmtree(builder._cache_dir)
            download_dir = os.path.join(self._cache_dir, "downloads")
            if os.path.isdir(download_dir):
                logger.warning(f"Clearing cache at {download_dir}")
                rmtree(download_dir)

    print("Test successful.")
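# A minimal, self-contained sketch (not part of the original command) of the
# round-robin sharding that get_builders() above applies when --num_proc > 1:
# worker `proc_rank` tests every config whose index i satisfies
# i % num_proc == proc_rank, and n_builders is the per-rank count computed
# the same way as in run(). The config names below are placeholders.
configs = ["en", "fr", "de", "es", "it"]
num_proc = 2

for proc_rank in range(num_proc):
    shard = [c for i, c in enumerate(configs) if i % num_proc == proc_rank]
    n_builders = len(configs) // num_proc + ((len(configs) % num_proc) > proc_rank)
    # e.g. rank 0 gets ["en", "de", "it"] (3 builders), rank 1 gets ["fr", "es"] (2 builders)
    print(f"rank {proc_rank}: {shard} (n_builders={n_builders})")
    assert len(shard) == n_builders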
def run(self):
    fl_logger().setLevel(ERROR)
    if self._name is not None and self._all_configs:
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module = dataset_module_factory(path)
    builder_cls = import_main_class(module.module_path)
    n_builders = len(builder_cls.BUILDER_CONFIGS) if self._all_configs and builder_cls.BUILDER_CONFIGS else 1

    def get_builders() -> Generator[DatasetBuilder, None, None]:
        if self._all_configs and builder_cls.BUILDER_CONFIGS:
            for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                if "name" in module.builder_kwargs:
                    yield builder_cls(
                        cache_dir=self._cache_dir,
                        data_dir=self._data_dir,
                        **module.builder_kwargs,
                    )
                else:
                    yield builder_cls(
                        name=config.name,
                        cache_dir=self._cache_dir,
                        data_dir=self._data_dir,
                        **module.builder_kwargs,
                    )
        else:
            if "name" in module.builder_kwargs:
                yield builder_cls(cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs)
            else:
                yield builder_cls(name=name, cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs)

    for j, builder in enumerate(get_builders()):
        print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
        builder._record_infos = True
        builder.download_and_prepare(
            download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS if not self._force_redownload else DownloadMode.FORCE_REDOWNLOAD,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        builder.as_dataset()
        if self._save_infos:
            builder._save_infos()

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), datasets.config.DATASETDICT_INFOS_FILENAME)
            name = Path(path).name + ".py"
            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            elif os.path.isdir(path):  # for local directories containing only data files
                dataset_dir = path
            else:  # in case of a remote dataset
                dataset_dir = None
                print(f"Dataset Infos file saved at {dataset_infos_path}")

            # Move dataset_info back to the user
            if dataset_dir is not None:
                user_dataset_infos_path = os.path.join(dataset_dir, datasets.config.DATASETDICT_INFOS_FILENAME)
                copyfile(dataset_infos_path, user_dataset_infos_path)
                print(f"Dataset Infos file saved at {user_dataset_infos_path}")

        # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
        if self._clear_cache:
            if os.path.isdir(builder._cache_dir):
                logger.warning(f"Clearing cache at {builder._cache_dir}")
                rmtree(builder._cache_dir)
            download_dir = os.path.join(self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
            if os.path.isdir(download_dir):
                logger.warning(f"Clearing cache at {download_dir}")
                rmtree(download_dir)

    print("Test successful.")
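# A rough sketch (an assumption, not part of the original command) of what the
# per-builder loop above boils down to for a single configuration, using the
# public `datasets` API instead of the command's internal module resolution.
# The dataset path and config name are placeholders.
from datasets import DownloadMode, load_dataset_builder

builder = load_dataset_builder("path/to/my_dataset")  # or a Hub dataset name, optionally with a config name
builder.download_and_prepare(download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS)
dataset_dict = builder.as_dataset()  # materializes every prepared split
print(dataset_dict)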