Example #1
 def load_builder_class(self, dataset_name, is_local=False):
     # Download/copy dataset script
     if is_local:
         dataset_module = dataset_module_factory(
             os.path.join("datasets", dataset_name))
     else:
         dataset_module = dataset_module_factory(
             dataset_name,
             download_config=DownloadConfig(force_download=True))
     # Get dataset builder class
     builder_cls = import_main_class(dataset_module.module_path)
     return builder_cls
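
A minimal usage sketch for the helper above. The `DatasetScriptHelper` host class and the "squad" dataset name are illustrative assumptions, not part of the original:

    # Hypothetical class holding the method from Example #1; only
    # `load_builder_class` is assumed to behave as defined above.
    helper = DatasetScriptHelper()
    builder_cls = helper.load_builder_class("squad", is_local=False)
    # Every builder class exposes BUILDER_CONFIGS (possibly an empty list)
    for cfg in builder_cls.BUILDER_CONFIGS:
        print(cfg.name, cfg.version)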
Example #2
 def test_load_real_dataset(self, dataset_name):
     path = "./datasets/" + dataset_name
     dataset_module = dataset_module_factory(
         path, download_config=DownloadConfig(local_files_only=True))
     builder_cls = import_main_class(dataset_module.module_path)
     name = (builder_cls.BUILDER_CONFIGS[0].name
             if builder_cls.BUILDER_CONFIGS else None)
     with tempfile.TemporaryDirectory() as temp_cache_dir:
         dataset = load_dataset(path,
                                name=name,
                                cache_dir=temp_cache_dir,
                                download_mode=GenerateMode.FORCE_REDOWNLOAD)
         for split in dataset.keys():
             self.assertTrue(len(dataset[split]) > 0)
         del dataset
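
Note that this example predates the rename of `GenerateMode` to `DownloadMode` in `datasets` (compare Example #4, which uses the newer name). A hedged compatibility import, assuming only the enum name changed between releases and that both names are exported at the package top level:

    try:
        from datasets import DownloadMode as GenerateMode  # newer releases
    except ImportError:
        from datasets import GenerateMode  # older releases, before the rename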
Example #3
    def run(self):
        set_verbosity_warning()
        dataset_module = dataset_module_factory(self._path_to_dataset)
        builder_cls = import_main_class(dataset_module.module_path)

        # use `None` as config if no configs
        builder_configs = builder_cls.BUILDER_CONFIGS or [None]
        auto_generate_results = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            for builder_config in builder_configs:
                if builder_config is None:
                    name = None
                    version = builder_cls.VERSION
                else:
                    version = builder_config.version
                    name = builder_config.name

                dataset_builder = builder_cls(name=name,
                                              hash=dataset_module.hash,
                                              cache_dir=tmp_dir)
                mock_dl_manager = MockDownloadManager(
                    dataset_name=self._dataset_name,
                    config=builder_config,
                    version=version,
                    use_local_dummy_data=True,
                    load_existing_dummy_data=False,
                )

                if self._auto_generate:
                    auto_generate_results.append(
                        self._autogenerate_dummy_data(
                            dataset_builder=dataset_builder,
                            mock_dl_manager=mock_dl_manager,
                            keep_uncompressed=self._keep_uncompressed,
                        ))
                else:
                    self._print_dummy_data_instructions(
                        dataset_builder=dataset_builder,
                        mock_dl_manager=mock_dl_manager)
            if self._auto_generate and not self._keep_uncompressed:
                if all(auto_generate_results):
                    print(
                        f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'"
                    )
                else:
                    print(
                        f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'"
                    )
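
The `BUILDER_CONFIGS or [None]` fallback at the top of this `run` method guarantees exactly one pass for datasets that define no configs. A standalone, runnable illustration of just that idiom (the `Cfg` class and config names are made up):

    class Cfg:
        def __init__(self, name, version):
            self.name, self.version = name, version

    for configs in ([], [Cfg("en", "1.0.0"), Cfg("de", "1.0.0")]):
        for cfg in (configs or [None]):
            name = None if cfg is None else cfg.name
            print(name)  # prints None for the empty list, then "en", "de"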
Example #4
 def test_load_real_dataset_all_configs(self, dataset_name):
     path = "./datasets/" + dataset_name
     dataset_module = dataset_module_factory(
         path, download_config=DownloadConfig(local_files_only=True))
     builder_cls = import_main_class(dataset_module.module_path)
     config_names = ([config.name for config in builder_cls.BUILDER_CONFIGS]
                     if builder_cls.BUILDER_CONFIGS else [None])
     for name in config_names:
         with tempfile.TemporaryDirectory() as temp_cache_dir:
             dataset = load_dataset(
                 path,
                 name=name,
                 cache_dir=temp_cache_dir,
                 download_mode=DownloadMode.FORCE_REDOWNLOAD)
             for split in dataset.keys():
                 self.assertTrue(len(dataset[split]) > 0)
             del dataset
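
Both test methods take `dataset_name` as a parameter. One hedged way to feed them every script under ./datasets is the third-party `parameterized` package; this is an assumption about the harness, not shown in the original:

    import os
    from unittest import TestCase
    from parameterized import parameterized

    def local_dataset_names():
        # Treat every subdirectory of ./datasets as one dataset script;
        # assumes ./datasets exists when the module is imported.
        return [d for d in os.listdir("./datasets")
                if os.path.isdir(os.path.join("./datasets", d))]

    class LocalDatasetTest(TestCase):
        @parameterized.expand(local_dataset_names())
        def test_load_real_dataset_all_configs(self, dataset_name):
            ...  # body as in Example #4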
Example #5
    def test_dataset_info_available(self, dataset, config_name):

        with TemporaryDirectory() as tmp_dir:
            dataset_module = dataset_module_factory(
                os.path.join("datasets", dataset),
                cache_dir=tmp_dir,
                local_files_only=True)

            builder_cls = import_main_class(dataset_module.module_path,
                                            dataset=True)

            builder_instance: DatasetBuilder = builder_cls(
                cache_dir=tmp_dir,
                name=config_name,
                hash=dataset_module.hash,
            )

            dataset_info_url = os.path.join(
                HF_GCP_BASE_URL,
                builder_instance._relative_data_dir(with_hash=False),
                config.DATASET_INFO_FILENAME).replace(os.sep, "/")
            dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
            self.assertTrue(os.path.exists(dataset_info_path))
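
The trailing `.replace(os.sep, "/")` above matters on Windows, where `os.path.join` inserts backslashes that would corrupt the URL. An isolated, runnable illustration (the base URL is made up):

    import os

    url = os.path.join("https://example.com/base", "cfg",
                       "dataset_info.json").replace(os.sep, "/")
    # On POSIX the replace is a no-op; on Windows it turns the "\\"
    # separators back into "/" so the string stays a valid URL.
    print(url)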
Example #6
    def run(self):
        fl_logger().setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module = dataset_module_factory(path)
        builder_cls = import_main_class(module.module_path)

        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
            n_builders += (len(builder_cls.BUILDER_CONFIGS) %
                           self._num_proc) > self._proc_rank
        else:
            n_builders = 1 if self._proc_rank == 0 else 0

        def get_builders() -> Generator[DatasetBuilder, None, None]:
            if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if i % self._num_proc == self._proc_rank:
                        if "name" in module.builder_kwargs:
                            yield builder_cls(
                                cache_dir=self._cache_dir,
                                data_dir=self._data_dir,
                                **module.builder_kwargs,
                            )
                        else:
                            yield builder_cls(
                                name=config.name,
                                cache_dir=self._cache_dir,
                                data_dir=self._data_dir,
                                **module.builder_kwargs,
                            )
            else:
                if self._proc_rank == 0:
                    if "name" in module.builder_kwargs:
                        yield builder_cls(cache_dir=self._cache_dir,
                                          data_dir=self._data_dir,
                                          **module.builder_kwargs)
                    else:
                        yield builder_cls(name=name,
                                          cache_dir=self._cache_dir,
                                          data_dir=self._data_dir,
                                          **module.builder_kwargs)

        for j, builder in enumerate(get_builders()):
            print(
                f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})"
            )
            builder._record_infos = True
            builder.download_and_prepare(
                download_mode=(DownloadMode.FORCE_REDOWNLOAD
                               if self._force_redownload
                               else DownloadMode.REUSE_CACHE_IF_EXISTS),
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

            # If save_infos=True, the dataset infos file is created next to the loaded module file.
            # Let's move it to the original directory of the dataset script, to allow the user to
            # upload them on S3 at the same time afterwards.
            if self._save_infos:
                dataset_infos_path = os.path.join(
                    builder_cls.get_imported_module_dir(),
                    datasets.config.DATASETDICT_INFOS_FILENAME)
                name = Path(path).name + ".py"
                combined_path = os.path.join(path, name)
                if os.path.isfile(path):
                    dataset_dir = os.path.dirname(path)
                elif os.path.isfile(combined_path):
                    dataset_dir = path
                elif os.path.isdir(path):  # for local directories containing only data files
                    dataset_dir = path
                else:  # in case of a remote dataset
                    dataset_dir = None
                    print(f"Dataset Infos file saved at {dataset_infos_path}")

                # Move dataset_info back to the user
                if dataset_dir is not None:
                    user_dataset_infos_path = os.path.join(
                        dataset_dir,
                        datasets.config.DATASETDICT_INFOS_FILENAME)
                    copyfile(dataset_infos_path, user_dataset_infos_path)
                    print(
                        f"Dataset Infos file saved at {user_dataset_infos_path}"
                    )

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(
                    self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")
Example #7
    def run(self):
        import apache_beam as beam

        if self._name is not None and self._all_configs:
            print(
                "Both parameters `name` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        dataset_module = dataset_module_factory(path)
        builder_cls = import_main_class(dataset_module.module_path)
        builders: List[DatasetBuilder] = []
        if self._beam_pipeline_options:
            beam_options = beam.options.pipeline_options.PipelineOptions(
                flags=[
                    f"--{opt.strip()}"
                    for opt in self._beam_pipeline_options.split(",") if opt
                ])
        else:
            beam_options = None
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for builder_config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(
                        name=builder_config.name,
                        data_dir=self._data_dir,
                        hash=dataset_module.hash,
                        beam_options=beam_options,
                        cache_dir=self._cache_dir,
                        base_path=dataset_module.builder_kwargs.get(
                            "base_path"),
                        namespace=dataset_module.builder_kwargs.get(
                            "namespace"),
                    ))
        else:
            builders.append(
                builder_cls(
                    name=name,
                    data_dir=self._data_dir,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                    base_path=dataset_module.builder_kwargs.get("base_path"),
                    namespace=dataset_module.builder_kwargs.get("namespace"),
                ))

        for builder in builders:
            builder.download_and_prepare(
                download_mode=(DownloadMode.FORCE_REDOWNLOAD
                               if self._force_redownload
                               else DownloadMode.REUSE_CACHE_IF_EXISTS),
                download_config=DownloadConfig(
                    cache_dir=config.DOWNLOADED_DATASETS_PATH),
                save_infos=self._save_infos,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )

        print("Apache beam run successful.")

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(),
                config.DATASETDICT_INFOS_FILENAME)

            name = Path(path).name + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print(f"Dataset Infos file saved at {dataset_infos_path}")
                exit(1)

            # Move dataset_info back to the user
            user_dataset_infos_path = os.path.join(
                dataset_dir, config.DATASETDICT_INFOS_FILENAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print(f"Dataset Infos file saved at {user_dataset_infos_path}")