Example #1
    def run(self):
        set_verbosity_warning()
        module_path, hash = prepare_module(self._path_to_dataset)
        builder_cls = import_main_class(module_path)

        # use `None` as config if no configs
        configs = builder_cls.BUILDER_CONFIGS or [None]
        auto_generate_results = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            for config in configs:
                if config is None:
                    name = None
                    version = builder_cls.VERSION
                else:
                    version = config.version
                    name = config.name

                dataset_builder = builder_cls(name=name,
                                              hash=hash,
                                              cache_dir=tmp_dir)
                mock_dl_manager = MockDownloadManager(
                    dataset_name=self._dataset_name,
                    config=config,
                    version=version,
                    is_local=True,
                    load_existing_dummy_data=False,
                )

                if self._auto_generate:
                    auto_generate_results.append(
                        self._autogenerate_dummy_data(
                            dataset_builder=dataset_builder,
                            mock_dl_manager=mock_dl_manager,
                            keep_uncompressed=self._keep_uncompressed,
                        ))
                else:
                    self._print_dummy_data_instructions(
                        dataset_builder=dataset_builder,
                        mock_dl_manager=mock_dl_manager)
            if self._auto_generate and not self._keep_uncompressed:
                if all(auto_generate_results):
                    print(
                        f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'"
                    )
                else:
                    print(
                        f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'"
                    )
Example #2
def get_compatible_task_template(task: str, dataset: str, config: str = None):
    module, module_hash = prepare_module(dataset)
    builder_cls = import_main_class(module)
    builder = builder_cls(hash=module_hash, name=config)
    templates = builder.info.task_templates
    if templates:
        compatible_templates = [template for template in templates if template.task == task]
        if not compatible_templates:
            raise ValueError(f"❌ Task `{task}` is not compatible with dataset `{dataset}`!")
        if len(compatible_templates) > 1:
            raise ValueError(
                f"❌ Expected 1 task template but found {len(compatible_templates)}! Please ensure that `datasets.DatasetInfo.task_templates` contains a unique set of task types."
            )
        return compatible_templates[0]
    else:
        return None
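
A quick usage sketch for the helper above. The dataset name "emotion" and the task string "text-classification" are illustrative assumptions rather than values from the example, and the call assumes the dataset script declares task templates in its DatasetInfo.

# Hypothetical usage of get_compatible_task_template; dataset and task names are assumptions.
template = get_compatible_task_template(task="text-classification", dataset="emotion")
if template is None:
    print("This dataset declares no task templates.")
else:
    print(f"Matched task template: {template.task}")
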
Example #3
    def test_load_real_dataset_all_configs(self, dataset_name):
        path = "./datasets/" + dataset_name
        dataset_module = dataset_module_factory(
            path, download_config=DownloadConfig(local_files_only=True))
        builder_cls = import_main_class(dataset_module.module_path)
        config_names = ([
            config.name for config in builder_cls.BUILDER_CONFIGS
        ] if len(builder_cls.BUILDER_CONFIGS) > 0 else [None])
        for name in config_names:
            with tempfile.TemporaryDirectory() as temp_cache_dir:
                dataset = load_dataset(
                    path,
                    name=name,
                    cache_dir=temp_cache_dir,
                    download_mode=GenerateMode.FORCE_REDOWNLOAD)
                for split in dataset.keys():
                    self.assertTrue(len(dataset[split]) > 0)
                del dataset
Example #4
    def test_dataset_info_available(self, dataset, config_name):

        with TemporaryDirectory() as tmp_dir:
            local_module_path, local_hash = prepare_module(
                os.path.join("datasets", dataset), dataset=True, cache_dir=tmp_dir, local_files_only=True
            )

            builder_cls = import_main_class(local_module_path, dataset=True)

            builder_instance: DatasetBuilder = builder_cls(
                cache_dir=tmp_dir,
                name=config_name,
                hash=local_hash,
            )

            dataset_info_url = os.path.join(
                HF_GCP_BASE_URL, builder_instance._relative_data_dir(with_hash=False), DATASET_INFO_FILENAME
            ).replace(os.sep, "/")
            dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
            self.assertTrue(os.path.exists(dataset_info_path))
Example #5
File: test.py, Project: yngtodd/datasets
    def run(self):
        fl_logger().setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)

        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
            n_builders += (len(builder_cls.BUILDER_CONFIGS) %
                           self._num_proc) > self._proc_rank
        else:
            n_builders = 1 if self._proc_rank == 0 else 0

        def get_builders() -> Generator[DatasetBuilder, None, None]:
            if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if i % self._num_proc == self._proc_rank:
                        yield builder_cls(name=config.name,
                                          hash=hash,
                                          cache_dir=self._cache_dir,
                                          data_dir=self._data_dir)
            else:
                if self._proc_rank == 0:
                    yield builder_cls(name=name,
                                      hash=hash,
                                      cache_dir=self._cache_dir,
                                      data_dir=self._data_dir)

        for j, builder in enumerate(get_builders()):
            print(
                f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})"
            )
            builder.download_and_prepare(
                download_mode=REUSE_CACHE_IF_EXISTS
                if not self._force_redownload else FORCE_REDOWNLOAD,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

            # If save_infos=True, the dataset infos file is created next to the loaded module file.
            # Let's move it to the original directory of the dataset script, to allow the user to
            # upload them on S3 at the same time afterwards.
            if self._save_infos:
                dataset_infos_path = os.path.join(
                    builder_cls.get_imported_module_dir(),
                    DATASET_INFOS_DICT_FILE_NAME)
                name = Path(path).name + ".py"
                combined_path = os.path.join(path, name)
                if os.path.isfile(path):
                    dataset_dir = os.path.dirname(path)
                elif os.path.isfile(combined_path):
                    dataset_dir = path
                else:  # in case of a remote dataset
                    dataset_dir = None
                    print("Dataset Infos file saved at {}".format(
                        dataset_infos_path))

                # Move dataset_info back to the user
                if dataset_dir is not None:
                    user_dataset_infos_path = os.path.join(
                        dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
                    copyfile(dataset_infos_path, user_dataset_infos_path)
                    print("Dataset Infos file saved at {}".format(
                        user_dataset_infos_path))

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(self._cache_dir, "downloads")
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")
Example #6
    def run(self):
        import apache_beam as beam

        if self._name is not None and self._all_configs:
            print("Both parameters `name` and `all_configs` can't be used at once.")
            exit(1)
        path, name = self._dataset, self._name
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)
        builders: List[DatasetBuilder] = []
        if self._beam_pipeline_options:
            beam_options = beam.options.pipeline_options.PipelineOptions(
                flags=["--%s" % opt.strip() for opt in self._beam_pipeline_options.split(",") if opt]
            )
        else:
            beam_options = None
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(
                        name=config.name,
                        data_dir=self._data_dir,
                        hash=hash,
                        beam_options=beam_options,
                        cache_dir=self._cache_dir,
                    )
                )
        else:
            builders.append(
                builder_cls(name=name, data_dir=self._data_dir, hash=hash, beam_options=beam_options, cache_dir=self._cache_dir)
            )

        for builder in builders:
            builder.download_and_prepare(
                download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
                download_config=DownloadConfig(cache_dir=os.path.join(HF_DATASETS_CACHE, "downloads")),
                save_infos=self._save_infos,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )

        print("Apache beam run successful.")

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)

            name = list(filter(lambda x: x, path.split("/")))[-1] + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print("Dataset Infos file saved at {}".format(dataset_infos_path))
                exit(1)

            # Move the dataset infos file back to the user
            user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print("Dataset Infos file saved at {}".format(user_dataset_infos_path))
Example #7
def get_dataset_splits(dataset: str, config: str = None):
    module, module_hash = prepare_module(dataset)
    builder_cls = import_main_class(module)
    builder = builder_cls(hash=module_hash, name=config)
    splits = builder.info.splits.keys()
    return list(splits)
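
A minimal usage sketch for get_dataset_splits. The dataset name "squad" is an assumption, and the call relies on the dataset script shipping a prepopulated dataset_infos.json so that builder.info.splits is available without downloading anything.

# Hypothetical call; the dataset name is an assumption.
print(get_dataset_splits("squad"))  # e.g. ['train', 'validation'], depending on the dataset
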
Example #8
    def run(self):
        import apache_beam as beam

        if self._name is not None and self._all_configs:
            print(
                "Both parameters `name` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        dataset_module = dataset_module_factory(path)
        builder_cls = import_main_class(dataset_module.module_path)
        builders: List[DatasetBuilder] = []
        if self._beam_pipeline_options:
            beam_options = beam.options.pipeline_options.PipelineOptions(
                flags=[
                    f"--{opt.strip()}"
                    for opt in self._beam_pipeline_options.split(",") if opt
                ])
        else:
            beam_options = None
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for builder_config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(
                        name=builder_config.name,
                        data_dir=self._data_dir,
                        hash=dataset_module.hash,
                        beam_options=beam_options,
                        cache_dir=self._cache_dir,
                        base_path=dataset_module.builder_kwargs.get(
                            "base_path"),
                        namespace=dataset_module.builder_kwargs.get(
                            "namespace"),
                    ))
        else:
            builders.append(
                builder_cls(
                    name=name,
                    data_dir=self._data_dir,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                    base_path=dataset_module.builder_kwargs.get("base_path"),
                    namespace=dataset_module.builder_kwargs.get("namespace"),
                ))

        for builder in builders:
            builder.download_and_prepare(
                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS if
                not self._force_redownload else DownloadMode.FORCE_REDOWNLOAD,
                download_config=DownloadConfig(
                    cache_dir=config.DOWNLOADED_DATASETS_PATH),
                save_infos=self._save_infos,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )

        print("Apache beam run successful.")

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(),
                config.DATASETDICT_INFOS_FILENAME)

            name = Path(path).name + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print(f"Dataset Infos file saved at {dataset_infos_path}")
                exit(1)

            # Move the dataset infos file back to the user
            user_dataset_infos_path = os.path.join(
                dataset_dir, config.DATASETDICT_INFOS_FILENAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print(f"Dataset Infos file saved at {user_dataset_infos_path}")
Example #9
    def run(self):
        module_path, hash = prepare_module(self._path_to_dataset)
        builder_cls = import_main_class(module_path)

        # use `None` as config if no configs
        configs = builder_cls.BUILDER_CONFIGS or [None]

        for config in configs:
            if config is None:
                name = None
                version = builder_cls.VERSION
            else:
                version = config.version
                name = config.name

            dataset_builder = builder_cls(name=name, hash=hash)
            mock_dl_manager = MockDownloadManager(
                dataset_name=self._dataset_name,
                config=config,
                version=version,
                is_local=True)

            dummy_data_folder = os.path.join(self._path_to_dataset,
                                             mock_dl_manager.dummy_data_folder)
            logger.info(
                f"Creating dummy folder structure for {dummy_data_folder}... ")
            os.makedirs(dummy_data_folder, exist_ok=True)

            try:
                generator_splits = dataset_builder._split_generators(
                    mock_dl_manager)
            except FileNotFoundError as e:

                print(
                    f"Dataset {self._dataset_name} with config {config} seems to already open files in the method `_split_generators(...)`. You might consider opening files only in the method `_generate_examples(...)` instead. If this is not possible, the dummy data has to be created with less guidance. Make sure you create the file {e.filename}."
                )

            files_to_create = set()
            split_names = []
            dummy_file_name = mock_dl_manager.dummy_file_name

            for split in generator_splits:
                logger.info(
                    f"Collecting dummy data file paths to create for {split.name}"
                )
                split_names.append(split.name)
                gen_kwargs = split.gen_kwargs
                generator = dataset_builder._generate_examples(**gen_kwargs)

                try:
                    dummy_data_guidance_print = "\n" + 30 * "=" + "DUMMY DATA INSTRUCTIONS" + 30 * "=" + "\n"
                    config_string = f"config {config.name} of " if config is not None else ""
                    dummy_data_guidance_print += (
                        "- In order to create the dummy data for " +
                        config_string +
                        f"{self._dataset_name}, please go into the folder '{dummy_data_folder}' with `cd {dummy_data_folder}` . \n\n"
                    )

                    # trigger generate function
                    for key, record in generator:
                        pass

                    dummy_data_guidance_print += f"- It appears that the function `_generate_examples(...)` expects one or more files in the folder {dummy_file_name} using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. \n\n"

                except FileNotFoundError as e:
                    files_to_create.add(e.filename)

            split_names = ", ".join(split_names)
            if len(files_to_create) > 0:
                # no glob.glob(...) in `_generate_examples(...)`
                if len(files_to_create) == 1 and next(
                        iter(files_to_create)) == dummy_file_name:
                    dummy_data_guidance_print += f"- Please create a single dummy data file called '{next(iter(files_to_create))}' from the folder '{dummy_data_folder}'. Make sure that the dummy data file provides at least one example for the split(s) '{split_names}' \n\n"
                    files_string = dummy_file_name
                else:
                    files_string = ", ".join(files_to_create)
                    dummy_data_guidance_print += f"- Please create the following dummy data files '{files_string}' from the folder '{dummy_data_folder}'\n\n"

                    dummy_data_guidance_print += f"- For each of the splits '{split_names}', make sure that one or more of the dummy data files provide at least one example \n\n"

                dummy_data_guidance_print += f"- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to '{files_string}'. In this case please refer to the `_generate_examples(...)` method \n\n"

            if len(files_to_create) == 1 and next(
                    iter(files_to_create)) == dummy_file_name:
                dummy_data_guidance_print += f"-After the dummy data file is created, it should be zipped to '{dummy_file_name}.zip' with the command `zip {dummy_file_name}.zip {dummy_file_name}` \n\n"

                dummy_data_guidance_print += (
                    f"-You can now delete the file '{dummy_file_name}' with the command `rm {dummy_file_name}` \n\n"
                )

                dummy_data_guidance_print += f"- To get the file '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"
            else:
                dummy_data_guidance_print += f"-After all dummy data files are created, they should be zipped recursively to '{dummy_file_name}.zip' with the command `zip -r {dummy_file_name}.zip {dummy_file_name}/` \n\n"

                dummy_data_guidance_print += f"-You can now delete the folder '{dummy_file_name}' with the command `rm -r {dummy_file_name}` \n\n"

                dummy_data_guidance_print += f"- To get the folder '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"

            dummy_data_guidance_print += (
                f"- Make sure you have created the file '{dummy_file_name}.zip' in '{dummy_data_folder}' \n"
            )

            dummy_data_guidance_print += 83 * "=" + "\n"

            print(dummy_data_guidance_print)
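
The guidance printed above asks the user to compress the dummy data with the zip command. For reference, a minimal Python equivalent of `zip dummy_data.zip dummy_data`, assuming the default single dummy file named "dummy_data":

import zipfile

# Python equivalent of `zip dummy_data.zip dummy_data`; the file name is an assumption.
with zipfile.ZipFile("dummy_data.zip", "w") as zf:
    zf.write("dummy_data")
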
Example #10
File: test.py, Project: anton-l/datasets
    def run(self):
        fl_logger().setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module = dataset_module_factory(path)
        builder_cls = import_main_class(module.module_path)
        n_builders = len(
            builder_cls.BUILDER_CONFIGS
        ) if self._all_configs and builder_cls.BUILDER_CONFIGS else 1

        def get_builders() -> Generator[DatasetBuilder, None, None]:
            if self._all_configs and builder_cls.BUILDER_CONFIGS:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if "name" in module.builder_kwargs:
                        yield builder_cls(
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
                    else:
                        yield builder_cls(
                            name=config.name,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
            else:
                if "name" in module.builder_kwargs:
                    yield builder_cls(cache_dir=self._cache_dir,
                                      data_dir=self._data_dir,
                                      **module.builder_kwargs)
                else:
                    yield builder_cls(name=name,
                                      cache_dir=self._cache_dir,
                                      data_dir=self._data_dir,
                                      **module.builder_kwargs)

        for j, builder in enumerate(get_builders()):
            print(
                f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})"
            )
            builder._record_infos = True
            builder.download_and_prepare(
                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS if
                not self._force_redownload else DownloadMode.FORCE_REDOWNLOAD,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

            # If save_infos=True, the dataset infos file is created next to the loaded module file.
            # Let's move it to the original directory of the dataset script, to allow the user to
            # upload them on S3 at the same time afterwards.
            if self._save_infos:
                dataset_infos_path = os.path.join(
                    builder_cls.get_imported_module_dir(),
                    datasets.config.DATASETDICT_INFOS_FILENAME)
                name = Path(path).name + ".py"
                combined_path = os.path.join(path, name)
                if os.path.isfile(path):
                    dataset_dir = os.path.dirname(path)
                elif os.path.isfile(combined_path):
                    dataset_dir = path
                elif os.path.isdir(
                        path
                ):  # for local directories containing only data files
                    dataset_dir = path
                else:  # in case of a remote dataset
                    dataset_dir = None
                    print(f"Dataset Infos file saved at {dataset_infos_path}")

                # Move dataset_info back to the user
                if dataset_dir is not None:
                    user_dataset_infos_path = os.path.join(
                        dataset_dir,
                        datasets.config.DATASETDICT_INFOS_FILENAME)
                    copyfile(dataset_infos_path, user_dataset_infos_path)
                    print(
                        f"Dataset Infos file saved at {user_dataset_infos_path}"
                    )

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(
                    self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")
Example #11
    def run(self):
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)
        builders: List[DatasetBuilder] = []
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(name=config.name,
                                hash=hash,
                                cache_dir=self._cache_dir,
                                data_dir=self._data_dir))
        else:
            builders.append(
                builder_cls(name=name,
                            hash=hash,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir))

        for builder in builders:
            builder.download_and_prepare(
                download_mode=REUSE_CACHE_IF_EXISTS
                if not self._force_redownload else FORCE_REDOWNLOAD,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

        print("Test successful.")
        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(),
                DATASET_INFOS_DICT_FILE_NAME)

            name = Path(path).name + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print("Dataset Infos file saved at {}".format(
                    dataset_infos_path))
                exit(1)

            # Move the dataset infos file back to the user
            user_dataset_infos_path = os.path.join(
                dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print("Dataset Infos file saved at {}".format(
                user_dataset_infos_path))
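
All of the run methods above share the same core flow: resolve the dataset script, import its builder class, instantiate one builder per config, then call download_and_prepare and as_dataset. A condensed standalone sketch of that flow, assuming an older datasets release where prepare_module and import_main_class are importable from datasets.load, and using "./datasets/squad" as a placeholder path:

import tempfile

from datasets.load import import_main_class, prepare_module  # older `datasets` API, as used in the examples above

module_path, module_hash = prepare_module("./datasets/squad")  # placeholder path to a local dataset script
builder_cls = import_main_class(module_path)

with tempfile.TemporaryDirectory() as cache_dir:
    # Use the default (None) config; pass a config name here for multi-config datasets.
    builder = builder_cls(name=None, hash=module_hash, cache_dir=cache_dir)
    builder.download_and_prepare(try_from_hf_gcs=False)
    dataset = builder.as_dataset()
    print(dataset)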