Example #1
File: test.py  Project: wjj962464/nlp-1
    def run(self):
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)
        builders: List[DatasetBuilder] = []
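        # One builder per declared config when all configs are requested, otherwise a single builder for the requested config.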
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(name=config.name,
                                hash=hash,
                                cache_dir=self._cache_dir,
                                data_dir=self._data_dir))
        else:
            builders.append(
                builder_cls(name=name,
                            hash=hash,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir))

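        # Prepare each selected builder, reusing the local cache unless a forced re-download was requested.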
        for builder in builders:
            builder.download_and_prepare(
                download_mode=REUSE_CACHE_IF_EXISTS
                if not self._force_redownload else FORCE_REDOWNLOAD,
                save_infos=self._save_infos,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )

        print("Test successful.")
        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(),
                DATASET_INFOS_DICT_FILE_NAME)

            name = list(filter(lambda x: x, path.split("/")))[-1] + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print("Dataset Infos file saved at {}".format(
                    dataset_infos_path))
                exit(1)

            # Move the dataset_infos.json file back into the user's dataset directory
            user_dataset_infos_path = os.path.join(
                dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print("Dataset Infos file saved at {}".format(
                user_dataset_infos_path))
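
All three examples follow the same load-then-build pattern: `prepare_module` resolves a dataset script and returns the importable module path plus its hash, `import_main_class` returns the script's `DatasetBuilder` subclass, and the builder is then instantiated and prepared. Below is a minimal standalone sketch of that pattern, assuming the pre-1.0 `nlp` API used in these examples; the import location, dataset path, and cache directory are assumptions/placeholders, not values taken from the examples.

from nlp.load import import_main_class, prepare_module  # assumed import location

# Hypothetical local dataset script directory and cache directory.
path = "./datasets/my_dataset"
module_path, hash = prepare_module(path)
builder_cls = import_main_class(module_path)

# Build the default config (name=None) and materialize it locally.
builder = builder_cls(name=None, hash=hash, cache_dir="./hf_cache")
builder.download_and_prepare()
dataset = builder.as_dataset()
print(dataset)

Example #1 wraps this sequence in a CLI command and simply repeats it once per entry in BUILDER_CONFIGS when all configs are requested.
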
Example #2
    def test_dataset_info_available(self, dataset, config_name):

        with TemporaryDirectory() as tmp_dir:
            local_module_path, local_hash = prepare_module(
                os.path.join("datasets", dataset), dataset=True, cache_dir=tmp_dir, local_files_only=True
            )

            builder_cls = import_main_class(local_module_path, dataset=True)

            builder_instance: DatasetBuilder = builder_cls(
                cache_dir=tmp_dir, name=config_name, hash=local_hash,
            )

            dataset_info_url = os.path.join(
                HF_GCP_BASE_URL, builder_instance._relative_data_dir(with_hash=False), DATASET_INFO_FILENAME
            )
            dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
            self.assertTrue(os.path.exists(dataset_info_path))
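
Example #2 only asserts that the remote dataset info file could be cached locally. If you also want to look inside it, a short sketch follows, continuing from the variables above and assuming DATASET_INFO_FILENAME refers to the JSON info file the library writes (the specific keys are not guaranteed).

import json

with open(dataset_info_path, encoding="utf-8") as f:
    info = json.load(f)

# A non-empty mapping is a slightly stronger check than bare file existence.
assert isinstance(info, dict) and info
print(sorted(info))
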
Example #3
    def run(self):
        module_path, hash = prepare_module(self._path_to_dataset)
        builder_cls = import_main_class(module_path)

        # use `None` as config if no configs
        configs = builder_cls.BUILDER_CONFIGS or [None]

        for config in configs:
            if config is None:
                name = None
                version = builder_cls.VERSION
            else:
                version = config.version
                name = config.name

            dataset_builder = builder_cls(name=name, hash=hash)
            mock_dl_manager = MockDownloadManager(
                dataset_name=self._dataset_name, config=config, version=version, is_local=True
            )

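            # The mock download manager knows where the dummy data folder is expected to live relative to the dataset script.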
            dummy_data_folder = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
            logger.info(f"Creating dummy folder structure for {dummy_data_folder}... ")
            os.makedirs(dummy_data_folder, exist_ok=True)

            try:
                generator_splits = dataset_builder._split_generators(mock_dl_manager)
            except FileNotFoundError as e:

                print(
                    f"Dataset {self._dataset_name} with config {config} seems to already open files in the method `_split_generators(...)`. You might consider opening files only in the method `_generate_examples(...)` instead. If this is not possible, the dummy data has to be created with less guidance. Make sure you create the file {e.filename}."
                )
                # `generator_splits` is not set in this case, so skip guidance for this config.
                continue

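            # Run every split generator against the mock manager and record which dummy files it tries to open.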
            files_to_create = set()
            split_names = []
            dummy_file_name = mock_dl_manager.dummy_file_name

            for split in generator_splits:
                logger.info(f"Collecting dummy data file paths to create for {split.name}")
                split_names.append(split.name)
                gen_kwargs = split.gen_kwargs
                generator = dataset_builder._generate_examples(**gen_kwargs)

                try:
                    dummy_data_guidance_print = "\n" + 30 * "=" + "DUMMY DATA INSTRUCTIONS" + 30 * "=" + "\n"
                    config_string = f"config {config.name} of " if config is not None else ""
                    dummy_data_guidance_print += (
                        "- In order to create the dummy data for "
                        + config_string
                        + f"{self._dataset_name}, please go into the folder '{dummy_data_folder}' with `cd {dummy_data_folder}` . \n\n"
                    )

                    # trigger generate function
                    for key, record in generator:
                        pass

                    dummy_data_guidance_print += f"- It appears that the function `_generate_examples(...)` expects one or more files in the folder {dummy_file_name} using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. \n\n"

                except FileNotFoundError as e:
                    files_to_create.add(e.filename)

            split_names = ", ".join(split_names)
            if len(files_to_create) > 0:
                # no glob.glob(...) in `_generate_examples(...)`
                if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
                    dummy_data_guidance_print += f"- Please create a single dummy data file called '{next(iter(files_to_create))}' from the folder '{dummy_data_folder}'. Make sure that the dummy data file provides at least one example for the split(s) '{split_names}' \n\n"
                    files_string = dummy_file_name
                else:
                    files_string = ", ".join(files_to_create)
                    dummy_data_guidance_print += f"- Please create the following dummy data files '{files_string}' from the folder '{dummy_data_folder}'\n\n"

                    dummy_data_guidance_print += f"- For each of the splits '{split_names}', make sure that one or more of the dummy data files provide at least one example \n\n"

                dummy_data_guidance_print += f"- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to '{files_string}'. In this case please refer to the `_generate_examples(...)` method \n\n"

            if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
                dummy_data_guidance_print += f"-After the dummy data file is created, it should be zipped to '{dummy_file_name}.zip' with the command `zip {dummy_file_name}.zip {dummy_file_name}` \n\n"

                dummy_data_guidance_print += (
                    f"- You can now delete the file '{dummy_file_name}' with the command `rm {dummy_file_name}` \n\n"
                )

                dummy_data_guidance_print += f"- To get the file '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"
            else:
                dummy_data_guidance_print += f"-After all dummy data files are created, they should be zipped recursively to '{dummy_file_name}.zip' with the command `zip -r {dummy_file_name}.zip {dummy_file_name}/` \n\n"

                dummy_data_guidance_print += f"-You can now delete the folder '{dummy_file_name}' with the command `rm -r {dummy_file_name}` \n\n"

                dummy_data_guidance_print += f"- To get the folder '{dummy_file_name}' back for further changes to the dummy data, simply unzip {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"

            dummy_data_guidance_print += (
                f"- Make sure you have created the file '{dummy_file_name}.zip' in '{dummy_data_folder}' \n"
            )

            dummy_data_guidance_print += 83 * "=" + "\n"

            print(dummy_data_guidance_print)
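
The guidance printed above ends with manual `zip`/`rm`/`unzip` shell steps. If you prefer to script the archiving step, here is a small standard-library sketch; the folder and archive names mirror the printed instructions and are otherwise placeholders.

import os
import zipfile

def zip_dummy_data(dummy_data_folder: str, dummy_file_name: str = "dummy_data") -> str:
    """Zip `<dummy_data_folder>/<dummy_file_name>` into `<dummy_file_name>.zip`,
    mirroring the `zip -r dummy_data.zip dummy_data/` step from the guidance above."""
    src = os.path.join(dummy_data_folder, dummy_file_name)
    archive = os.path.join(dummy_data_folder, dummy_file_name + ".zip")
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        if os.path.isfile(src):
            # Single dummy data file: archive it under its own name.
            zf.write(src, arcname=dummy_file_name)
        else:
            # Folder of dummy data files: archive it recursively, keeping paths
            # relative to the dummy data folder (as `zip -r` would from that folder).
            for root, _, files in os.walk(src):
                for name in files:
                    full = os.path.join(root, name)
                    zf.write(full, arcname=os.path.relpath(full, dummy_data_folder))
    return archive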