Example #1
def test_load_dataset_builder_for_absolute_data_dir(complex_data_dir):
    builder = datasets.load_dataset_builder(complex_data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == Path(complex_data_dir).name
    assert isinstance(builder.config.data_files, DataFilesDict)
    assert len(builder.config.data_files["train"]) > 0
Example #2
    def check_existence(cls,
                        ner_dataset: str,
                        ner_dataset_subset: str = "") -> Tuple[bool, str]:
        """
        checks if ner_dataset exists in huggingface datasets

        Args:
            ner_dataset: e.g. "conll2003"
            ner_dataset_subset: e.g. "simple_cased"

        Returns:
            existence: True if ner_dataset exists in huggingface datasets, False otherwise
        """
        try:
            _ = load_dataset_builder(
                ner_dataset,
                name=ner_dataset_subset if len(ner_dataset_subset) else None,
            )
            return True, ""
        except ValueError as e:
            return (
                False,
                f"Error! config name is missing for ner_dataset = {ner_dataset} "
                f"(ner_dataset_subset = {ner_dataset_subset})! Error message: {e}",
            )
        except FileNotFoundError:
            return False, f"Error! ner_dataset = {ner_dataset} unknown."
Example #3
def test_load_dataset_builder_for_absolute_script_dir(
        dataset_loading_script_dir, data_dir):
    builder = datasets.load_dataset_builder(dataset_loading_script_dir,
                                            data_dir=data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == DATASET_LOADING_SCRIPT_NAME
    assert builder.info.features == Features({"text": Value("string")})
Example #4
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.split("/")[-1]
    assert isinstance(builder.config.data_files, list)
    assert len(builder.config.data_files) > 0
Example #5
def resolve_dataset(args, task: str):
    import datasets

    # Load only the builder (dataset metadata), not the data itself, so that
    # configs, splits and task templates can be inspected cheaply.
    builder = datasets.load_dataset_builder(
        args.dataset_name, use_auth_token=args.token
    )

    if args.dataset_config is None:
        args.dataset_config = builder.config_id
        print(f"Inferred dataset_config {args.dataset_config}")

    splits = builder.info.splits
    if splits is not None:
        if args.dataset_split not in splits:
            raise ValueError(
                f"The split `{args.dataset_split}` is not a valid split, please choose from {','.join(splits.keys())}"
            )

    task_templates = builder.info.task_templates
    if task_templates is not None:
        for task_template in task_templates:
            if task_template.task == task:
                args.dataset_column = task_template.audio_file_path_column
                print(f"Inferred dataset_column {args.dataset_column}")
    return (
        args.dataset_name,
        args.dataset_config,
        args.dataset_split,
        args.dataset_column,
    )
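A sketch of how resolve_dataset might be invoked. The argument parser is not shown above, so argparse.Namespace stands in for the parsed CLI arguments, and the dataset and task names are illustrative only:

from argparse import Namespace

# Hypothetical arguments; the field names mirror the attributes that
# resolve_dataset reads.
args = Namespace(
    dataset_name="common_voice",
    dataset_config=None,    # left as None so it is inferred from the builder
    dataset_split="test",
    dataset_column=None,    # inferred from a matching task template, if any
    token=None,
)

name, config, split, column = resolve_dataset(
    args, task="automatic-speech-recognition"
)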
Example #6
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.replace(
        "/", "___")
    assert isinstance(builder.config.data_files, DataFilesDict)
    assert len(builder.config.data_files["train"]) > 0
Example #7
def test_load_dataset_builder_for_relative_script_dir(dataset_loading_script_dir, data_dir):
    with set_current_working_directory_to_temp_dir():
        relative_script_dir = DATASET_LOADING_SCRIPT_NAME
        shutil.copytree(dataset_loading_script_dir, relative_script_dir)
        builder = datasets.load_dataset_builder(relative_script_dir, data_dir=data_dir)
        assert isinstance(builder, DatasetBuilder)
        assert builder.name == DATASET_LOADING_SCRIPT_NAME
        assert builder.info.features == Features({"text": Value("string")})
Example #8
def test_load_dataset_builder_for_community_dataset_with_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == SAMPLE_DATASET_IDENTIFIER.split("/")[-1]
    assert builder.info.features == Features({"text": Value("string")})
    namespace = SAMPLE_DATASET_IDENTIFIER[: SAMPLE_DATASET_IDENTIFIER.index("/")]
    assert builder._relative_data_dir().startswith(namespace)
    assert SAMPLE_DATASET_IDENTIFIER.replace("/", "___") in builder.__module__
Example #9
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.split("/")[-1]
    namespace = SAMPLE_DATASET_IDENTIFIER2[: SAMPLE_DATASET_IDENTIFIER2.index("/")]
    assert builder._relative_data_dir().startswith(namespace)
    assert isinstance(builder.config.data_files, list)
    assert len(builder.config.data_files) > 0
Example #10
def test_load_dataset_builder_for_relative_data_dir(complex_data_dir):
    with set_current_working_directory_to_temp_dir():
        relative_data_dir = "relative_data_dir"
        shutil.copytree(complex_data_dir, relative_data_dir)
        builder = datasets.load_dataset_builder(relative_data_dir)
        assert isinstance(builder, DatasetBuilder)
        assert builder.name == "text"
        assert builder.config.name == relative_data_dir
        assert isinstance(builder.config.data_files, DataFilesDict)
        assert len(builder.config.data_files["train"]) > 0
Example #11
def test_load_dataset_builder_fail():
    with pytest.raises(FileNotFoundError):
        datasets.load_dataset_builder("blabla")
Example #12
    def get_infos(
        cls,
        ner_dataset: str,
        ner_dataset_subset: str = "",
    ) -> Tuple[bool, Optional[List[str]], Optional[bool], Optional[Dict[str, Any]]]:
        """
        Gets all relevant info about the dataset.

        Args:
            ner_dataset: e.g. "conll2003"
            ner_dataset_subset: e.g. "simple_cased"

        Returns:
            implementation: True if ner_dataset is implemented, False otherwise
            tags: e.g. ["O", "B-LOC", "B-MISC", "B-ORG", "B-PER", "I-LOC", "I-MISC", "I-ORG", "I-PER"]
            pretokenized: e.g. True
            lookup_table: e.g. {'text': 'tokens', 'tags': 'ner_tags', 'mapping': None}
                          e.g. {'text': 'sentence', 'tags': 'entities', 'mapping': {..}}
        """
        dataset_builder = load_dataset_builder(
            ner_dataset,
            name=ner_dataset_subset if len(ner_dataset_subset) else None)
        if dataset_builder.info.features is None:
            return False, None, None, None
        else:
            feat = dict(dataset_builder.info.features)

        implementation: bool = False
        tags: Optional[List[str]] = None
        pretokenized: Optional[bool] = None
        lookup_table: Optional[Dict[str, Any]] = None
        try:
            if "ner_tags" in feat:  # e.g. conll2003
                keys = ["tokens", "ner_tags"]
                if all([key in feat for key in keys]):
                    implementation = True
                    tags = feat["ner_tags"].feature.names
                    pretokenized = True
                    lookup_table = {
                        "text": "tokens",
                        "tags": "ner_tags",
                        "mapping": None,
                    }
            elif "entities" in feat:  # e.g. ehealth_kd
                keys = ["sentence", "entities"]
                if all([key in feat for key in keys]):
                    entities_keys = [
                        "ent_text",
                        "ent_label",
                        "start_character",
                        "end_character",
                    ]
                    if all([
                            entities_key in feat["entities"][0]
                            for entities_key in entities_keys
                    ]):
                        implementation = True
                        tags = feat["entities"][0]["ent_label"].names
                        pretokenized = False
                        lookup_table = {
                            "text": "sentence",
                            "tags": "entities",
                            "mapping": {
                                "ent_text": "token",
                                "ent_label": "tag",
                                "start_character": "char_start",
                                "end_character": "char_end",
                            },
                        }
            else:
                return False, None, None, None
        except Exception:
            return False, None, None, None

        if (implementation is False or tags is None or pretokenized is None
                or lookup_table is None):
            return False, None, None, None
        else:
            return implementation, tags, pretokenized, lookup_table
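A usage sketch for get_infos, again with a placeholder class name since the enclosing class is not part of the snippet. For "conll2003" the method should take the "ner_tags" branch above:

# Hypothetical caller; "HuggingFaceDatasetHelper" is a stand-in name.
implementation, tags, pretokenized, lookup_table = (
    HuggingFaceDatasetHelper.get_infos(ner_dataset="conll2003")
)
if implementation:
    print(tags)          # e.g. ["O", "B-PER", "I-PER", ...]
    print(pretokenized)  # True: conll2003 ships pre-tokenized "tokens"
    print(lookup_table)  # {"text": "tokens", "tags": "ner_tags", "mapping": None}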