# Shared imports for the test snippets below. Fixtures such as
# `complex_data_dir` and constants such as SAMPLE_DATASET_IDENTIFIER
# come from the surrounding test suite.
import shutil
from pathlib import Path

import pytest

import datasets
from datasets import DatasetBuilder, Features, Value
from datasets.data_files import DataFilesDict


def test_load_dataset_builder_for_absolute_data_dir(complex_data_dir):
    builder = datasets.load_dataset_builder(complex_data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == Path(complex_data_dir).name
    assert isinstance(builder.config.data_files, DataFilesDict)
    assert len(builder.config.data_files["train"]) > 0
def check_existence(cls, ner_dataset: str, ner_dataset_subset: str = "") -> Tuple[bool, str]:
    """
    checks if ner_dataset exists in huggingface datasets

    Args:
        ner_dataset: e.g. "conll2003"
        ner_dataset_subset: e.g. "simple_cased"

    Returns:
        existence: True if ner_dataset exists in huggingface datasets, False otherwise
        error_message: empty string if the dataset exists, otherwise a description of the error
    """
    try:
        _ = load_dataset_builder(
            ner_dataset,
            name=ner_dataset_subset if len(ner_dataset_subset) else None,
        )
        return True, ""
    except ValueError as e:
        return (
            False,
            f"Error! config name is missing for ner_dataset = {ner_dataset} "
            f"(ner_dataset_subset = {ner_dataset_subset})! Error message: {e}",
        )
    except FileNotFoundError:
        return False, f"Error! ner_dataset = {ner_dataset} unknown."
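A minimal usage sketch: since check_existence takes cls as its first argument, it is presumably defined as a @classmethod on a helper class in the original project; NerDatasetHelper below is a hypothetical stand-in for that class.

from typing import Tuple

from datasets import load_dataset_builder


class NerDatasetHelper:
    # reuse the module-level function above as a classmethod
    check_existence = classmethod(check_existence)


exists, message = NerDatasetHelper.check_existence("conll2003")
print(exists, message)  # expected: True ""

exists, message = NerDatasetHelper.check_existence("this_dataset_does_not_exist")
print(message)  # expected: "Error! ner_dataset = this_dataset_does_not_exist unknown."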
def test_load_dataset_builder_for_absolute_script_dir(dataset_loading_script_dir, data_dir):
    builder = datasets.load_dataset_builder(dataset_loading_script_dir, data_dir=data_dir)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == DATASET_LOADING_SCRIPT_NAME
    assert builder.info.features == Features({"text": Value("string")})
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.split("/")[-1]
    assert isinstance(builder.config.data_files, list)
    assert len(builder.config.data_files) > 0
def resolve_dataset(args, task: str):
    import datasets

    builder = datasets.load_dataset_builder(args.dataset_name, use_auth_token=args.token)

    if args.dataset_config is None:
        args.dataset_config = builder.config_id
        print(f"Inferred dataset_config {args.dataset_config}")

    splits = builder.info.splits
    if splits is not None:
        if args.dataset_split not in splits:
            raise ValueError(
                f"The split `{args.dataset_split}` is not a valid split, please choose from {','.join(splits.keys())}"
            )

    task_templates = builder.info.task_templates
    if task_templates is not None:
        for task_template in task_templates:
            if task_template.task == task:
                args.dataset_column = task_template.audio_file_path_column
                print(f"Inferred dataset_column {args.dataset_column}")

    return (
        args.dataset_name,
        args.dataset_config,
        args.dataset_split,
        args.dataset_column,
    )
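A hedged invocation sketch: args normally comes from argparse, so a types.SimpleNamespace with the same attributes stands in here. The dataset name and task string are placeholders, not values from the source; config inference via builder.config_id only works for single-config datasets.

from types import SimpleNamespace

args = SimpleNamespace(
    dataset_name="some_asr_dataset",  # placeholder: a single-config dataset with an ASR task template
    dataset_config=None,              # left as None so it is inferred from the builder
    dataset_split="test",
    dataset_column=None,              # inferred from a matching task template, if any
    token=None,
)

name, config, split, column = resolve_dataset(args, task="automatic-speech-recognition")
print(name, config, split, column)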
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.replace("/", "___")
    assert isinstance(builder.config.data_files, DataFilesDict)
    assert len(builder.config.data_files["train"]) > 0
def test_load_dataset_builder_for_relative_script_dir(dataset_loading_script_dir, data_dir):
    with set_current_working_directory_to_temp_dir():
        relative_script_dir = DATASET_LOADING_SCRIPT_NAME
        shutil.copytree(dataset_loading_script_dir, relative_script_dir)
        builder = datasets.load_dataset_builder(relative_script_dir, data_dir=data_dir)
        assert isinstance(builder, DatasetBuilder)
        assert builder.name == DATASET_LOADING_SCRIPT_NAME
        assert builder.info.features == Features({"text": Value("string")})
def test_load_dataset_builder_for_community_dataset_with_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == SAMPLE_DATASET_IDENTIFIER.split("/")[-1]
    assert builder.info.features == Features({"text": Value("string")})
    namespace = SAMPLE_DATASET_IDENTIFIER[: SAMPLE_DATASET_IDENTIFIER.index("/")]
    assert builder._relative_data_dir().startswith(namespace)
    assert SAMPLE_DATASET_IDENTIFIER.replace("/", "___") in builder.__module__
def test_load_dataset_builder_for_community_dataset_without_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == "text"
    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.split("/")[-1]
    namespace = SAMPLE_DATASET_IDENTIFIER2[: SAMPLE_DATASET_IDENTIFIER2.index("/")]
    assert builder._relative_data_dir().startswith(namespace)
    assert isinstance(builder.config.data_files, list)
    assert len(builder.config.data_files) > 0
def test_load_dataset_builder_for_relative_data_dir(complex_data_dir):
    with set_current_working_directory_to_temp_dir():
        relative_data_dir = "relative_data_dir"
        shutil.copytree(complex_data_dir, relative_data_dir)
        builder = datasets.load_dataset_builder(relative_data_dir)
        assert isinstance(builder, DatasetBuilder)
        assert builder.name == "text"
        assert builder.config.name == relative_data_dir
        assert isinstance(builder.config.data_files, DataFilesDict)
        assert len(builder.config.data_files["train"]) > 0
def test_load_dataset_builder_fail():
    with pytest.raises(FileNotFoundError):
        datasets.load_dataset_builder("blabla")
from typing import Any, Dict, List, Optional, Tuple

from datasets import load_dataset_builder


def get_infos(
    cls,
    ner_dataset: str,
    ner_dataset_subset: str = "",
) -> Tuple[bool, Optional[List[str]], Optional[bool], Optional[Dict[str, Any]]]:
    """
    get all relevant infos about dataset

    Args:
        ner_dataset: e.g. "conll2003"
        ner_dataset_subset: e.g. "simple_cased"

    Returns:
        implementation: True if ner_dataset is implemented, False otherwise
        tags: e.g. ["O", "B-LOC", "B-MISC", "B-ORG", "B-PER", "I-LOC", "I-MISC", "I-ORG", "I-PER"]
        pretokenized: e.g. True
        lookup_table: e.g. {'text': 'tokens', 'tags': 'ner_tags', 'mapping': None}
                      e.g. {'text': 'sentence', 'tags': 'entities', 'mapping': {..}}
    """
    dataset_builder = load_dataset_builder(
        ner_dataset,
        name=ner_dataset_subset if len(ner_dataset_subset) else None,
    )

    if dataset_builder.info.features is None:
        return False, None, None, None

    feat = dict(dataset_builder.info.features)
    implementation: bool = False
    tags: Optional[List[str]] = None
    pretokenized: Optional[bool] = None
    lookup_table: Optional[Dict[str, Any]] = None

    try:
        if "ner_tags" in feat:  # e.g. conll2003
            keys = ["tokens", "ner_tags"]
            if all(key in feat for key in keys):
                implementation = True
                tags = feat["ner_tags"].feature.names
                pretokenized = True
                lookup_table = {
                    "text": "tokens",
                    "tags": "ner_tags",
                    "mapping": None,
                }
        elif "entities" in feat:  # e.g. ehealth_kd
            keys = ["sentence", "entities"]
            if all(key in feat for key in keys):
                entities_keys = [
                    "ent_text",
                    "ent_label",
                    "start_character",
                    "end_character",
                ]
                if all(
                    entities_key in feat["entities"][0]
                    for entities_key in entities_keys
                ):
                    implementation = True
                    tags = feat["entities"][0]["ent_label"].names
                    pretokenized = False
                    lookup_table = {
                        "text": "sentence",
                        "tags": "entities",
                        "mapping": {
                            "ent_text": "token",
                            "ent_label": "tag",
                            "start_character": "char_start",
                            "end_character": "char_end",
                        },
                    }
        else:
            return False, None, None, None
    except Exception:
        return False, None, None, None

    if implementation is False or tags is None or pretokenized is None or lookup_table is None:
        return False, None, None, None
    return implementation, tags, pretokenized, lookup_table
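A usage sketch under the same assumption as for check_existence: get_infos is presumably a @classmethod on a helper class, mirrored here by the hypothetical NerDatasetHelper.

class NerDatasetHelper:
    get_infos = classmethod(get_infos)


implemented, tags, pretokenized, lookup_table = NerDatasetHelper.get_infos("conll2003")
if implemented:
    print(tags)          # label names, e.g. "O", "B-PER", ...
    print(pretokenized)  # True: conll2003 provides tokens + ner_tags
    print(lookup_table)  # {'text': 'tokens', 'tags': 'ner_tags', 'mapping': None}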
def test_load_dataset_builder_for_community_dataset_with_script():
    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER)
    assert isinstance(builder, DatasetBuilder)
    assert builder.name == SAMPLE_DATASET_IDENTIFIER.split("/")[-1]
    assert builder.info.features == Features({"text": Value("string")})
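The tests above exercise load_dataset_builder indirectly; a direct, minimal sketch of the same API follows. The dataset name is an arbitrary public example, not taken from the snippets above.

from datasets import load_dataset_builder

builder = load_dataset_builder("imdb")
print(builder.name)           # builder name, here "imdb"
print(builder.info.features)  # schema, available without downloading the data
print(builder.info.splits)    # split metadata, if present in the dataset infos (may be None)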