def _info(self):
    """Describe the dataset: file path, 16 kHz audio, transcript, phone/word segments and speaker fields."""

    def segment_sequence():
        # "phonetic_detail" and "word_detail" share one schema; a fresh
        # Sequence is built per column so the two features stay separate objects.
        return datasets.Sequence(
            {
                "start": datasets.Value("int64"),
                "stop": datasets.Value("int64"),
                "utterance": datasets.Value("string"),
            }
        )

    features = datasets.Features(
        {
            "file": datasets.Value("string"),
            "audio": datasets.Audio(sampling_rate=16_000),
            "text": datasets.Value("string"),
            "phonetic_detail": segment_sequence(),
            "word_detail": segment_sequence(),
            "dialect_region": datasets.Value("string"),
            "sentence_type": datasets.Value("string"),
            "speaker_id": datasets.Value("string"),
            "id": datasets.Value("string"),
        }
    )
    asr_task = AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=("file", "text"),
        homepage=_HOMEPAGE,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Build the DatasetInfo: per-utterance fields, transcript, phonemes and 16 kHz audio."""
    schema = {
        "utterance_id": datasets.Value("string"),
        "session": datasets.Value("string"),
        "test": datasets.Value("string"),
        "prompt": datasets.Value("string"),
        "transcript": datasets.Value("string"),
        "phonemes": datasets.Sequence(datasets.Value("string")),
        "correctness": datasets.Value("bool"),
        "aq_index": datasets.Value("float"),
        "duration_frames": datasets.Value("uint64"),
        "audio": datasets.Audio(sampling_rate=16_000),
    }
    # NOTE(review): the template points at a "filename" column, but the schema
    # above declares no such column (the audio is stored under "audio").
    # Confirm which column the ASR task is meant to read.
    asr_task = AutomaticSpeechRecognition(
        audio_file_path_column="filename",
        transcription_column="transcript",
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage="https://psst.study/",
        task_templates=[asr_task],
    )
def _info(self):
    """Emit the deprecation warning for this loader, then return its DatasetInfo."""
    deprecation_notice = """
        This version of the Multilingual Librispeech dataset doesn't support streaming and is deprecated.
        You can download the latest one with
        >>> load_dataset(\"facebook/multilingual_librispeech\", \"polish\")
        """
    warnings.warn(deprecation_notice)

    schema = {
        "file": datasets.Value("string"),
        "audio": datasets.features.Audio(sampling_rate=16_000),
        "text": datasets.Value("string"),
        "speaker_id": datasets.Value("int64"),
        "chapter_id": datasets.Value("int64"),
        "id": datasets.Value("string"),
    }
    asr_task = AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: clip path, sentence, vote counts and speaker metadata columns."""
    # (column, dtype) pairs, listed in the order the columns appear in the schema.
    column_dtypes = (
        ("client_id", "string"),
        ("path", "string"),
        ("sentence", "string"),
        ("up_votes", "int64"),
        ("down_votes", "int64"),
        ("age", "string"),
        ("gender", "string"),
        ("accent", "string"),
        ("locale", "string"),
        ("segment", "string"),
    )
    schema = {column: datasets.Value(dtype) for column, dtype in column_dtypes}
    asr_task = AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: speaker attributes (class-labelled gender and regions), text, path and audio."""
    asr_task = AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "speaker_id": datasets.Value("string"),
                "age": datasets.Value("string"),
                "gender": datasets.ClassLabel(names=_SEX),
                # Both region columns draw their label names from _REGIONS.
                "region_of_birth": datasets.ClassLabel(names=_REGIONS),
                "region_of_youth": datasets.ClassLabel(names=_REGIONS),
                "text": datasets.Value("string"),
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
            }
        ),
        supervised_keys=None,
        homepage=_URL,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: file path, 16 kHz audio, transcript and speaker/chapter/utterance ids."""
    schema = {
        "file": datasets.Value("string"),
        "audio": datasets.features.Audio(sampling_rate=16_000),
        "text": datasets.Value("string"),
        "speaker_id": datasets.Value("int64"),
        "chapter_id": datasets.Value("int64"),
        "id": datasets.Value("string"),
    }
    # The ASR template reads the path column ("file"), not the decoded "audio" column.
    asr_task = AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def test_column_mapping(self):
    """The template's column_mapping maps each supplied column name to its canonical task column."""
    expected_mapping = {
        "input_audio_file_path": "audio_file_path",
        "input_transcription": "transcription",
    }
    task = AutomaticSpeechRecognition(
        transcription_column="input_transcription",
        audio_file_path_column="input_audio_file_path",
    )
    self.assertDictEqual(expected_mapping, task.column_mapping)
def _info(self):
    """Return a DatasetInfo with empty text metadata, a 48 kHz audio column and clip/speaker columns."""
    schema = {
        "client_id": ds.Value("string"),
        "path": ds.Value("string"),
        "audio": ds.Audio(sampling_rate=48_000),
        "sentence": ds.Value("string"),
        "up_votes": ds.Value("int64"),
        "down_votes": ds.Value("int64"),
        "age": ds.Value("string"),
        "gender": ds.Value("string"),
        "accent": ds.Value("string"),
        "locale": ds.Value("string"),
        "segment": ds.Value("string"),
    }
    asr_task = AutomaticSpeechRecognition(
        audio_file_path_column="path",
        transcription_column="sentence",
    )
    # Description, citation, homepage and license are deliberately passed as empty strings.
    return ds.DatasetInfo(
        description="",
        citation="",
        homepage="",
        license="",
        features=ds.Features(schema),
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: speaker id, 48 kHz audio and string-valued text/speaker metadata columns."""
    schema = {
        "speaker_id": datasets.Value("string"),
        "audio": datasets.features.Audio(sampling_rate=48_000),
    }
    # Every remaining column is a plain string; insertion order fixes the column order.
    for column in ("file", "text", "text_id", "age", "gender", "accent", "region", "comment"):
        schema[column] = datasets.Value("string")

    asr_task = AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: id, file path, raw text and normalized text (all strings)."""
    column_names = ("id", "file", "text", "normalized_text")
    features = datasets.Features({name: datasets.Value("string") for name in column_names})
    asr_task = AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: clip path, 48 kHz audio and the spoken sentence."""
    asr_task = AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "path": datasets.Value("string"),
                "audio": datasets.features.Audio(sampling_rate=48_000),
                "sentence": datasets.Value("string"),
            }
        ),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
        task_templates=[asr_task],
    )
def _info(self):
    """Return the DatasetInfo: file path, text, 48 kHz audio plus phonetic and orthographic strings."""
    schema = {
        "file": datasets.Value("string"),
        "text": datasets.Value("string"),
        "audio": datasets.Audio(sampling_rate=48_000),
        "phonetic": datasets.Value("string"),
        "orthographic": datasets.Value("string"),
    }
    asr_task = AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(schema),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[asr_task],
    )
class Superb(datasets.GeneratorBasedBuilder):
    """Superb dataset.

    Declares one ``SuperbConfig`` per task ("asr", "ks", "ic", "si", "sd",
    "er"). Each config carries its own description, feature schema and
    supervised keys; every schema includes a "file" string column and a
    16 kHz "audio" column.
    """

    BUILDER_CONFIGS = [
        # Speech recognition: the only config here that attaches a task template.
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
        ),
        # Keyword spotting: 12-way label (ten keywords plus "_silence_" and "_unknown_").
        # data_url is a template with a {filename} placeholder.
        SuperbConfig(
            name="ks",
            description=textwrap.dedent(
                """\
            Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of
            words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and
            inference time are all crucial. SUPERB uses the widely used Speech Commands dataset v1.0 for the task.
            The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the
            false positive.
            The evaluation metric is accuracy (ACC)"""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "label": datasets.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.tensorflow.org/datasets/catalog/speech_commands",
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        # Intent classification: three separate class labels (action, object, location),
        # so no single supervised key pair is declared.
        SuperbConfig(
            name="ic",
            description=textwrap.dedent(
                """\
            Intent Classification (IC) classifies utterances into predefined classes to determine the intent of
            speakers. SUPERB uses the Fluent Speech Commands dataset, where each utterance is tagged with three intent
            labels: action, object, and location. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "speaker_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "action": datasets.ClassLabel(
                        names=["activate", "bring", "change language", "deactivate", "decrease", "increase"]
                    ),
                    "object": datasets.ClassLabel(
                        names=[
                            "Chinese",
                            "English",
                            "German",
                            "Korean",
                            "heat",
                            "juice",
                            "lamp",
                            "lights",
                            "music",
                            "newspaper",
                            "none",
                            "shoes",
                            "socks",
                            "volume",
                        ]
                    ),
                    "location": datasets.ClassLabel(names=["bedroom", "kitchen", "none", "washroom"]),
                }
            ),
            supervised_keys=None,
            url="https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/",
            data_url="http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
        ),
        # Speaker identification: no data_url is given for this config.
        SuperbConfig(
            name="si",
            description=textwrap.dedent(
                """\
            Speaker Identification (SI) classifies each utterance for its speaker identity as a multi-class
            classification, where speakers are in the same predefined set for both training and testing.
            The widely used VoxCeleb1 dataset is adopted, and the evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    # VoxCeleb1 contains 1251 speaker IDs in range ["id10001",..."id11251"]
                    "label": datasets.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html",
        ),
        # Speaker diarization: each example is a [start, end] chunk of a recording with
        # a list of per-speaker segments. data_url has {split} and {filename} placeholders.
        SuperbConfig(
            name="sd",
            description=textwrap.dedent(
                """\
            Speaker Diarization (SD) predicts `who is speaking when` for each timestamp, and multiple speakers can
            speak simultaneously. The model has to encode rich speaker characteristics for each frame and should be
            able to represent mixtures of signals. [LibriMix] is adopted where LibriSpeech
            train-clean-100/dev-clean/test-clean are used to generate mixtures for training/validation/testing.
            We focus on the two-speaker scenario as the first step. The time-coded speaker labels were generated using
            alignments from Kaldi LibriSpeech ASR model. The evaluation metric is diarization error rate (DER)."""
            ),
            features=datasets.Features(
                {
                    "record_id": datasets.Value("string"),
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "start": datasets.Value("int64"),
                    "end": datasets.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": datasets.Value("string"),
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                        }
                    ],
                }
            ),  # TODO
            supervised_keys=None,  # TODO
            url="https://github.com/ftshijt/LibriMix",
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
        # Emotion recognition: four-class label; no data_url is given for this config.
        SuperbConfig(
            name="er",
            description=textwrap.dedent(
                """\
            Emotion Recognition (ER) predicts an emotion class for each utterance.
            The most widely used ER dataset IEMOCAP is adopted, and we follow the conventional evaluation protocol:
            we drop the unbalanced emotion classes to leave the final four classes with a similar amount of data points
            and cross-validate on five folds of the standard splits. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "label": datasets.ClassLabel(names=["neu", "hap", "ang", "sad"]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://sail.usc.edu/iemocap/",
        ),
    ]
class Superb(datasets.GeneratorBasedBuilder):
    """Superb dataset."""

    BUILDER_CONFIGS = [
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text"),
            ],
        )
    ]

    def _info(self):
        """Return the dataset metadata; homepage and task templates come from the active config."""
        schema = {
            "file": datasets.Value("string"),
            "text": datasets.Value("string"),
            "speaker_id": datasets.Value("int64"),
            "chapter_id": datasets.Value("int64"),
            "id": datasets.Value("string"),
        }
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(schema),
            supervised_keys=("file", "text"),
            homepage=self.config.url,
            citation=_CITATION,
            task_templates=self.config.task_templates,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the three archives and expose them as train/validation/test.

        Returns None when the active config is not "asr".
        """
        if self.config.name != "asr":
            return None

        archive_names = {
            "dev": "dev-clean.tar.gz",
            "test": "test-clean.tar.gz",
            "train": "train-clean-100.tar.gz",
        }
        urls = {part: self.config.data_url + archive for part, archive in archive_names.items()}
        extracted = dl_manager.download_and_extract(urls)

        # (split name, key into the extracted-archive mapping)
        split_parts = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "dev"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(name=split_name, gen_kwargs={"archive_path": extracted[part]})
            for split_name, part in split_parts
        ]

    def _generate_examples(self, archive_path):
        """Generate examples."""
        pattern = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
        next_key = 0
        # Sorted so that keys are assigned in a stable order across runs.
        for transcript_file in sorted(glob.glob(pattern)):
            transcript_dir = os.path.dirname(transcript_file)
            with open(transcript_file, "r", encoding="utf-8") as handle:
                for raw_line in handle:
                    # Each line is "<utterance id> <transcript>".
                    utt_id, text = raw_line.strip().split(" ", 1)
                    # The first two dash-separated fields of the id are the speaker and chapter.
                    speaker_id, chapter_id = map(int, utt_id.split("-")[:2])
                    example = {
                        "id": utt_id,
                        "speaker_id": speaker_id,
                        "chapter_id": chapter_id,
                        "file": os.path.join(transcript_dir, f"{utt_id}.flac"),
                        "text": text,
                    }
                    yield next_key, example
                    next_key += 1
class Superb(datasets.GeneratorBasedBuilder):
    """Superb dataset.

    Declares one ``SuperbConfig`` per task ("asr", "ks", "sd").
    ``_split_generators`` and ``_generate_examples`` branch on
    ``self.config.name``; example generation is delegated to one private
    helper per task.
    """

    BUILDER_CONFIGS = [
        # Speech recognition: the only config that attaches a task template.
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text"),
            ],
        ),
        # Keyword spotting: data_url is a template with a {filename} placeholder.
        SuperbConfig(
            name="ks",
            description=textwrap.dedent(
                """\
            Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of
            words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and
            inference time are all crucial. SUPERB uses the widely used [Speech Commands dataset v1.0] for the task.
            The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the
            false positive. The evaluation metric is accuracy (ACC)"""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "label": datasets.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.tensorflow.org/datasets/catalog/speech_commands",
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        # Speaker diarization: data_url has {split} and {filename} placeholders.
        SuperbConfig(
            name="sd",
            description=textwrap.dedent(
                """\
            Speaker Diarization (SD) predicts `who is speaking when` for each timestamp, and multiple speakers can
            speak simultaneously.
            The model has to encode rich speaker characteristics for each frame and should be able to represent
            mixtures of signals. [LibriMix] is adopted where LibriSpeech train-clean-100/dev-clean/test-clean are used
            to generate mixtures for training/validation/testing. We focus on the two-speaker scenario as the first
            step. The time-coded speaker labels were generated using alignments from Kaldi LibriSpeech ASR model.
            The evaluation metric is diarization error rate (DER)."""
            ),
            features=datasets.Features(
                {
                    "record_id": datasets.Value("string"),
                    "file": datasets.Value("string"),
                    "start": datasets.Value("int64"),
                    "end": datasets.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": datasets.Value("string"),
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                        }
                    ],
                }
            ),  # TODO
            supervised_keys=None,  # TODO
            url="https://github.com/ftshijt/LibriMix",
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
    ]

    def _info(self):
        """Return the DatasetInfo assembled from the active config's features, keys, url and templates."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=self.config.features,
            supervised_keys=self.config.supervised_keys,
            homepage=self.config.url,
            citation=_CITATION,
            task_templates=self.config.task_templates,
        )

    def _split_generators(self, dl_manager):
        """Download the data for the active config and describe its splits.

        Args:
            dl_manager: Download manager; only ``download_and_extract`` is used.

        Returns:
            A list of ``datasets.SplitGenerator`` for "asr", "ks" and "sd";
            ``None`` for any other config name.
        """
        if self.config.name == "asr":
            _DL_URLS = {
                "dev": self.config.data_url + "dev-clean.tar.gz",
                "test": self.config.data_url + "test-clean.tar.gz",
                "train": self.config.data_url + "train-clean-100.tar.gz",
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"archive_path": archive_path["train"]}),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}),
            ]
        elif self.config.name == "ks":
            _DL_URLS = {
                "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"),
                "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"),
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            # Train and validation are both carved out of the same archive;
            # the test split has its own archive.
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "val"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={"archive_path": archive_path["test"], "split": "test"},
                ),
            ]
        elif self.config.name == "sd":
            splits = ["train", "dev", "test"]
            # One nested mapping per split: filename -> URL.
            _DL_URLS = {
                split: {
                    filename: self.config.data_url.format(split=split, filename=filename)
                    for filename in ["reco2dur", "segments", "utt2spk", "wav.zip"]
                }
                for split in splits
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(
                    name=datasets.NamedSplit(split),
                    gen_kwargs={"archive_path": archive_path[split], "split": split},
                )
                for split in splits
            ]

    def _generate_examples(self, archive_path, split=None):
        """Generate examples.

        Args:
            archive_path: Extracted download location(s) received through ``gen_kwargs``.
            split: Split identifier received through ``gen_kwargs``; not used for "asr".

        Yields:
            ``(key, example)`` pairs for the active config. Nothing is yielded
            for a config name other than "asr", "ks" or "sd".
        """
        if self.config.name == "asr":
            yield from self._generate_asr_examples(archive_path)
        elif self.config.name == "ks":
            yield from self._generate_ks_examples(archive_path, split)
        elif self.config.name == "sd":
            yield from self._generate_sd_examples(archive_path, split)

    def _generate_asr_examples(self, archive_path):
        """Yield one example per transcript line found under ``<archive_path>/LibriSpeech``."""
        transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
        key = 0
        # Sorted so that keys are assigned in a stable order across runs.
        for transcript_path in sorted(glob.glob(transcripts_glob)):
            transcript_dir_path = os.path.dirname(transcript_path)
            with open(transcript_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Each line is "<utterance id> <transcript>".
                    id_, transcript = line.strip().split(" ", 1)
                    # The first two dash-separated fields of the id are the speaker and chapter.
                    speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                    yield key, {
                        "id": id_,
                        "speaker_id": speaker_id,
                        "chapter_id": chapter_id,
                        "file": os.path.join(transcript_dir_path, f"{id_}.flac"),
                        "text": transcript,
                    }
                    key += 1

    def _generate_ks_examples(self, archive_path, split):
        """Yield keyword-spotting examples, labelling each file by the name of its parent directory."""
        keywords = {"yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"}
        splits = _split_ks_files(archive_path, split)
        for key, audio_file in enumerate(sorted(splits[split])):
            word = os.path.basename(os.path.dirname(audio_file))
            if word in keywords:
                label = word
            elif word in ("_silence_", "_background_noise_"):
                # Background-noise recordings are folded into the silence class.
                label = "_silence_"
            else:
                label = "_unknown_"
            yield key, {"file": audio_file, "label": label}

    def _generate_sd_examples(self, archive_path, split):
        """Yield diarization examples, one per ``(record, start, end)`` chunk."""
        data = SdData(archive_path)
        args = SdArgs()
        chunk_indices = _generate_chunk_indices(data, args, split=split)
        if split != "test":
            chunks = chunk_indices
        else:
            # For the test split, chunk_indices is looked up by key and each entry
            # holds a list of chunks; flatten it so keys stay consecutive.
            chunks = (chunk for group in chunk_indices for chunk in chunk_indices[group])
        for key, (rec, st, ed) in enumerate(chunks):
            speakers = _get_speakers(rec, data, args)
            yield key, {
                "record_id": rec,
                "file": data.wavs[rec],
                "start": st,
                "end": ed,
                "speakers": speakers,
            }
class Superb(datasets.GeneratorBasedBuilder):
    """Superb dataset.

    Declares one ``SuperbConfig`` per task ("asr", "ks", "ic", "si", "sd",
    "er"). ``_split_generators`` and ``_generate_examples`` branch on
    ``self.config.name``; example generation is delegated to one private
    helper per task. The "si" and "er" configs read from a manually
    downloaded directory (see ``manual_download_instructions``).
    """

    BUILDER_CONFIGS = [
        # Speech recognition: the only config that attaches a task template.
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
        ),
        # Keyword spotting: data_url is a template with a {filename} placeholder.
        SuperbConfig(
            name="ks",
            description=textwrap.dedent(
                """\
            Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of
            words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and
            inference time are all crucial. SUPERB uses the widely used Speech Commands dataset v1.0 for the task.
            The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the
            false positive. The evaluation metric is accuracy (ACC)"""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "label": datasets.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.tensorflow.org/datasets/catalog/speech_commands",
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        # Intent classification: three separate class labels, so no supervised key pair.
        SuperbConfig(
            name="ic",
            description=textwrap.dedent(
                """\
            Intent Classification (IC) classifies utterances into predefined classes to determine the intent of
            speakers.
            SUPERB uses the Fluent Speech Commands dataset, where each utterance is tagged with three intent labels:
            action, object, and location. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "speaker_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "action": datasets.ClassLabel(
                        names=["activate", "bring", "change language", "deactivate", "decrease", "increase"]
                    ),
                    "object": datasets.ClassLabel(
                        names=[
                            "Chinese",
                            "English",
                            "German",
                            "Korean",
                            "heat",
                            "juice",
                            "lamp",
                            "lights",
                            "music",
                            "newspaper",
                            "none",
                            "shoes",
                            "socks",
                            "volume",
                        ]
                    ),
                    "location": datasets.ClassLabel(names=["bedroom", "kitchen", "none", "washroom"]),
                }
            ),
            supervised_keys=None,
            url="https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/",
            data_url="http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
        ),
        # Speaker identification: no data_url; the data is read from the manual directory.
        SuperbConfig(
            name="si",
            description=textwrap.dedent(
                """\
            Speaker Identification (SI) classifies each utterance for its speaker identity as a multi-class
            classification, where speakers are in the same predefined set for both training and testing.
            The widely used VoxCeleb1 dataset is adopted, and the evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    # VoxCeleb1 contains 1251 speaker IDs in range ["id10001",..."id11251"]
                    "label": datasets.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html",
        ),
        # Speaker diarization: data_url has {split} and {filename} placeholders.
        SuperbConfig(
            name="sd",
            description=textwrap.dedent(
                """\
            Speaker Diarization (SD) predicts `who is speaking when` for each timestamp, and multiple speakers can
            speak simultaneously. The model has to encode rich speaker characteristics for each frame and should be
            able to represent mixtures of signals.
            [LibriMix] is adopted where LibriSpeech train-clean-100/dev-clean/test-clean are used to generate mixtures
            for training/validation/testing. We focus on the two-speaker scenario as the first step. The time-coded
            speaker labels were generated using alignments from Kaldi LibriSpeech ASR model. The evaluation metric is
            diarization error rate (DER)."""
            ),
            features=datasets.Features(
                {
                    "record_id": datasets.Value("string"),
                    "file": datasets.Value("string"),
                    "start": datasets.Value("int64"),
                    "end": datasets.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": datasets.Value("string"),
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                        }
                    ],
                }
            ),  # TODO
            supervised_keys=None,  # TODO
            url="https://github.com/ftshijt/LibriMix",
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
        # Emotion recognition: no data_url; the data is read from the manual directory.
        SuperbConfig(
            name="er",
            description=textwrap.dedent(
                """\
            Emotion Recognition (ER) predicts an emotion class for each utterance. The most widely used ER dataset
            IEMOCAP is adopted, and we follow the conventional evaluation protocol: we drop the unbalanced emotion
            classes to leave the final four classes with a similar amount of data points and cross-validate on five
            folds of the standard splits.
            The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "label": datasets.ClassLabel(names=["neu", "hap", "ang", "sad"]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://sail.usc.edu/iemocap/",
        ),
    ]

    @property
    def manual_download_instructions(self):
        """Return download instructions for the "si" and "er" configs, or None for any other config."""
        if self.config.name == "si":
            return textwrap.dedent(
                """
            Please download the VoxCeleb dataset using the following script,
            which should create `VoxCeleb1/wav/id*` directories for both train and test speakers`:
            ```
            mkdir VoxCeleb1
            cd VoxCeleb1
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad
            cat vox1_dev* > vox1_dev_wav.zip
            unzip vox1_dev_wav.zip
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip
            unzip vox1_test_wav.zip
            # download the official SUPERB train-dev-test split
            wget https://raw.githubusercontent.com/s3prl/s3prl/master/s3prl/downstream/voxceleb1/veri_test_class.txt
            ```"""
            )
        elif self.config.name == "er":
            return textwrap.dedent(
                """
            Please download the IEMOCAP dataset after submitting the request form here:
            https://sail.usc.edu/iemocap/iemocap_release.htm
            Having downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`
            which should create a folder called `IEMOCAP_full_release`
            """
            )
        return None

    def _info(self):
        """Return the DatasetInfo assembled from the active config's features, keys, url and templates."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=self.config.features,
            supervised_keys=self.config.supervised_keys,
            homepage=self.config.url,
            citation=_CITATION,
            task_templates=self.config.task_templates,
        )

    def _split_generators(self, dl_manager):
        """Locate or download the data for the active config and describe its splits.

        Args:
            dl_manager: Download manager; ``download_and_extract`` is used for
                "asr", "ks", "ic" and "sd", ``manual_dir`` for "si" and "er".

        Returns:
            A list of ``datasets.SplitGenerator``; ``None`` for an unrecognised
            config name.
        """
        if self.config.name == "asr":
            _DL_URLS = {
                "dev": self.config.data_url + "dev-clean.tar.gz",
                "test": self.config.data_url + "test-clean.tar.gz",
                "train": self.config.data_url + "train-clean-100.tar.gz",
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"archive_path": archive_path["train"]}),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}),
            ]
        elif self.config.name == "ks":
            _DL_URLS = {
                "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"),
                "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"),
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            # Train and validation are both carved out of the same archive;
            # the test split has its own archive.
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "val"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"], "split": "test"}
                ),
            ]
        elif self.config.name == "ic":
            archive_path = dl_manager.download_and_extract(self.config.data_url)
            # All three splits live in one archive; "split" selects the CSV to read.
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path, "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path, "split": "valid"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path, "split": "test"}
                ),
            ]
        elif self.config.name == "si":
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            # "split" is the integer id (1/2/3) matched against the first field
            # of each line in veri_test_class.txt.
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": manual_dir, "split": 1},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": manual_dir, "split": 2},
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": manual_dir, "split": 3}),
            ]
        elif self.config.name == "sd":
            splits = ["train", "dev", "test"]
            # One nested mapping per split: filename -> URL.
            _DL_URLS = {
                split: {
                    filename: self.config.data_url.format(split=split, filename=filename)
                    for filename in ["reco2dur", "segments", "utt2spk", "wav.zip"]
                }
                for split in splits
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(
                    name=datasets.NamedSplit(split), gen_kwargs={"archive_path": archive_path[split], "split": split}
                )
                for split in splits
            ]
        elif self.config.name == "er":
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            # One split per session directory, "session1" .. "session5".
            return [
                datasets.SplitGenerator(
                    name=f"session{i}",
                    gen_kwargs={"archive_path": manual_dir, "split": i},
                )
                for i in range(1, 6)
            ]

    def _generate_examples(self, archive_path, split=None):
        """Generate examples.

        Args:
            archive_path: Data location received through ``gen_kwargs``.
            split: Split identifier received through ``gen_kwargs``; not used for "asr".

        Yields:
            ``(key, example)`` pairs for the active config. Nothing is yielded
            for an unrecognised config name.
        """
        if self.config.name == "asr":
            yield from self._generate_asr_examples(archive_path)
        elif self.config.name == "ks":
            yield from self._generate_ks_examples(archive_path, split)
        elif self.config.name == "ic":
            yield from self._generate_ic_examples(archive_path, split)
        elif self.config.name == "si":
            yield from self._generate_si_examples(archive_path, split)
        elif self.config.name == "sd":
            yield from self._generate_sd_examples(archive_path, split)
        elif self.config.name == "er":
            yield from self._generate_er_examples(archive_path, split)

    def _generate_asr_examples(self, archive_path):
        """Yield one example per transcript line found under ``<archive_path>/LibriSpeech``."""
        transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
        key = 0
        # Sorted so that keys are assigned in a stable order across runs.
        for transcript_path in sorted(glob.glob(transcripts_glob)):
            transcript_dir_path = os.path.dirname(transcript_path)
            with open(transcript_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Each line is "<utterance id> <transcript>".
                    id_, transcript = line.strip().split(" ", 1)
                    # The first two dash-separated fields of the id are the speaker and chapter.
                    speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                    yield key, {
                        "id": id_,
                        "speaker_id": speaker_id,
                        "chapter_id": chapter_id,
                        "file": os.path.join(transcript_dir_path, f"{id_}.flac"),
                        "text": transcript,
                    }
                    key += 1

    def _generate_ks_examples(self, archive_path, split):
        """Yield keyword-spotting examples, labelling each file by the name of its parent directory."""
        keywords = {"yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"}
        splits = _split_ks_files(archive_path, split)
        for key, audio_file in enumerate(sorted(splits[split])):
            word = os.path.basename(os.path.dirname(audio_file))
            if word in keywords:
                label = word
            elif word in ("_silence_", "_background_noise_"):
                # Background-noise recordings are folded into the silence class.
                label = "_silence_"
            else:
                label = "_unknown_"
            yield key, {"file": audio_file, "label": label}

    def _generate_ic_examples(self, archive_path, split):
        """Yield intent-classification examples from ``data/<split>_data.csv``, keyed by its first column."""
        root_path = os.path.join(archive_path, "fluent_speech_commands_dataset/")
        csv_path = os.path.join(root_path, f"data/{split}_data.csv")
        with open(csv_path, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True)
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                key, file_path, speaker_id, text, action, object_, location = row
                yield key, {
                    "file": os.path.join(root_path, file_path),
                    "speaker_id": speaker_id,
                    "text": text,
                    "action": action,
                    "object": object_,
                    "location": location,
                }

    def _generate_si_examples(self, archive_path, split):
        """Yield speaker-identification examples whose split id in ``veri_test_class.txt`` equals ``split``."""
        wav_path = os.path.join(archive_path, "wav/")
        splits_path = os.path.join(archive_path, "veri_test_class.txt")
        with open(splits_path, "r", encoding="utf-8") as f:
            # The key is the line number in the split file, so keys are unique
            # but not consecutive within one split.
            for key, line in enumerate(f):
                split_id, file_path = line.strip().split(" ")
                if int(split_id) != split:
                    continue
                # The leading path component is used as the speaker label.
                speaker_id = file_path.split("/")[0]
                yield key, {
                    "file": os.path.join(wav_path, file_path),
                    "label": speaker_id,
                }

    def _generate_sd_examples(self, archive_path, split):
        """Yield diarization examples, one per ``(record, start, end)`` chunk."""
        data = SdData(archive_path)
        args = SdArgs()
        chunk_indices = _generate_chunk_indices(data, args, split=split)
        if split != "test":
            chunks = chunk_indices
        else:
            # For the test split, chunk_indices is looked up by key and each entry
            # holds a list of chunks; flatten it so keys stay consecutive.
            chunks = (chunk for group in chunk_indices for chunk in chunk_indices[group])
        for key, (rec, st, ed) in enumerate(chunks):
            speakers = _get_speakers(rec, data, args)
            yield key, {
                "record_id": rec,
                "file": data.wavs[rec],
                "start": st,
                "end": ed,
                "speakers": speakers,
            }

    def _generate_er_examples(self, archive_path, split):
        """Yield emotion-recognition examples parsed from the label files of ``Session<split>``."""
        root_path = os.path.join(archive_path, f"Session{split}/")
        wav_path = os.path.join(root_path, "sentences/wav/")
        labels_path = os.path.join(root_path, "dialog/EmoEvaluation/*.txt")
        emotions = ["neu", "hap", "ang", "sad", "exc"]
        key = 0
        for labels_file in sorted(glob.glob(labels_path)):
            with open(labels_file, "r", encoding="utf-8") as f:
                for line in f:
                    # Only lines starting with "[" are parsed; everything else is skipped.
                    if line[0] != "[":
                        continue
                    _, filename, emo, _ = line.split("\t")
                    if emo not in emotions:
                        continue
                    # The wav sub-directory is the file name minus its last "_"-separated field.
                    wav_subdir = filename.rsplit("_", 1)[0]
                    filename = f"{filename}.wav"
                    yield key, {
                        "file": os.path.join(wav_path, wav_subdir, filename),
                        # "exc" is merged into "hap", leaving the four declared classes.
                        "label": emo.replace("exc", "hap"),
                    }
                    key += 1
class Superb(ds.GeneratorBasedBuilder):
    """Dataset builder exposing one ``SuperbConfig`` per task configuration.

    The configurations are named ``asr``, ``ks``, ``ic``, ``si``, ``sd`` and
    ``er``.  Every configuration declares a ``file`` string column and an
    ``audio`` column decoded at 16 kHz; the remaining columns and the
    supervised keys differ per configuration.
    """

    BUILDER_CONFIGS = [
        # "asr": transcription text plus integer speaker/chapter ids.
        SuperbConfig(
            name="asr",
            data_url="http://www.openslr.org/resources/12/",
            supervised_keys=("file", "text"),
            features=ds.Features(
                {
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "text": ds.Value("string"),
                    "speaker_id": ds.Value("int64"),
                    "chapter_id": ds.Value("int64"),
                    "id": ds.Value("string"),
                }
            ),
            # NOTE(review): other builders in this file pass
            # `audio_column="audio"` to this template; confirm which keyword
            # the pinned `datasets` version expects.
            task_templates=[
                AutomaticSpeechRecognition(
                    audio_file_path_column="file",
                    transcription_column="text",
                ),
            ],
        ),
        # "ks": single class label drawn from ten command words plus the
        # "_silence_" and "_unknown_" classes.
        SuperbConfig(
            name="ks",
            data_url="http://download.tensorflow.org/data/{filename}",
            supervised_keys=("file", "label"),
            features=ds.Features(
                {
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "label": ds.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
        ),
        # "ic": three independent class labels (action, object, location);
        # no supervised key pair is declared.
        SuperbConfig(
            name="ic",
            data_url="http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
            supervised_keys=None,
            features=ds.Features(
                {
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "speaker_id": ds.Value("string"),
                    "text": ds.Value("string"),
                    "action": ds.ClassLabel(
                        names=[
                            "activate",
                            "bring",
                            "change language",
                            "deactivate",
                            "decrease",
                            "increase",
                        ]
                    ),
                    "object": ds.ClassLabel(
                        names=[
                            "Chinese",
                            "English",
                            "German",
                            "Korean",
                            "heat",
                            "juice",
                            "lamp",
                            "lights",
                            "music",
                            "newspaper",
                            "none",
                            "shoes",
                            "socks",
                            "volume",
                        ]
                    ),
                    "location": ds.ClassLabel(
                        names=["bedroom", "kitchen", "none", "washroom"]
                    ),
                }
            ),
        ),
        # "si": 1251 class labels, "id10001" through "id11251".
        SuperbConfig(
            name="si",
            supervised_keys=("file", "label"),
            features=ds.Features(
                {
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "label": ds.ClassLabel(
                        names=[f"id{number}" for number in range(10001, 11252)]
                    ),
                }
            ),
        ),
        # "sd": a record chunk bounded by start/end, with a list of
        # per-speaker start/end entries; no supervised key pair is declared.
        SuperbConfig(
            name="sd",
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
            supervised_keys=None,
            features=ds.Features(
                {
                    "record_id": ds.Value("string"),
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "start": ds.Value("int64"),
                    "end": ds.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": ds.Value("string"),
                            "start": ds.Value("int64"),
                            "end": ds.Value("int64"),
                        }
                    ],
                }
            ),
        ),
        # "er": single class label with four classes.
        SuperbConfig(
            name="er",
            supervised_keys=("file", "label"),
            features=ds.Features(
                {
                    "file": ds.Value("string"),
                    "audio": ds.Audio(sampling_rate=16_000),
                    "label": ds.ClassLabel(names=["neu", "hap", "ang", "sad"]),
                }
            ),
        ),
    ]