def prepare_musan(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ('music', 'speech', 'noise'),
    use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = {}
    if 'music' in parts:
        manifests['music'] = prepare_music(corpus_dir, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**manifests['music'])
    if 'speech' in parts:
        manifests['speech'] = {'recordings': scan_recordings(corpus_dir / 'speech')}
        validate(manifests['speech']['recordings'])
    if 'noise' in parts:
        manifests['noise'] = {'recordings': scan_recordings(corpus_dir / 'noise')}
        validate(manifests['noise']['recordings'])

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in manifests:
            for key, manifest in manifests[part].items():
                manifest.to_json(output_dir / f'{key}_{part}.json')

    return manifests
def prepare_musan(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("music", "speech", "noise"),
    use_vocals: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if not parts:
        raise ValueError("No MUSAN parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = {}
    if "music" in parts:
        manifests["music"] = prepare_music(corpus_dir, use_vocals=use_vocals)
        validate_recordings_and_supervisions(**manifests["music"])
    if "speech" in parts:
        manifests["speech"] = {"recordings": scan_recordings(corpus_dir / "speech")}
        validate(manifests["speech"]["recordings"])
    if "noise" in parts:
        manifests["noise"] = {"recordings": scan_recordings(corpus_dir / "noise")}
        validate(manifests["noise"]["recordings"])

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in manifests:
            for key, manifest in manifests[part].items():
                manifest.to_file(output_dir / f"musan_{key}_{part}.jsonl.gz")

    return manifests
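# A minimal usage sketch (paths are hypothetical): prepare only the MUSAN music and
# noise parts, skipping music recordings with vocals, then report manifest sizes.
if __name__ == "__main__":
    manifests = prepare_musan(
        corpus_dir="/data/musan",      # assumed location of the unpacked corpus
        output_dir="data/manifests",
        parts=("music", "noise"),
        use_vocals=False,
    )
    for part, dsets in manifests.items():
        print(part, len(dsets["recordings"]))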
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph")
        )
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM..."
        )
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {"recordings": recordings, "supervisions": supervisions}
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")
    return corpus
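# A minimal usage sketch (hypothetical path): prepare all three TED-LIUM splits and
# inspect the dev split; each split holds a 'recordings' and a 'supervisions' manifest.
if __name__ == "__main__":
    tedlium = prepare_tedlium("/data/TEDLIUM_release-3", output_dir="data/manifests")
    dev = tedlium["dev"]
    print(len(dev["recordings"]), len(dev["supervisions"]))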
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests, which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    # I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_nsc(
    corpus_dir: Pathlike,
    dataset_part: str = 'PART3_SameCloseMic',
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path to the raw corpus distribution.
    :param dataset_part: str, name of the dataset part to be prepared.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys 'recordings' and 'supervisions' for the requested dataset part.
    """
    check_dependencies()
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if dataset_part == 'PART3_SameCloseMic':
        manifests = prepare_same_close_mic(corpus_dir / 'PART3')
    elif dataset_part == 'PART3_SeparateIVR':
        manifests = prepare_separate_phone_mic(corpus_dir / 'PART3')
    else:
        raise ValueError(f"Unknown dataset part: {dataset_part}")
    validate_recordings_and_supervisions(**manifests)
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests['supervisions'].to_json(output_dir / f'supervisions_{dataset_part}.json')
        manifests['recordings'].to_json(output_dir / f'recordings_{dataset_part}.json')
    return manifests
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={"text_punct": meta["text_normalized"], "split": clean_or_other},
            )
        )
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), \
            f'Mismatch: found {len(recordings)} ' \
            f'sphere files and {len(stms)} STM files. ' \
            f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')
    return corpus
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(
            Recording.from_sphere(p) for p in audio_dir.glob('*.sph')
        )
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{save_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{save_split}.json')
    return manifests
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Prepares part of CommonVoice data from a single TSV file.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first."
        )
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)
    tsv_path = lang_path / f"{part}.tsv"

    # Read the metadata
    df = pd.read_csv(tsv_path, sep="\t")
    # Scan all the audio files
    with RecordingSet.open_writer(
        output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz",
        overwrite=False,
    ) as recs_writer, SupervisionSet.open_writer(
        output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz",
        overwrite=False,
    ) as sups_writer:
        for idx, row in tqdm(
            df.iterrows(),
            desc="Processing audio files",
            total=len(df),
        ):
            try:
                result = parse_utterance(row, lang_path, lang)
                if result is None:
                    continue
                recording, segment = result
                validate_recordings_and_supervisions(recording, segment)
                recs_writer.write(recording)
                sups_writer.write(segment)
            except Exception as e:
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}"
                )
                continue
    recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
    supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
    return recordings, supervisions
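# A minimal usage sketch (hypothetical paths): the returned manifests are lazy, so a
# large CommonVoice split can be iterated without loading it fully into memory.
if __name__ == "__main__":
    recordings, supervisions = prepare_single_commonvoice_tsv(
        lang="pl",
        part="dev",
        output_dir="data/manifests",
        lang_path="/data/cv-corpus-7.0-2021-07-21/pl",  # assumed corpus layout
    )
    for sup in supervisions:
        print(sup.id, sup.text)
        break  # just peek at the first entry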
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segments supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        supervisions_list = [
            make_supervisions(p, r) for p, r in zip(sgml_paths, recordings)
        ]

    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["sections"] for sups in supervisions_list)
    )
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["segments"] for sups in supervisions_list)
    )

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
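# A minimal usage sketch (hypothetical LDC package paths): this recipe returns three
# manifests; 'segments' suits ASR training, while 'sections' can drive topic segmentation.
if __name__ == "__main__":
    manifests = prepare_broadcast_news(
        audio_dir="/data/LDC98S71",
        transcripts_dir="/data/LDC98T28",
        output_dir="data/manifests",
    )
    print(len(manifests["sections"]), len(manifests["segments"]))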
def validate_(recordings: Pathlike, supervisions: Pathlike, read_data: bool):
    """
    Validate a pair of Lhotse RECORDINGS and SUPERVISIONS manifest files.
    Checks whether the two manifests are consistent with each other.
    """
    from lhotse import load_manifest, validate_recordings_and_supervisions

    recs = load_manifest(recordings)
    sups = load_manifest(supervisions)
    validate_recordings_and_supervisions(
        recordings=recs, supervisions=sups, read_data=read_data
    )
def prepare_ljspeech(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with the RecordingSet and SupervisionSet under the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
def prepare_yesno(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60
    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]
    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
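# A minimal usage sketch (hypothetical path): the yes/no corpus is split
# deterministically into 30 train and 30 test recordings by alternating over
# the sorted file list, so repeated runs yield identical splits.
if __name__ == "__main__":
    manifests = prepare_yesno("/data/waves_yesno", output_dir="data/manifests")
    assert len(manifests["train"]["recordings"]) == 30
    assert len(manifests["test"]["recordings"]) == 30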
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice.
    audio_paths = defaultdict(
        Path,
        {
            p.stem: p
            for p in chain.from_iterable(
                [
                    check_and_rglob(dir, ext, strict=False)
                    for dir in audio_dirs
                    for ext in ["*.wav", "*.flac"]
                ]
            )
        },
    )
    transcript_paths = chain.from_iterable(
        [check_and_rglob(dir, "*.tdf") for dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    TEST = [
        line.decode("utf-8").strip()
        for url in TEST_FILE_URLS
        for line in urlopen(url)
    ]

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
def prepare_aishell(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for audio_path in wav_path.rglob('**/*.wav'):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language='Chinese',
                speaker=speaker,
                text=text.strip()
            )
            supervisions.append(segment)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')
        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }
    return manifests
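# A minimal usage sketch (hypothetical path): transcripts are matched to audio by
# utterance ID, and wave files without a transcript are skipped with a warning.
if __name__ == "__main__":
    manifests = prepare_aishell("/data/aishell", output_dir="data/manifests")
    print(len(manifests["train"]["supervisions"]))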
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")
        )
    elif dataset_parts == "auto":
        dataset_parts = (
            set(LIBRISPEECH)
            .union(MINI_LIBRISPEECH)
            .intersection(path.name for path in corpus_dir.glob("*"))
        )
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(
                part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
            ):
                alignments = {}
                ali_path = trans_path.parent / (
                    trans_path.stem.split(".")[0] + ".alignment.txt"
                )
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line, alignments)
                        )

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
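# A minimal usage sketch (hypothetical path): prepare two LibriSpeech subsets with
# parallel audio scanning; subsets already present in output_dir are skipped.
if __name__ == "__main__":
    manifests = prepare_librispeech(
        corpus_dir="/data/LibriSpeech",
        dataset_parts=["dev-clean", "test-clean"],
        output_dir="data/manifests",
        num_jobs=4,
    )
    print(sorted(manifests))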
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if dataset_parts == 'auto':
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob('*')))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                                   output_dir=output_dir)
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc='Dataset parts'):
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob('*.txt'),
                                   desc='Distributing tasks', leave=False):
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(ex.submit(parse_utterance, part_path, line))

            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir / f'supervisions_{part}.json')
                recording_set.to_json(output_dir / f'recordings_{part}.json')

            manifests[part] = {
                'recordings': recording_set,
                'supervisions': supervision_set
            }

    return dict(manifests)  # Convert to normal dict
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)

    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir, heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='answers',
                utterance_id=utt_id,
                transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
            # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join([uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path))
                ],
                sampling_rate=int(metadata[idx].audio_info.samplerate),
                num_samples=metadata[idx].audio_info.frames,
                duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='Spanish',
                speaker=idx.split('-')[-2],
                text=metadata[idx].text)
            for idx in audio.recordings)

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
def prepare_librimix(
    librimix_csv: Pathlike,
    output_dir: Optional[Pathlike] = None,
    with_precomputed_mixtures: bool = False,
    sampling_rate: int = 16000,
    min_segment_seconds: Seconds = 3.0,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    import pandas as pd

    assert Path(librimix_csv).is_file(), f"No such file: {librimix_csv}"
    df = pd.read_csv(librimix_csv)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    # First, create the audio manifest that specifies the pairs of source recordings
    # to be mixed together.
    audio_sources = RecordingSet.from_recordings(
        Recording(
            id=row["mixture_ID"],
            sources=[
                AudioSource(type="file", channels=[0], source=row["source_1_path"]),
                AudioSource(type="file", channels=[1], source=row["source_2_path"]),
            ],
            sampling_rate=sampling_rate,
            num_samples=int(row["length"]),
            duration=row["length"] / sampling_rate,
        )
        for idx, row in df.iterrows()
        if row["length"] / sampling_rate > min_segment_seconds
    )
    supervision_sources = make_corresponding_supervisions(audio_sources)
    validate_recordings_and_supervisions(audio_sources, supervision_sources)
    if output_dir is not None:
        audio_sources.to_file(output_dir / "librimix_recordings_sources.jsonl.gz")
        supervision_sources.to_file(
            output_dir / "librimix_supervisions_sources.jsonl.gz"
        )
    manifests["sources"] = {
        "recordings": audio_sources,
        "supervisions": supervision_sources,
    }

    # When requested, create an audio manifest for the pre-computed mixtures.
    # A different way of performing the mix would be using Lhotse's on-the-fly
    # overlaying of audio Cuts.
    if with_precomputed_mixtures:
        audio_mix = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(type="file", channels=[0], source=row["mixture_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            )
            for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds
        )
        supervision_mix = make_corresponding_supervisions(audio_mix)
        validate_recordings_and_supervisions(audio_mix, supervision_mix)
        if output_dir is not None:
            audio_mix.to_file(output_dir / "librimix_recordings_mix.jsonl.gz")
            supervision_mix.to_file(output_dir / "librimix_supervisions_mix.jsonl.gz")
        manifests["premixed"] = {
            "recordings": audio_mix,
            "supervisions": supervision_mix,
        }

    # When the LibriMix CSV specifies noises, we create a separate RecordingSet for them,
    # so that we can extract their features and overlay them as Cuts later.
    if "noise_path" in df:
        audio_noise = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(type="file", channels=[0], source=row["noise_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            )
            for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds
        )
        supervision_noise = make_corresponding_supervisions(audio_noise)
        validate_recordings_and_supervisions(audio_noise, supervision_noise)
        if output_dir is not None:
            audio_noise.to_file(output_dir / "librimix_recordings_noise.jsonl.gz")
            supervision_noise.to_file(
                output_dir / "librimix_supervisions_noise.jsonl.gz"
            )
        manifests["noise"] = {
            "recordings": audio_noise,
            "supervisions": supervision_noise,
        }

    return manifests
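# A minimal usage sketch (hypothetical CSV path): prepare the source-pair manifests
# plus pre-mixed audio; the 'noise' entry appears only when the CSV has a noise_path
# column. Mixing can also be done on the fly with Lhotse's cut overlaying instead.
if __name__ == "__main__":
    manifests = prepare_librimix(
        librimix_csv="/data/LibriMix/metadata/mixture_train-100_mix_both.csv",
        output_dir="data/manifests",
        with_precomputed_mixtures=True,
    )
    print(sorted(manifests))  # e.g. ['noise', 'premixed', 'sources']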
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Pathlike = None,
    type: str = "mdm",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes.
    :return: a dict with keys 'recordings' and 'supervisions'.
    """
    assert type in ["mdm", "ihm-mix", "ihm"]

    manifests = {}

    corpus_dir = Path(corpus_dir)
    corpus_dir = (
        corpus_dir / "for_release" if corpus_dir.stem != "for_release" else corpus_dir
    )

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            audio_path = (
                session / "clean" / "mix.wav"
                if type == "ihm-mix"
                else session / "clean" / "each_spk.wav"
                if type == "ihm"
                else session / "record" / "raw_recording.wav"
            )
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id)
            )
            for idx, seg in enumerate(
                parse_transcript(session / "transcription" / "meeting_info.txt")
            ):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm"
                        else 0,
                    )
                )

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
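# A minimal usage sketch (hypothetical path): prepare LibriCSS in the far-field
# ('mdm') setting; pass type='ihm' to get per-speaker close-talk channels instead.
if __name__ == "__main__":
    manifests = prepare_libricss("/data/libricss", output_dir="data/manifests", type="mdm")
    rec = next(iter(manifests["recordings"]))
    print(rec.id, rec.num_channels)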
def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the DIHARD III corpus.
    We create two manifests: one with recordings, and the other one with supervisions
    containing speaker id and timestamps.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12),
        e.g. /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02),
        e.g. /data/corpora/LDC/LDC2021E02
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM segments
        (see use in dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory for recordings
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = dev_audio_dir if part == "dev" else eval_audio_dir
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue
        rttm_paths = list(check_and_rglob(audio_dir, "*.rttm"))
        uem_paths = list(check_and_rglob(audio_dir, "*.uem"))

        recordings = RecordingSet.from_dir(audio_dir, "*.flac", num_jobs=num_jobs)

        # Read metadata for recordings
        metadata = parse_metadata(
            list(check_and_rglob(audio_dir, "recordings.tbl"))[0]
        )

        supervisions = SupervisionSet.from_segments(
            chain.from_iterable(
                make_rttm_segments(
                    rttm_path=[x for x in rttm_paths if x.stem == recording.id][0],
                    recording=recording,
                    metadata=metadata[recording.id],
                )
                for recording in recordings
            )
        )
        if uem_manifest:
            uem = SupervisionSet.from_segments(
                chain.from_iterable(
                    make_uem_segments(
                        uem_path=[x for x in uem_paths if x.stem == recording.id][0],
                        recording=recording,
                    )
                    for recording in recordings
                )
            )

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")
        manifests[part] = {"recordings": recordings, "supervisions": supervisions}
        if uem_manifest:
            manifests[part].update({"uem": uem})
    return manifests
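# A minimal usage sketch (hypothetical LDC paths): prepare both DIHARD III parts and
# keep the UEM scoring regions alongside the speaker supervisions.
if __name__ == "__main__":
    manifests = prepare_dihard3(
        dev_audio_dir="/data/corpora/LDC/LDC2020E12",
        eval_audio_dir="/data/corpora/LDC/LDC2021E02",
        output_dir="data/manifests",
        uem_manifest=True,
    )
    print(sorted(manifests["dev"]))  # ['recordings', 'supervisions', 'uem']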
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
          if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
          (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.
    :param no_eval_ok: When set to True, this function won't emit a warning that the
        eval set was not found.
    :return:
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path."
        )
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph")
        )
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav")
        )
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l
                for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                2, zip(lines[::2], lines[1::2])
            ):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        )
                    )
                except Exception as e:
                    logging.warning(f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    )
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(
                output_dir / f"supervisions_{language}_{save_split}.json"
            )

    return dict(manifests)
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken.
    It contains noise tags and phone transcription in case the pronunciation
    differed from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir
    corpus_dir = corpus_dir.parent if corpus_dir.stem == "cmu_kids" else corpus_dir

    recordings = []
    supervisions = []

    # Get transcripts for all utterances
    utterances = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'sentence.tbl', 'r') as f:
        for line in f:
            utt, count, text = line.strip().split('\t')
            utterances[utt] = text

    # Get speaker metadata
    speaker_info = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'speaker.tbl', 'r') as f:
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT     BIN2
            # fabm  SUM95   3/9     100     62
            # facs  SUM95   2/8     90      55
            spk, pop, gr_age, _, _ = line.strip().split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    # Iterate through all transcriptions and add to supervisions
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'transcrp.tbl', 'r') as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (
                corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' / f'{trn_id}.sph'
            )
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3
            )
            recordings.append(recording)

            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': bin,
                        'spoken_transcript': transcript,
                    },
                )
            )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
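# A minimal usage sketch (hypothetical corpus path): the 'custom' field keeps the
# speaker's grade/age and the spoken (possibly noisy) transcript next to the prompt text.
if __name__ == "__main__":
    manifests = prepare_cmu_kids("/data/cmu_kids", output_dir="data/manifests")
    sup = next(iter(manifests["supervisions"]))
    print(sup.text, sup.custom["spoken_transcript"])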
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()
        is_suitcase_corpus = 'suitcase_corpus' in path.parts
        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem
        seg_id = (
            f'suitcase_corpus-{speaker}'
            if is_suitcase_corpus
            else f'{speaker}-{path.stem}'
        )
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings': recordings.filter(
                lambda r: 'suitcase_corpus' not in r.id),
            'supervisions': supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings': recordings.filter(
                lambda r: 'suitcase_corpus' in r.id),
            'supervisions': supervisions.filter(
                lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir / f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir / f'supervisions-{key}.json')

    return splits
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    global_spk_id = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []
        wav_path = corpus_dir / part / "wav"
        for audio_path in wav_path.rglob("*.flac"):
            idx = audio_path.stem
            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            recording = Recording.from_file(audio_path)
            recordings.append(recording)

            for tier in tg.tiers:
                local_spk_id = tier.name
                key = (idx, local_spk_id)
                if key not in global_spk_id:
                    global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}"
                spk_id = global_spk_id[key]
                for j, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part}.jsonl")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
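# Usage sketch under the assumed corpus layout (train_L/train_M/train_S/test
# subdirectories, each containing wav/ and TextGrid/); paths are hypothetical.
def _example_aishell4_usage():
    manifests = prepare_aishell4(
        corpus_dir="/data/corpora/aishell4",  # hypothetical path
        output_dir="data/manifests/aishell4",
    )
    for part, m in manifests.items():
        print(part, len(m["recordings"]), len(m["supervisions"]))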
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare, "all" for all parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers used to extract the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Accept both a single part name and a sequence of part names; otherwise a
    # string such as "DEV" would be iterated character by character below.
    if isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]
    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file: {raw_manifests_path}"
    logging.info(f"Loading raw manifests from: {raw_manifests_path}")
    with open(raw_manifests_path, "r", encoding="utf8") as f:
        raw_manifests = json.load(f)

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(
            recordings=recordings, supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir / f"supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
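# Usage sketch (hypothetical paths; the part names passed here are assumed to
# be members of WETNET_SPEECH_PARTS). Parsing the large WenetSpeech.json file
# benefits noticeably from multiple workers.
def _example_wenet_speech_usage():
    manifests = prepare_wenet_speech(
        corpus_dir="/data/corpora/WenetSpeech",  # hypothetical path
        dataset_parts=["S", "DEV"],  # assumed part names
        output_dir="data/manifests/wenet_speech",
        num_jobs=8,
    )
    print(len(manifests["S"]["recordings"]))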
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset
        part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Maybe the manifests already exist: we can read them and save a bit of
    # preparation time.
    maybe_manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=output_dir, prefix='libritts')
    if maybe_manifests is not None:
        return maybe_manifests

    # Contents of the SPEAKERS.txt file:
    # ;ID  |SEX| SUBSET          |MINUTES| NAME
    # 14   | F | train-clean-360 | 25.03 | ...
    # 16   | F | train-clean-360 | 25.11 | ...
    # 17   | M | train-clean-360 | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split('|')
            for line in (corpus_dir / 'SPEAKERS.txt').read_text().splitlines()
            if not line.startswith(';')
        )
    }

    manifests = defaultdict(dict)
    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, '*.wav', num_jobs=num_jobs)

        supervisions = []
        for trans_path in tqdm(
                part_path.rglob('*.trans.tsv'),
                desc='Scanning transcript files (progbar per speaker)',
                leave=False):
            # The trans.tsv files contain only the recordings that were kept
            # for LibriTTS. Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian. Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being
            #   surprised in such a paroxysm of grief. Villefort rose, half
            #   ashamed of being surprised in such a paroxysm of grief.
            #
            # The corresponding book.tsv file contains additional metadata,
            # from which we read the SNR of each utterance.
            utt2snr = {
                rec_id: float(snr)
                for rec_id, *_, snr in map(
                    str.split,
                    (
                        trans_path.parent
                        / trans_path.name.replace('.trans.tsv', '.book.tsv')
                    ).read_text().splitlines(),
                )
            }
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split('\t')
                spk_id = rec_id.split('_')[0]
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language='English',
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom={
                            'orig_text': orig_text,
                            'snr': utt2snr[rec_id],
                        },
                    )
                )

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir / f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions,
        }

    # Convert to a plain dict before returning.
    return dict(manifests)
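# Usage sketch (hypothetical paths). A single part name also works, since a
# string argument is wrapped into a list internally; on a second run the
# cached manifests in output_dir are read back instead of being rebuilt.
def _example_libritts_usage():
    manifests = prepare_libritts(
        corpus_dir="/export/corpora5/LibriTTS",  # hypothetical path
        dataset_parts="dev-clean",
        output_dir="data/manifests/libritts",
        num_jobs=4,
    )
    sup = next(iter(manifests["dev-clean"]["supervisions"]))
    print(sup.custom["snr"], sup.custom["orig_text"])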
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            age = None
        for line in lines:
            # Get rid of parentheses and whitespace on the edges.
            line = line[2:-2]
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"

            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
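# Usage sketch (hypothetical path). English ARCTIC prompts read by Indic
# speakers carry an 'accent' tag in `custom`, while the remaining utterances
# keep the speaker's native language.
def _example_cmu_indic_usage():
    manifests = prepare_cmu_indic(
        corpus_dir="/data/corpora/cmu_indic",  # hypothetical path
        output_dir="data/manifests/cmu_indic",
    )
    for sup in manifests["supervisions"]:
        if sup.custom and "accent" in sup.custom:
            print(sup.id, sup.language, sup.custom["accent"])
            break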
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1
    and 4) for scripted utterances (see
    https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt or the
    top documentation in this script for more information).

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources
        (default = True).
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous
        speech transcripts (default = True).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, "r") as f:
            for line in f:
                path, label = line.strip().split()
                utt = Path(path).stem
                verification[utt] = int(label)

    # Read prompted transcriptions
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):
        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        # Renamed from `type` to avoid shadowing the built-in.
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3
        )
        recordings.append(recording)

        if utt_type == "scripted":
            text = prompts[prompt]
            verification_label = verification.get(uttid)
            custom = {"type": utt_type, "verification_label": verification_label}
        elif utt_type == "spontaneous":
            text = read_text(
                corpus_dir / "trans" / utt_type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": utt_type}

        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / "recordings.json")
        manifests["supervisions"].to_json(output_dir / "supervisions.json")

    return manifests
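# Usage sketch (hypothetical path for the LDC2007S18 release). Scripted
# utterances carry a verification label in `custom`; spontaneous ones do not.
def _example_cslu_kids_usage():
    manifests = prepare_cslu_kids(
        corpus_dir="/data/corpora/LDC2007S18",  # hypothetical path
        output_dir="data/manifests/cslu_kids",
        normalize_text=True,
    )
    sup = next(iter(manifests["supervisions"]))
    print(sup.text, sup.custom)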