def fix_(recordings: Pathlike, supervisions: Pathlike, output_dir: Pathlike):
    """
    Fix a pair of Lhotse RECORDINGS and SUPERVISIONS manifests.
    It removes supervisions without corresponding recordings and vice versa,
    trims supervisions that extend beyond the duration of their recording, etc.
    Stores the output files in OUTPUT_DIR under the same names as the input files.
    """
    from lhotse import RecordingSet, SupervisionSet, fix_manifests

    output_dir = Path(output_dir)
    recordings = Path(recordings)
    supervisions = Path(supervisions)
    output_dir.mkdir(parents=True, exist_ok=True)
    recs = RecordingSet.from_file(recordings)
    sups = SupervisionSet.from_file(supervisions)
    recs, sups = fix_manifests(recordings=recs, supervisions=sups)
    recs.to_file(output_dir / recordings.name)
    sups.to_file(output_dir / supervisions.name)
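
# Example usage (a minimal sketch; the manifest paths below are hypothetical):
#
#   fix_(
#       recordings="data/manifests/recordings_train.jsonl.gz",
#       supervisions="data/manifests/supervisions_train.jsonl.gz",
#       output_dir="data/manifests_fixed",
#   )
#
# The fixed manifests land in data/manifests_fixed/ under the same file names
# as the inputs.
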
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" prepares every part.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of workers used to extract the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Normalize a single part passed as a plain string, e.g. "DEV";
    # otherwise iterating over it below would yield single characters.
    if isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]
    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech: {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file: {raw_manifests_path}"
    logging.info(f"Loading raw manifests from: {raw_manifests_path}")
    with open(raw_manifests_path, "r", encoding="utf8") as f:
        raw_manifests = json.load(f)

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
            ex.map(
                parse_utterance,
                raw_manifests["audios"],
                repeat(corpus_dir),
                repeat(subsets),
            ),
            desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(
            recordings=recordings, supervisions=supervisions
        )
        if output_dir is not None:
            supervisions.to_file(output_dir / f"supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"recordings_{sub}.jsonl.gz")
        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
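
# Example usage (a sketch; assumes the WenetSpeech corpus, including its
# WenetSpeech.json metadata file, has been downloaded to the hypothetical
# path below, and that "S" and "DEV" are valid entries of WETNET_SPEECH_PARTS):
#
#   manifests = prepare_wenet_speech(
#       corpus_dir="/data/WenetSpeech",
#       dataset_parts=["S", "DEV"],
#       output_dir="data/manifests",
#       num_jobs=4,
#   )
#   small_recs = manifests["S"]["recordings"]
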
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the
        near-field or far-field data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first."
        )
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []
        # Eval and Test may further be inside another folder (since the "far"
        # and "near" data are grouped together). Use a per-part directory
        # instead of overwriting ``corpus_dir``, which would leak into the
        # next iteration of the loop.
        part_dir = corpus_dir
        if part in ("Eval", "Test"):
            part_dir = (
                corpus_dir / f"{part}_Ali"
                if (corpus_dir / f"{part}_Ali").is_dir()
                else corpus_dir
            )

        wav_paths = part_dir / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = part_dir / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For the 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        # Speaker ID information is present in the file name itself.
        # For the 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        # Speaker ID information is present inside the TextGrid file.
        for text_path in tqdm(
            list(text_paths.rglob("*.TextGrid")), desc=f"Preparing {part}"
        ):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]
            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    parts = tier.name.split("_")
                    if len(parts) == 4:
                        _, _, gender, spk_id = parts
                    elif len(parts) == 2:
                        gender, spk_id = parts
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix and validate the manifests
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part.lower()}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part.lower()}.jsonl")

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
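
# Example usage (a sketch; the corpus path is hypothetical, and the optional
# 'textgrid' dependency must be installed first):
#
#   manifests = prepare_ali_meeting(
#       corpus_dir="/data/AliMeeting",
#       output_dir="data/manifests",
#       mic="far",
#   )
#   train_sups = manifests["train"]["supervisions"]
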
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed
        (similar to the ESPNet recipe).
    :param buck_walter: Bool, use Buckwalter transliteration.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on their MER
        (Match Error Rate).
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.

    .. note:: Unlike other recipes, output_dir is not Optional here because we
        write the manifests to the output directory while processing to avoid
        OOM issues, since it is a large dataset.

    .. caution:: The `text_cleaning` option removes all punctuation and diacritics.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit
        # of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Prepare this split: dev/test are converted from the Kaldi-style
        # data dir, while train is built directly from the wav/ and xml/
        # directories.
        if part == "test" or part == "dev":
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    # Rewrite the relative wav/ paths as absolute paths.
                    # Strip the trailing newline before re-adding it, so we
                    # don't emit blank lines into wav.scp.
                    f_out.write(
                        line.rstrip("\n").replace("wav/", f"{corpus_dir}/{part}/wav/")
                    )
                    f_out.write("\n")

            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )
            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Read supervisions and write them to manifest
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )
            supervisions = SupervisionSet.from_segments(supervisions_list)
            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

        if text_cleaning is True:
            supervisions = supervisions.transform_text(cleaning)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # Save the recordings and supervisions for this split.
        recordings.to_file(output_dir / f"mgb2_recordings_{part}.jsonl.gz")
        supervisions.to_file(output_dir / f"mgb2_supervisions_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
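
# Example usage (a sketch; the corpus path is hypothetical, and output_dir is
# required because the manifests are written incrementally while processing):
#
#   manifests = prepare_mgb2(
#       corpus_dir="/data/mgb2",
#       output_dir="data/manifests",
#       text_cleaning=True,
#       num_jobs=8,
#   )
#   train_sups = manifests["train"]["supervisions"]
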
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the GigaSpeech data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (required, since the manifests are written incrementally).
    :param dataset_parts: Which parts of the dataset to prepare;
        "auto" selects XL, DEV and TEST.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings', 'supervisions' and 'cuts'.
    """
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional "
            "dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]

    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of
    # preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
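
# Example usage (a sketch; requires 'pip install speechcolab' and assumes a
# local copy of GigaSpeech at the hypothetical path below):
#
#   manifests = prepare_gigaspeech(
#       corpus_dir="/data/GigaSpeech",
#       output_dir="data/manifests",
#       dataset_parts=["DEV", "TEST"],
#       num_jobs=4,
#   )
#   dev_cuts = manifests["DEV"]["cuts"]  # this recipe also produces cuts
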
def prepare_aspire(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: str = "single",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and
        the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert mic in [
        "single",
        "multi",
    ], f"mic must be either 'single' or 'multi', got {mic}"
    corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data"
    audio_dir = corpus_dir / "dev_and_dev_test_audio"
    stm_dir = corpus_dir / "dev_and_dev_test_STM_files"

    if mic == "single":
        audio_paths = {
            "dev": audio_dir / "ASpIRE_single_dev",
            "dev_test": audio_dir / "ASpIRE_single_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "dev.stm",
            "dev_test": stm_dir / "dev_test.stm",
        }
    else:
        audio_paths = {
            "dev": audio_dir / "ASpIRE_multi_dev",
            "dev_test": audio_dir / "ASpIRE_multi_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "multi_dev.stm",
            "dev_test": stm_dir / "multi_dev_test.stm",
        }

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["dev", "dev_test"]:
        recordings = []

        # Prepare the recordings
        if mic == "single":
            recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav")
        else:
            import soundfile as sf

            # Group the audio files so that each entry is a session
            # containing all channels.
            audio_groups = {
                k: list(v)
                for k, v in itertools.groupby(
                    sorted(audio_paths[part].glob("*.wav")),
                    key=lambda x: "_".join(x.stem.split("_")[:-1]),
                )
            }
            for session_name, audios in audio_groups.items():
                audio_sf = sf.SoundFile(str(audios[0]))
                recordings.append(
                    Recording(
                        id=session_name,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[int(audio.stem[-2:]) - 1],
                                source=str(audio),
                            )
                            for audio in sorted(audios)
                        ],
                        sampling_rate=audio_sf.samplerate,
                        num_samples=audio_sf.frames,
                        duration=audio_sf.frames / audio_sf.samplerate,
                    )
                )
            recording_set = RecordingSet.from_recordings(recordings)

        # Read the STM file and prepare the segments
        segments = []
        with open(stm_file[part]) as f:
            for line in f:
                session, _, speaker, start, end, text = line.strip().split(maxsplit=5)
                segments.append(
                    AspireSegmentAnnotation(
                        session, speaker, float(start), float(end), text
                    )
                )

        # Group the segments by session and speaker
        segments_grouped = defaultdict(list)
        for segment in segments:
            segments_grouped[(segment.session, segment.speaker)].append(segment)

        # Create the supervisions
        supervisions = []
        for (session, speaker), segs in segments_grouped.items():
            supervisions += [
                SupervisionSegment(
                    id=f"{session}-{speaker}-{i:03d}",
                    recording_id=session,
                    start=seg.start,
                    duration=round(seg.end - seg.start, 4),
                    speaker=speaker,
                    text=seg.text,
                    language="English",
                )
                for i, seg in enumerate(segs)
            ]
        supervision_set = SupervisionSet.from_segments(supervisions)

        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"aspire_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"aspire_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
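
# Example usage (a sketch; the LDC2017S21 path is hypothetical):
#
#   manifests = prepare_aspire(
#       corpus_dir="/data/LDC2017S21",
#       output_dir="data/manifests",
#       mic="multi",
#   )
#   dev_recs = manifests["dev"]["recordings"]
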
def prepare_switchboard(
    audio_dir: Pathlike,
    transcripts_dir: Optional[Pathlike] = None,
    sentiment_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    omit_silence: bool = True,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions. When ``sentiment_dir`` is provided, we create another
    supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named
        "swb_ms98_transcriptions"). If not provided, the transcripts will be
        downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which
        contains sentiment annotations for SWBD segments.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token
        should be removed or kept.
    :param absolute_paths: Whether to return absolute or relative (to the
        corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    text_paths = check_and_rglob(transcripts_dir, "*trans.text")

    groups = []
    name_to_text = {p.stem.split("-")[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace("sw0", "sw")
        groups.append(
            {
                "audio": ap,
                "text-0": name_to_text[f"{name}A"],
                "text-1": name_to_text[f"{name}B"],
            }
        )

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            group["audio"], relative_path_depth=None if absolute_paths else 3
        )
        for group in groups
    )
    supervisions = SupervisionSet.from_segments(
        chain.from_iterable(
            make_segments(
                transcript_path=group[f"text-{channel}"],
                recording=recording,
                channel=channel,
                omit_silence=omit_silence,
            )
            for group, recording in zip(groups, recordings)
            for channel in [0, 1]
        )
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "swbd_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "swbd_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
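
# Example usage (a sketch; the LDC97S62 path is hypothetical, and the
# transcripts are fetched automatically when transcripts_dir is None):
#
#   manifests = prepare_switchboard(
#       audio_dir="/data/LDC97S62",
#       output_dir="data/manifests",
#       absolute_paths=True,
#   )
#   sups = manifests["supervisions"]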