def prepare_callhome_english(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome English corpus.

    We create two manifests: one with recordings (built from the ``*.sph`` files
    found under ``audio_dir``), and one with supervisions read from the
    ``fullref.rttm`` file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory that contains ``fullref.rttm``.
        If not provided, it is obtained from ``download_callhome_metadata()``.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param sph2pipe_path: Forwarded to ``make_recording_callhome`` for every
        audio file (presumably to "hard-wire" the sph2pipe binary path into
        the recording manifests -- confirm against that helper).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # A caller may pass ``rttm_dir`` as a plain string; coerce it before using
    # the ``/`` operator, as is already done for ``output_dir`` below.
    rttm_path = Path(rttm_dir) / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
        for p in tqdm(audio_paths)
    )

    # Keep only recordings/supervisions that have a counterpart, then clip the
    # supervisions to the recording boundaries before validating.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {
        'recordings': recordings,
        'supervisions': supervisions,
    }
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora
        (must have the same length as ``audio_dirs``).
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
        When falsy, ``relative_path_depth=3`` is passed to ``Recording.from_file``.
    :param segment_words: Forwarded to ``parse_transcripts``; use `jieba` package
        to perform word segmentation (default = False).
    :return: A dict with one entry per split (``'train'`` and ``'dev'``); each entry
        is a dict with the keys ``{'recordings', 'supervisions'}``. Recordings whose
        ID is listed in the files at ``TEST_FILE_URLS`` go to ``'dev'``, all others
        go to ``'train'``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice (a later path with the same stem replaces an earlier one).
    audio_paths = {
        p.stem: p
        for p in chain.from_iterable(
            [
                check_and_rglob(audio_dir, ext, strict=False)
                for audio_dir in audio_dirs
                for ext in ["*.wav", "*.flac"]
            ]
        )
    }
    transcript_paths = chain.from_iterable(
        [check_and_rglob(transcript_dir, "*.tdf") for transcript_dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Collect the held-out recording IDs into a set: membership is checked once
    # per recording and per supervision below.
    test_ids = set()
    for url in TEST_FILE_URLS:
        # Close every HTTP response as soon as it has been consumed.
        with urlopen(url) as response:
            test_ids.update(line.decode("utf-8").strip() for line in response)

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id in test_ids),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in test_ids),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE/WAV audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored (as ``.json`` files).
        Can be omitted to avoid writing.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name (``'dev'``, ``'eval'``, ``'training'``);
        each value is a dict with the keys ``{'recordings', 'supervisions'}``.
    :raises ValueError: When no ``conversational`` directory is found, or when
        a transcript segment cannot be parsed.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    conversational_dirs = [
        d for d in Path(corpus_dir).rglob("conversational") if d.is_dir()
    ]
    if not conversational_dirs:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(conversational_dirs) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({conversational_dirs[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = conversational_dirs[0].parent

    # The language code is only learned from transcript file names. It is kept
    # across splits so that a split without transcripts (typically "eval") can
    # still be written using the code seen in an earlier split.
    lang_code = None

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _corpus, _release, lang_code, speaker, date, hour, channel, *_rest = (
                p.stem.split("_"))
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")

            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                current for current, following in sliding_window(2, lines)
                if not (current.startswith("[") and following.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            # Even positions hold "[timestamp]" lines, odd positions hold the text.
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                    2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    # Chain the original error so the root cause stays visible.
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    ) from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

        if output_dir is not None:
            if lang_code is None:
                # No transcript file has been seen so far, hence the language
                # (part of the output file name) is unknown.
                logging.warning(
                    f"Could not determine the language code for split '{split}' "
                    f"(no transcripts found in {text_dir}); skipping writing manifests.")
                continue
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(
                output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(
                output_dir / f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora
        (must have the same length as ``audio_dirs``).
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: When True, ``relative_path_depth=None`` is passed to
        ``Recording.from_file``; otherwise ``relative_path_depth=3`` is used.
    :return: A dict with one entry per split (``'train'`` and ``'test'``); each entry
        is a dict with the keys ``{'recordings', 'supervisions'}``. Recordings whose
        ID is in ``TEST`` go to ``'test'``, all others go to ``'train'``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice (a later path with the same stem replaces an earlier one).
    audio_paths = {
        p.stem: p
        for p in chain.from_iterable([
            check_and_rglob(audio_dir, ext, strict=False)
            for audio_dir in audio_dirs
            for ext in ['*.wav', '*.flac']
        ])
    }
    transcript_paths = list(
        chain.from_iterable(
            [check_and_rglob(transcript_dir, '*.tdf') for transcript_dir in transcript_dirs]
        )
    )

    logging.info("Preparing recordings manifest")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Build the lookup once; membership is checked per recording and per supervision.
    test_ids = set(TEST)

    manifests = defaultdict(dict)
    manifests['test'] = {
        'recordings': recordings.filter(lambda r: r.id in test_ids),
        'supervisions': supervisions.filter(lambda s: s.recording_id in test_ids),
    }
    manifests['train'] = {
        'recordings': recordings.filter(lambda r: r.id not in test_ids),
        'supervisions': supervisions.filter(lambda s: s.recording_id not in test_ids),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            manifests[part]["recordings"].to_json(
                output_dir / f'recordings_{part}.json')
            manifests[part]["supervisions"].to_json(
                output_dir / f'supervisions_{part}.json')

    return manifests
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - First it looks for the audio directory ``data/{train,valid,test}/wav``
            where the .flac files are stored.
        - Then, it looks for the vtt directory in ``data/{train,valid,test}/vtt``
            which contains the segmentation and transcripts for the audio.
        - Every file in the vtt directory is turned into supervisions by
            ``_filename_to_supervisions`` (files for which it returns ``None``
            are skipped).

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files.
        Can be omitted to avoid writing.
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return: A dict keyed by split name (``'train'``, ``'valid'``, ``'test'``);
        each value is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    # Accept any path-like input, not only ``str``.
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)

    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            text_dir = corpus_dir / f"data/{split}/vtt"
            # Submit every transcript file first so the pool can work on them
            # concurrently, then gather the results in submission order.
            futures = [
                ex.submit(_filename_to_supervisions, p, language)
                for p in text_dir.glob("*")
            ]
            supervisions = []
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                supervisions.extend(result)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                output_dir.mkdir(parents=True, exist_ok=True)
                # NOTE: files are named after the raw split name (e.g. "valid"),
                # i.e. the same names used as keys of the returned dict.
                recordings.to_file(output_dir / f"recordings_{language}_{split}.json")
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{split}.json"
                )

    return dict(manifests)
def prepare_fisher_english(
    corpus_path: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora. Each is expected to contain
        partition directories with an ``audio`` sub-directory holding ``*.sph`` files.
    :param transcript_dirs: List of dirs of transcript corpora. Each is expected to
        contain ``data/trans`` (``*.txt`` transcripts) and ``doc`` (``*_calldata.tbl``).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: When one of ``audio_dirs`` / ``transcript_dirs`` is not a
        directory inside ``corpus_path``.
    """
    corpus_path = Path(corpus_path)

    for workdir in [*audio_dirs, *transcript_dirs]:
        workdir_path = corpus_path / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_path}'."
            )

    # NOTE: ``Path.iterdir()`` yields paths that already include the directory
    # being listed. Joining them onto that directory again would duplicate the
    # prefix whenever ``corpus_path`` is relative, so the entries are used as-is.
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_path / audio_dir).iterdir():
            audio_subdir_paths.extend((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_path / transcript_dir / "data" / "trans"
        transcript_subdir_paths.extend(transcript_dir_path.iterdir())

    audio_paths = walk_dirs_parallel(
        audio_subdir_paths, "*.sph", "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(
        transcript_subdir_paths, "*.txt", "Parsing transcript sub-dirs")

    # Map the first column of each call-data row to columns 5 and 10, stored
    # under the keys "A" and "B"; the first line of each table is skipped.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_path / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            rows = [
                line.rstrip("\n").split(",")
                for line in sessions_data_f.readlines()
            ][1:]
        sessions.update({row[0]: {"A": row[5], "B": row[10]} for row in rows})

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    # ``os.cpu_count()`` may return None when the count cannot be determined.
    num_workers = (os.cpu_count() or 1) * 4

    create_recordings_input = [
        (p, None if absolute_paths else 5) for p in audio_paths
    ]
    recordings = []
    with ThreadPoolExecutor(num_workers) as executor:
        with tqdm(total=len(create_recordings_input),
                  desc="Collect recordings") as pbar:
            # ``executor.map`` yields results in input order.
            for reco in executor.map(create_recording, create_recordings_input):
                recordings.append(reco)
                pbar.update()
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions = []
    with ThreadPoolExecutor(num_workers) as executor:
        with tqdm(total=len(create_supervisions_input),
                  desc="Create supervisions") as pbar:
            for tmp_supervisions in executor.map(
                    create_supervision, create_supervisions_input):
                supervisions.append(tmp_supervisions)
                pbar.update()
    # Each transcript produced an iterable of segments; flatten them.
    supervisions = list(it.chain.from_iterable(supervisions))
    supervisions = SupervisionSet.from_segments(supervisions)

    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "recordings.jsonl.gz")
        supervisions.to_file(output_dir / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.

    For each of the ``train``, ``devtest`` and ``evaltest`` splits we create two
    manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir: Directory that contains ``callhome/arabic/{train,devtest,evltest}``
        with the ``*.sph`` audio files.
    :param transcript_dir: Directory that contains
        ``callhome_arabic_trans_970711/transcrp/{train,devtest,evaltest}/roman``
        with the ``*.txt`` transcripts.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param sph2pipe_path: Forwarded to ``make_recording_callhome`` for every
        audio file (presumably to "hard-wire" the sph2pipe binary path into
        the recording manifests -- confirm against that helper).
    :return: A dict keyed by split name (``'train'``, ``'devtest'``, ``'evaltest'``);
        each value is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' / split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / f'callhome_arabic_trans_970711/transcrp/{split}/roman',
            '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            # Per-file counter; it only advances for segments that are kept.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimals to avoid float rounding noise in the duration.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f'{recording_id}_{idx}',
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f'{recording_id}_{spk}',
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions,
        }

    return manifests