def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English portion (SRE variant).

    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    # Fetch the reference RTTM metadata if the caller did not supply it.
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    supervisions = read_rttm(rttm_dir / "fullref.rttm")

    sph_files = check_and_rglob(audio_dir, "*.sph")
    depth = None if absolute_paths else 4
    recordings = RecordingSet.from_recordings(
        Recording.from_file(path, relative_path_depth=depth)
        for path in tqdm(sph_files)
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        target = Path(output_dir)
        target.mkdir(parents=True, exist_ok=True)
        recordings.to_json(target / "recordings.json")
        supervisions.to_json(target / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.

    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    manifests = {}

    depth = None if absolute_paths else 4
    for split in ["train", "devtest", "evaltest"]:
        # The LDC distribution has a typo ("evltest" instead of "evaltest").
        split_audio_dir = (
            audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest")
        )
        sph_paths = check_and_rglob(split_audio_dir, "*.sph")
        recordings = RecordingSet.from_recordings(
            Recording.from_file(path, relative_path_depth=depth)
            for path in tqdm(sph_paths)
        )

        txt_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        # Not doing this right now as it's not needed for VAD/diarization...
        segments = []
        for txt_path in txt_paths:
            counter = 0
            recording_id = txt_path.stem
            for raw_line in txt_path.read_text().splitlines():
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = raw_line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                # Skip zero/negative-length segments.
                if duration <= 0:
                    continue
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{counter}",
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
                counter += 1
        supervisions = SupervisionSet.from_segments(segments)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.

    Script assumes that ``audio_dirs`` and ``transcript_dirs`` are inside
    ``corpus_dir``. We create two manifests: one with recordings, and the
    other one with text supervisions.

    :param corpus_dir: Path to the Fisher corpus.
    :param output_dir: Directory where the manifests should be written.
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :param num_jobs: Number of parallel workers for parsing audio files.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    :raises ValueError: if any of the expected sub-directories is missing.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Validate the expected layout up-front to fail fast.
    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_dir / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'.")

    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        audio_dir_path = corpus_dir / audio_dir
        for audio_partition_dir in audio_dir_path.iterdir():
            audio_partition_dir_path = audio_dir_path / audio_partition_dir / "audio"
            audio_subdir_paths += [
                audio_partition_dir_path / audio_subdir
                for audio_subdir in audio_partition_dir_path.iterdir()
            ]

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_dir / transcript_dir / "data" / "trans"
        transcript_subdir_paths += [
            transcript_dir_path / transcript_subdir
            for transcript_subdir in transcript_dir_path.iterdir()
        ]

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    # Map session id -> speaker pids for channels A and B, read from the
    # *_calldata.tbl metadata files (header row skipped).
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                } for l in tmp_sessions})

    assert len(transcript_paths) == len(
        audio_paths), f"{len(transcript_paths)} == {len(audio_paths)}"
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files indicates there are only {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info("Building fresh recording manifest")
        create_recordings_input = [(p, None if absolute_paths else 5)
                                   for p in audio_paths]
        err_recos = 0
        with ProcessPoolExecutor(
                num_jobs) as executor, RecordingSet.open_writer(
                    recs_path) as writer:
            with tqdm(total=len(create_recordings_input),
                      desc="Collect recordings") as pbar:
                for reco in executor.map(create_recording,
                                         create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        # Unreadable/broken audio file -- skip it but count it.
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(f"Out of {len(create_recordings_input)} recordings, "
                          f"{err_recos} had errors and were omitted.")
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        # BUGFIX: log the supervision path, not the recording path.
        logging.info(f"Using existing supervision manifest at {sups_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info("Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        # NOTE: os.cpu_count() may return None; fall back to a single core.
        with ThreadPoolExecutor(
            (os.cpu_count() or 1) * 4) as executor, SupervisionSet.open_writer(
                sups_path) as writer:
            with tqdm(total=len(create_supervisions_input),
                      desc="Create supervisions") as pbar:
                for tmp_supervisions in executor.map(
                        create_supervision, create_supervisions_input):
                    if not tmp_supervisions:
                        err_sups += 1
                    for s in tmp_supervisions:
                        writer.write(s)
                    pbar.update()
        supervisions = writer.open_manifest()
        # BUGFIX: this warning was previously gated on `err_recos`, so
        # transcript-parsing errors were only reported when recording
        # errors had also occurred.
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted.")

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.

    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        # Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            # Loop-invariant: the recording id is derived from the file name.
            recording_id = p.stem
            # First pass: re-join transcript lines that were wrapped across
            # multiple physical lines. A line that does not parse as
            # "<start> <end> <spk>: <text>" is treated as a continuation of
            # the previous utterance.
            postprocessed_lines = list()
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    # ValueError: fewer than 4 whitespace-separated fields;
                    # InvalidOperation: start/end are not decimal numbers.
                    # Both handlers had identical bodies before -- merged.
                    # Guard against a continuation appearing before any
                    # valid utterance line (previously an unhandled
                    # IndexError on postprocessed_lines[-1]).
                    if postprocessed_lines:
                        postprocessed_lines[
                            -1] = postprocessed_lines[-1] + " " + line
            # Second pass: build supervision segments from the cleaned lines.
            for line in postprocessed_lines:
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        # Channel letter A/B maps to channel index 0/1.
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1

        supervisions = SupervisionSet.from_segments(supervisions)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
def prepare_ami(
    data_dir: Pathlike,
    annotations_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    partition: Optional[str] = "full-corpus",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param annotations_dir: Pathlike, the path of the annotations dir or zip file.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'},
        AMI official data split
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are
        dicts of manifests under keys 'recordings' and 'supervisions'.

    Example usage:
    1. Prepare IHM-Mix data for ASR:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='ihm-mix', partition='full-corpus-asr')
    2. Prepare SDM data:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='sdm', partition='full-corpus')
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f"No such directory: {data_dir}"
    assert mic in MICS, f"Mic {mic} not supported"
    assert partition in PARTITIONS, f"Partition {partition} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing AMI annotations")
    # Auto-detect the annotations location (unpacked dir preferred over zip).
    if not annotations_dir:
        if (data_dir / "ami_public_manual_1.6.2").is_dir():
            annotations_dir = data_dir / "ami_public_manual_1.6.2"
        elif (data_dir / "ami_public_manual_1.6.2.zip").is_file():
            annotations_dir = data_dir / "ami_public_manual_1.6.2.zip"
        else:
            raise ValueError(
                f"No annotations directory specified and no zip file found in {data_dir}"
            )
    # Prepare annotations which is a list of segment-level transcriptions
    annotations = parse_ami_annotations(annotations_dir,
                                        normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    wav_dir = data_dir
    if mic in ["ihm", "mdm"]:
        audio_paths = (wav_dir.rglob("*Headset-?.wav") if mic == "ihm" else
                       wav_dir.rglob("*Array?-0?.wav"))
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ["ihm-mix", "sdm"]:
        audio_paths = (wav_dir.rglob("*Mix-Headset.wav") if mic == "ihm-mix"
                       else wav_dir.rglob("*Array1-01.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(audio, annotations) if mic == "ihm"
                   else prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)
    dataset_parts = PARTITIONS[partition]
    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        # BUGFIX: fix & validate BEFORE writing, so the persisted manifests
        # are identical to the (fixed) manifests returned to the caller.
        # Previously the unfixed versions were written to disk.
        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
def prepare_fisher_spanish(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher Spanish.

    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(
        transcript_dir_path)

    audio_paths = check_and_rglob(audio_dir_path, "*.sph")
    transcript_paths = check_and_rglob(transcript_dir_path, "*.tdf")

    # Map session id -> speaker pids per channel (0/1), read from the
    # *_call.tbl metadata file (header row skipped).
    sessions_data_path = check_and_rglob(transcript_dir_path, "*_call.tbl")[0]
    with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
        session_lines = [
            l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
        ][1:]
        sessions = {l[0]: {0: l[2], 1: l[8]} for l in session_lines}

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    # BUGFIX: os.cpu_count() may return None on some platforms, which would
    # make `None * 4` raise a TypeError; fall back to a single core.
    num_workers = (os.cpu_count() or 1) * 4

    create_recordings_input = [(p, None if absolute_paths else 4)
                               for p in audio_paths]
    recordings = [None] * len(audio_paths)
    with ThreadPoolExecutor(num_workers) as executor:
        with tqdm(total=len(audio_paths), desc="Collect recordings") as pbar:
            for i, reco in enumerate(
                    executor.map(create_recording, create_recordings_input)):
                recordings[i] = reco
                pbar.update()
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions = [None] * len(create_supervisions_input)
    with ThreadPoolExecutor(num_workers) as executor:
        with tqdm(total=len(create_supervisions_input),
                  desc="Create supervisions") as pbar:
            for i, tmp_supervisions in enumerate(
                    executor.map(create_supervision,
                                 create_supervisions_input)):
                supervisions[i] = tmp_supervisions
                pbar.update()
    supervisions = list(it.chain.from_iterable(supervisions))
    # Drop zero/negative-length segments before validation.
    supervisions = SupervisionSet.from_segments(supervisions).filter(
        lambda s: s.duration > 0.0)

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param audio_dir: Pathlike, the path which holds the audio data
    :param transcripts_dir: Pathlike, the path which holds the transcripts data
    :param output_dir: Pathlike, the path where to write the manifests -
        `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are
        dicts of manifests under keys 'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None)
    elif mic == "sdm" or mic == "ihm-mix":
        audio_paths = (audio_dir.rglob(f"chan[{channels}].sph")
                       if len(channels) else audio_dir.rglob("*.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(
        audio, annotations, channel_to_idx_map) if mic == "ihm" else
                   prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)
    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in PARTITIONS[part])

        # BUGFIX: fix & validate BEFORE writing, so the persisted manifests
        # are identical to the (fixed) manifests returned to the caller.
        # Previously the unfixed versions were written to disk.
        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)