def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the recording and supervision manifests for the Switchboard corpus.

    Every ``*.sph`` file found under ``audio_dir`` becomes one recording, and the
    two per-side transcript files (suffixes ``A`` and ``B``) matching that recording
    provide the text supervisions for channels 0 and 1 respectively.

    When ``sentiment_dir`` is given, it is handed together with the supervisions to
    ``parse_and_add_sentiment_labels``; this function itself still returns (and writes)
    only the two manifests listed below.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named
        "swb_ms98_transcriptions"). When omitted, ``download_and_untar()`` is called
        to obtain the transcripts.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains
        sentiment annotations for SWBD segments.
    :param output_dir: Directory where the manifests should be written
        (``recordings.json`` and ``supervisions.json``). Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token
        should be removed or kept (forwarded to ``make_segments``).
    :param absolute_paths: When ``True``, ``relative_path_depth=None`` is passed to
        ``Recording.from_sphere``; otherwise a depth of 3 is used.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises KeyError: if a recording has no matching ``A`` or ``B`` transcript file.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()

    sphere_files = check_and_rglob(audio_dir, '*.sph')
    transcript_files = check_and_rglob(transcripts_dir, '*trans.text')

    # Index transcripts by the part of the file stem before the first dash;
    # the lookups below expect keys of the form '<name>A' / '<name>B'.
    transcript_by_side = {}
    for transcript_file in transcript_files:
        side_key = transcript_file.stem.split('-')[0]
        transcript_by_side[side_key] = transcript_file

    # Audio stems carry an 'sw0' prefix where the transcript keys use 'sw'.
    groups = []
    for sphere_file in sphere_files:
        conversation = sphere_file.stem.replace('sw0', 'sw')
        groups.append({
            'audio': sphere_file,
            'text-0': transcript_by_side[f'{conversation}A'],
            'text-1': transcript_by_side[f'{conversation}B'],
        })

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(entry['audio'], relative_path_depth=path_depth)
        for entry in groups
    )

    def _segments_for_all_channels():
        # Recordings are iterated in step with the groups they were created from.
        for entry, recording in zip(groups, recordings):
            for channel in [0, 1]:
                yield from make_segments(
                    transcript_path=entry[f'text-{channel}'],
                    recording=recording,
                    channel=channel,
                    omit_silence=omit_silence
                )

    supervisions = SupervisionSet.from_segments(_segments_for_all_channels())

    if sentiment_dir is not None:
        # The return value is not used here; presumably the labels are attached
        # to ``supervisions`` in place -- confirm against the helper.
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions
    }
    return manifests
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    """
    ``Recording.from_sphere`` on the stereo fixture must yield the expected manifest.

    :param relative_path_depth: Value forwarded to ``Recording.from_sphere``.
    :param expected_source_path: The ``source`` string the single audio source
        is expected to carry for the given ``relative_path_depth``.
    """
    actual = Recording.from_sphere(
        'test/fixtures/stereo.sph',
        relative_path_depth=relative_path_depth
    )
    expected_source = AudioSource(
        type='file',
        channels=[0, 1],
        source=expected_source_path
    )
    expected = Recording(
        id='stereo',
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[expected_source]
    )
    assert actual == expected
def prepare_broadcast_news(
        audio_dir: Pathlike,
        transcripts_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the manifests for the 1997 English Broadcast News corpus.

    Three manifests are produced: the recordings, the segment-level supervisions,
    and the section-level supervisions. The section supervisions can be used
    e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written
        (``recordings.json``, ``sections.json`` and ``segments.json``).
        Can be omitted to avoid writing.
    :param absolute_paths: When ``True``, ``relative_path_depth=None`` is passed to
        ``Recording.from_sphere``; otherwise a depth of 3 is used.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    sphere_files = check_and_rglob(audio_dir, '*.sph')
    sgml_files = check_and_rglob(transcripts_dir, '*.sgml')

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(sphere_file, relative_path_depth=path_depth)
        for sphere_file in sphere_files
    )

    # BeautifulSoup renders tags to strings recursively, which is quite inefficient;
    # on some systems the recursion limit has to be raised for this to work.
    per_file_supervisions = []
    with recursion_limit(5000):
        # NOTE(review): transcripts and recordings are paired purely by position,
        # which assumes both listings line up one-to-one -- confirm.
        for sgml_file, recording in zip(sgml_files, recordings):
            per_file_supervisions.append(make_supervisions(sgml_file, recording))

    section_supervisions = SupervisionSet.from_segments(
        section
        for file_sups in per_file_supervisions
        for section in file_sups['sections']
    )
    segment_supervisions = SupervisionSet.from_segments(
        segment
        for file_sups in per_file_supervisions
        for segment in file_sups['segments']
    )

    # Only the segment-level supervisions are validated against the recordings.
    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        section_supervisions.to_json(output_dir / 'sections.json')
        segment_supervisions.to_json(output_dir / 'segments.json')

    manifests = {
        'recordings': recordings,
        'sections': section_supervisions,
        'segments': segment_supervisions
    }
    return manifests