def dummy_supervision_set_lazy():
    """
    Yield a lazily-opened SupervisionSet backed by a temporary ``.jsonl.gz`` file.

    The eager set is serialized to disk first, then reopened via
    ``from_jsonl_lazy`` so tests can exercise the lazy code path.
    """
    segments = [
        SupervisionSegment(
            id="sup1",
            recording_id="rec1",
            start=3,
            duration=4,
            channel=0,
            text="dummy text",
        ),
        SupervisionSegment(
            id="sup2",
            recording_id="rec1",
            start=7,
            duration=2,
            channel=0,
            text="dummy text",
        ),
    ]
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        eager = SupervisionSet.from_segments(segments)
        eager.to_file(f.name)
        f.flush()
        # The temp file must stay alive while the lazy set is in use,
        # hence the yield inside the context manager.
        yield SupervisionSet.from_jsonl_lazy(f.name)
def trim_supervisions_to_recordings(
    recordings: RecordingSet, supervisions: SupervisionSet
) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` with supervisions
    that are not exceeding the duration of their corresponding
    :class:`~lhotse.audio.Recording`.
    """
    # Random access by recording_id below requires an eager manifest.
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))
    kept = []
    n_removed = 0
    n_trimmed = 0
    for sup in supervisions:
        rec_end = recordings[sup.recording_id].duration
        if sup.start > rec_end:
            # Supervision begins past the end of the audio: drop it.
            n_removed += 1
            continue
        if sup.end > rec_end:
            # Supervision overlaps the end of the audio: shorten it to fit.
            n_trimmed += 1
            sup = sup.trim(recordings[sup.recording_id].duration)
        kept.append(sup)
    if n_removed:
        logging.warning(
            f"Removed {n_removed} supervisions starting after the end of the recording."
        )
    if n_trimmed:
        logging.warning(
            f"Trimmed {n_trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(kept)
def search_supervision_set():
    """Build a small SupervisionSet fixture spanning two recordings and two channels."""
    specs = [
        ('s1', 'r1', 0, 5.0, 0),
        ('s2', 'r1', 4.5, 2.0, 1),
        ('s3', 'r1', 8.0, 3.0, 0),
        ('s4', 'r2', 1, 5.0, 0),
    ]
    return SupervisionSet.from_segments(
        SupervisionSegment(id=sid, recording_id=rid, start=beg, duration=dur, channel=ch)
        for sid, rid, beg, dur, ch in specs
    )
def prepare_broadcast_news(
        audio_dir: Pathlike,
        transcripts_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segments supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to store absolute audio paths in the manifests
        (otherwise paths are kept relative, 3 levels deep).
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    sgml_paths = check_and_rglob(transcripts_dir, '*.sgml')
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(
            p,
            relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths)
    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        # NOTE(review): pairing by zip assumes audio and SGML paths are returned in
        # the same order by check_and_rglob -- confirm that guarantee holds.
        supervisions_list = [
            make_supervisions(p, r)
            for p, r in zip(sgml_paths, recordings)
        ]
    # Flatten the per-file supervision lists into corpus-wide manifests.
    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups['sections'] for sups in supervisions_list))
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups['segments'] for sups in supervisions_list))
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        section_supervisions.to_json(output_dir / 'sections.json')
        segment_supervisions.to_json(output_dir / 'segments.json')
    return {
        'recordings': recordings,
        'sections': section_supervisions,
        'segments': segment_supervisions
    }
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: Whether to store absolute audio paths in the manifests
        (otherwise paths are kept relative, 3 levels deep).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    # Key each transcript by its stem up to the first '-' (per-side transcript IDs).
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        # Audio stems use a "sw0NNNN" prefix while transcript stems use "swNNNN";
        # the 'A'/'B' suffixes select the two conversation sides.
        name = ap.stem.replace('sw0', 'sw')
        groups.append({
            'audio': ap,
            'text-0': name_to_text[f'{name}A'],
            'text-1': name_to_text[f'{name}B']
        })

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(
            group['audio'],
            relative_path_depth=None if absolute_paths else 3)
        for group in groups)
    # One supervision stream per channel: channel 0 = side A, channel 1 = side B.
    supervisions = SupervisionSet.from_segments(
        chain.from_iterable(
            make_segments(
                transcript_path=group[f'text-{channel}'],
                recording=recording,
                channel=channel,
                omit_silence=omit_silence)
            for group, recording in zip(groups, recordings)
            for channel in [0, 1]))

    if sentiment_dir is not None:
        # Mutates `supervisions` in place with sentiment labels.
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {'recordings': recordings, 'supervisions': supervisions}
def dummy_supervision_set():
    """Return a SupervisionSet containing a single hard-coded test segment."""
    segment = SupervisionSegment(
        id='sup1',
        recording_id='rec1',
        start=3,
        duration=4,
        channel=0,
        text='dummy text',
    )
    return SupervisionSet.from_segments([segment])
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    """
    Create a manifest of ``type_`` filled with dummy items whose sequential
    indices span ``range(begin_id, end_id)``.

    :param type_: One of :class:`RecordingSet`, :class:`SupervisionSet`
        or :class:`FeatureSet`.
    :param begin_id: First (inclusive) dummy item index.
    :param end_id: Last (exclusive) dummy item index.
    :return: A manifest of the requested type.
    :raises ValueError: When ``type_`` is not a supported manifest type.
    """
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(
            dummy_recording(idx) for idx in range(begin_id, end_id))
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(
            dummy_supervision(idx) for idx in range(begin_id, end_id))
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(
            dummy_features(idx) for idx in range(begin_id, end_id))
    # Previously the function fell through and implicitly returned None for
    # unsupported types, surfacing later as confusing AttributeErrors.
    raise ValueError(f"Unsupported manifest type: {type_}")
def dummy_supervision_set():
    """Build a minimal single-segment SupervisionSet fixture."""
    seg = SupervisionSegment(
        id="sup1", recording_id="rec1", start=3, duration=4,
        channel=0, text="dummy text",
    )
    return SupervisionSet.from_segments([seg])
def prepare_ljspeech(
        corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            # Pipe-separated line; the second field is used as the transcript and
            # the third is discarded (presumably the normalized transcript --
            # TODO confirm against the corpus metadata format).
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                # Skip metadata rows whose audio file is missing on disk.
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            # One full-length supervision per recording; speaker metadata is
            # hard-coded since LJSpeech is a single-speaker corpus.
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)
    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")
    return {"recordings": recording_set, "supervisions": supervision_set}
def prepare_yesno(
        corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    wave_files = list(corpus_dir.glob("*.wav"))
    # The yes/no corpus is fixed at exactly 60 waves.
    assert len(wave_files) == 60
    wave_files.sort()
    # Deterministic 30/30 split: even-indexed files for train, odd for test.
    train_set = wave_files[::2]
    test_set = wave_files[1::2]
    assert len(train_set) == 30
    assert len(test_set) == 30
    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")
        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }
    return manifests
def test_known_issue_with_overlap():
    """
    Regression test: trimming cuts to supervisions when one supervision
    lies entirely within another should produce one cut per supervision,
    each containing only its own (re-based) supervision.
    """
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first. Both have the
    # same recording_id as the single entry in rec.
    sup = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )

    cuts = CutSet.from_manifests(recordings=rec, supervisions=sup)
    assert len(cuts) == 1
    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    # First trimmed cut covers the outer supervision only.
    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 1
    assert sup.text == "Hello"

    # Second trimmed cut covers the inner supervision, re-based to start at 0.
    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 0.5
    assert sup.text == "World"
def make_corresponding_supervisions(audio: RecordingSet) -> SupervisionSet:
    """
    Prepare a supervision set - in this case it just describes which segments
    are available in the corpus, as the actual supervisions for speech
    separation come from the source recordings.
    """
    segments = []
    for recording in audio:
        for source in recording.sources:
            channel = source.channels[0]
            segments.append(
                SupervisionSegment(
                    id=f"{recording.id}-c{channel}",
                    recording_id=recording.id,
                    start=0.0,
                    duration=recording.duration,
                    channel=channel,
                )
            )
    return SupervisionSet.from_segments(segments)
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the GigaSpeech corpus.

    :param gigaspeech: an opened ``speechcolab`` GigaSpeech corpus handle.
    :param dataset_parts: which parts to prepare; ``'auto'`` selects
        ``('{XL}', '{DEV}', '{TEST}')`` (the braces are part of the
        speechcolab subset naming scheme).
    :param output_dir: directory where the manifests should be written;
        can be omitted to avoid writing.
    :param num_jobs: number of worker threads used to parse utterances.
    :return: a dict: part -> {'recordings': ..., 'supervisions': ...}.
    """
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # NOTE(review): the cache lookup uses `dataset_parts` while the loop below uses
        # `subsets` -- with dataset_parts='auto' these differ; confirm this is intended.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                                   output_dir=output_dir,
                                                   suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests
    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            # Fan out per-audio parsing to the thread pool.
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))
            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                # parse_utterance returns None for entries that should be skipped.
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments
            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }
            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')
    return dict(manifests)
def prepare_supervision_ihm(
    audio: RecordingSet,
    annotations: Dict[str, List[IcsiSegmentAnnotation]],
    channel_to_idx_map: Dict[str, Dict[str, int]],
) -> SupervisionSet:
    """
    Create IHM (individual headset microphone) supervisions for ICSI.

    :param audio: recordings with one source per headset channel.
    :param annotations: annotation lists keyed by tuples whose first element
        is the session id and third element is the channel name.
    :param channel_to_idx_map: per-session mapping of channel name -> channel index.
    :return: a SupervisionSet with one segment per annotated utterance.
    """
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (key[0], channel_to_idx_map[key[0]][key[2]]): annotations[key]
        for key in annotations
    }
    segments = []
    for recording in tqdm(audio, desc="Preparing supervision"):
        # IHM can have multiple audio sources for each recording
        for source in recording.sources:
            # For each source, "channels" will always be a one-element list
            (channel, ) = source.channels
            annotation = annotation_by_id_and_channel.get(
                (recording.id, channel))
            if annotation is None:
                continue
            for seg_idx, seg_info in enumerate(annotation):
                duration = seg_info.end_time - seg_info.start_time
                # Some annotations in IHM setting exceed audio duration, so we
                # ignore such segments
                if seg_info.end_time > recording.duration:
                    logging.warning(
                        f"Segment {recording.id}-{channel}-{seg_idx} exceeds "
                        f"recording duration. Not adding to supervisions.")
                    continue
                # Zero/negative-length annotations are silently dropped.
                if duration > 0:
                    segments.append(
                        SupervisionSegment(
                            id=f"{recording.id}-{channel}-{seg_idx}",
                            recording_id=recording.id,
                            start=seg_info.start_time,
                            duration=duration,
                            channel=channel,
                            language="English",
                            speaker=seg_info.speaker,
                            gender=seg_info.gender,
                            text=seg_info.text,
                        ))
    return SupervisionSet.from_segments(segments)
def prepare_supervision_ihm(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    """
    Create IHM (individual headset microphone) supervisions for AMI.

    :param audio: recordings with one source per headset channel.
    :param annotations: annotation lists keyed by tuples whose first element
        is the session id and third element is the channel.
    :return: a SupervisionSet with one segment per annotated utterance.
    """
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {(key[0], key[2]): annotations[key]
                                    for key in annotations}
    segments = []
    for recording in audio:
        # AMI IHM can have multiple audio sources for each recording
        for source in recording.sources:
            # For each source, "channels" will always be a one-element list
            channel, = source.channels
            annotation = annotation_by_id_and_channel.get(
                (recording.id, channel))
            if annotation is None:
                logging.warning(
                    f'No annotation found for recording {recording.id} '
                    f'(file {source.source})')
                continue
            for seg_idx, seg_info in enumerate(annotation):
                duration = seg_info.end_time - seg_info.begin_time
                # Some annotations in IHM setting exceed audio duration, so we
                # ignore such segments
                if seg_info.end_time > recording.duration:
                    logging.warning(
                        f'Segment {recording.id}-{channel}-{seg_idx} exceeds '
                        f'recording duration. Not adding to supervisions.')
                    continue
                # Zero/negative-length annotations are silently dropped.
                if duration > 0:
                    segments.append(
                        SupervisionSegment(
                            id=f'{recording.id}-{channel}-{seg_idx}',
                            recording_id=recording.id,
                            start=seg_info.begin_time,
                            duration=duration,
                            channel=channel,
                            language='English',
                            speaker=seg_info.speaker,
                            gender=seg_info.gender,
                            text=seg_info.text))
    return SupervisionSet.from_segments(segments)
def test_supervision_set_serialization():
    """Round-trip a SupervisionSet through YAML and check it deserializes equal."""
    supervision_set = SupervisionSet.from_segments([
        SupervisionSegment(
            id='segment-1',
            recording_id='recording-1',
            # Fixed: was ``channel_id=0`` -- every other SupervisionSegment
            # construction in this codebase uses the ``channel`` keyword.
            channel=0,
            start=0.1,
            duration=0.3,
            text='transcript of the first segment',
            language='english',
            speaker='Norman Dyhrentfurth',
            gender='male'
        )
    ])
    with NamedTemporaryFile() as f:
        supervision_set.to_yaml(f.name)
        restored = supervision_set.from_yaml(f.name)
        assert supervision_set == restored
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.
    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.
    """
    # Allow passing single items; normalize to sets.
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])

    validate(recordings, read_data=read_data)
    validate(supervisions)

    # Errors: every supervision must fit inside its recording (with a 1ms
    # tolerance for floating-point rounding) on a channel the recording has.
    for s in supervisions:
        r = recordings[s.recording_id]
        assert -1e-3 <= s.start <= s.end <= r.duration + 1e-3, (
            f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
        )
        assert s.channel in r.channel_ids, (
            f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
            f"(recording channels: {r.channel_ids})")

    # Warnings: mismatched IDs are tolerated, but reported.
    recording_ids = frozenset(r.id for r in recordings)
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        logging.warning(
            f"There are {len(only_in_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        logging.warning(
            f"There are {len(only_in_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet.")
def test_supervision_set_serialization(format, compressed):
    """
    Round-trip a SupervisionSet through the given serialization ``format``
    ('yaml' or 'json'), optionally gzip-compressed, and check equality.
    """
    supervision_set = SupervisionSet.from_segments([
        SupervisionSegment(id='segment-1',
                           recording_id='recording-1',
                           channel=0,
                           start=0.1,
                           duration=0.3,
                           text='transcript of the first segment',
                           language='english',
                           speaker='Norman Dyhrentfurth',
                           gender='male')
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            supervision_set.to_yaml(f.name)
            restored = supervision_set.from_yaml(f.name)
        elif format == 'json':
            supervision_set.to_json(f.name)
            restored = supervision_set.from_json(f.name)
        else:
            # Fixed: an unrecognized format previously fell through both ifs,
            # leaving ``restored`` unbound and raising a confusing
            # UnboundLocalError at the assert below.
            raise ValueError(f"Unsupported format: {format}")
        assert supervision_set == restored
def __init__(
    self,
    cuts: CutSet,
    uem: Optional[SupervisionSet] = None,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    """
    :param cuts: the cuts to build the dataset from.
    :param uem: optional UEM (un-partitioned evaluation map) supervisions;
        when provided, supervisions are restricted to regions overlapping
        the UEM segments.
    :param min_speaker_dim: optional lower bound on the speaker dimension.
    :param global_speaker_ids: when True, build a corpus-wide speaker -> index map.
    """
    super().__init__()
    validate(cuts)
    if not uem:
        self.cuts = cuts
    else:
        # We use the `overlap` method in intervaltree to get overlapping regions
        # between the supervision segments and the UEM segments
        recordings = RecordingSet(
            {c.recording.id: c.recording for c in cuts if c.has_recording})
        uem_intervals = CutSet.from_manifests(
            recordings=recordings,
            supervisions=uem,
        ).index_supervisions()
        supervisions = []
        for cut_id, tree in cuts.index_supervisions().items():
            if cut_id not in uem_intervals:
                # No UEM for this cut: keep all of its supervisions as-is.
                supervisions += [it.data for it in tree]
                continue
            # Set comprehension de-duplicates supervisions that overlap
            # multiple UEM intervals.
            # NOTE(review): segments are trimmed to their own interval bounds
            # (it.begin/it.end), not clipped to the UEM interval -- confirm
            # this is the intended semantics.
            supervisions += {
                it.data.trim(it.end, start=it.begin)
                for uem_it in uem_intervals[cut_id]
                for it in tree.overlap(begin=uem_it.begin, end=uem_it.end)
            }
        self.cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=SupervisionSet.from_segments(supervisions),
        )
    # Optional global speaker -> index mapping used downstream.
    self.speakers = ({
        spk: idx for idx, spk in enumerate(self.cuts.speakers)
    } if global_speaker_ids else None)
    self.min_speaker_dim = min_speaker_dim
def prepare_supervision_other(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    """
    Create supervisions for non-IHM mic settings (ihm-mix, sdm, mdm), where all
    sources of a recording share a single supervision stream on channel 0.
    """
    # Collapse annotation keys to the session id (key[0]), aggregating lists.
    annotation_by_id = defaultdict(list)
    for key, value in annotations.items():
        annotation_by_id[key[0]].extend(value)
    segments = []
    for recording in tqdm(audio, desc="Preparing supervisions"):
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm and 16 for mdm)
        # will share supervision.
        if annotation is None:
            logging.warning(
                f"No annotation found for recording {recording.id}")
            continue
        # Multi-channel sources are not supported here; skip the recording.
        if any(len(source.channels) > 1 for source in recording.sources):
            logging.warning(
                f"More than 1 channels in recording {recording.id}. "
                f"Skipping this recording.")
            continue
        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.start_time
            # Zero/negative-length annotations are silently dropped.
            if duration > 0:
                segments.append(
                    SupervisionSegment(
                        id=f"{recording.id}-{seg_idx}",
                        recording_id=recording.id,
                        start=seg_info.start_time,
                        duration=duration,
                        channel=0,
                        language="English",
                        speaker=seg_info.speaker,
                        gender=seg_info.gender,
                        text=seg_info.text,
                    ))
    return SupervisionSet.from_segments(segments)
def prepare_supervision_other(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]
) -> SupervisionSet:
    """
    Create supervisions for non-IHM AMI mic settings (ihm-mix, sdm, mdm), where
    all sources of a recording share a single supervision stream on channel 0.
    """
    # NOTE(review): keying on key[0] alone means that if several annotation keys
    # share a session id, later entries silently overwrite earlier ones (the
    # sibling ICSI implementation aggregates them instead) -- confirm intended.
    annotation_by_id = {
        (key[0]): annot
        for key, annot in annotations.items()
    }
    segments = []
    for recording in audio:
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm and 16 for mdm)
        # will share supervision.
        source = recording.sources[0]
        if annotation is None:
            logging.warning(f'No annotation found for recording {recording.id}')
            continue
        # Multi-channel first source: warn, but still emit channel-0 supervisions.
        if (len(source.channels) > 1):
            logging.warning(f'More than 1 channels in recording {recording.id}. '
                            f'Creating supervision for channel 0 only.')
        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.begin_time
            # Zero/negative-length annotations are silently dropped.
            if duration > 0:
                segments.append(SupervisionSegment(
                    id=f'{recording.id}-{seg_idx}',
                    recording_id=recording.id,
                    start=seg_info.begin_time,
                    duration=duration,
                    channel=0,
                    language='English',
                    speaker=seg_info.speaker,
                    gender=seg_info.gender,
                    text=seg_info.text
                ))
    return SupervisionSet.from_segments(segments)
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Take an iterable of data types in Lhotse such as Recording, SupervisonSegment or Cut, and create the manifest of the corresponding type.
    When the iterable is empty, returns None.
    """
    iterator = iter(items)
    # Peek at the first element to learn which manifest type to build.
    try:
        head = next(iterator)
    except StopIteration:
        return None
    # Re-attach the consumed element so the full stream is preserved.
    rest = chain([head], iterator)
    if isinstance(head, Recording):
        return RecordingSet.from_recordings(rest)
    if isinstance(head, SupervisionSegment):
        return SupervisionSet.from_segments(rest)
    if isinstance(head, (Cut, MixedCut)):
        return CutSet.from_cuts(rest)
    if isinstance(head, Features):
        raise ValueError("FeatureSet generic construction from iterable is not possible, as the config information "
                         "would have been lost. Call FeatureSet.from_features() directly instead.")
    raise ValueError(f"Unknown type of manifest item: {head}")
def search_supervision_set():
    """Create a fixture SupervisionSet covering two recordings and two channels."""
    segments = [
        SupervisionSegment(id="s1", recording_id="r1", start=0, duration=5.0, channel=0),
        SupervisionSegment(id="s2", recording_id="r1", start=4.5, duration=2.0, channel=1),
        SupervisionSegment(id="s3", recording_id="r1", start=8.0, duration=3.0, channel=0),
        SupervisionSegment(id="s4", recording_id="r2", start=1, duration=5.0, channel=0),
    ]
    return SupervisionSet.from_segments(segments)
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()
    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    # Durations come from reco2dur; it is required so num_samples can be derived.
    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    # wav.scp entries ending in '|' are Kaldi piped commands, not file paths.
    recording_set = RecordingSet.from_recordings(
        Recording(id=recording_id,
                  sources=[
                      AudioSource(type='command' if path_or_cmd.
                                  endswith('|') else 'file',
                                  channels=[0],
                                  source=path_or_cmd[:-1] if path_or_cmd.
                                  endswith('|') else path_or_cmd)
                  ],
                  sampling_rate=sampling_rate,
                  num_samples=int(durations[recording_id] * sampling_rate),
                  duration=durations[recording_id])
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]
        # Optional side tables; load_kaldi_text_mapping presumably tolerates
        # missing files / keys -- TODO confirm.
        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')
        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            # Stream matrices sequentially to avoid loading all features at once.
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            # Features cannot be imported without knowing the frame shift.
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
def prepare_librispeech(
        corpus_dir: Pathlike,
        dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    # Utterance IDs like "84-121123-0000" map onto the
                    # directory layout 84/121123/84-121123-0000.flac.
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        # NOTE(review): this indexes torchaudio.info()'s return value as a
                        # tuple -- that is the legacy torchaudio API; confirm the pinned
                        # torchaudio version still returns (signal_info, encoding_info).
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')
        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=metadata[idx].audio_info.length / metadata[idx].audio_info.rate
            )
            for idx in metadata
        )
        # Supervision: one full-length segment per utterance; the speaker ID is the
        # prefix of the utterance ID before the first '-'.
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )
        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')
        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }
    return manifests
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of dataset to prepare, all for all the
                          parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :num_jobs Number of workers to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts
    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}
    # The whole corpus is described by one big JSON manifest.
    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file : {raw_manifests_path}"
    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    raw_manifests = json.load(open(raw_manifests_path, "r", encoding="utf8"))

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
            ex.map(
                parse_utterance,
                raw_manifests["audios"],
                repeat(corpus_dir),
                repeat(subsets),
            ),
            desc="Processing WenetSpeech JSON entries",
        ):
            # `segments` maps part name -> list of supervisions; the recording
            # is added once for every part in which it has segments.
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        # Reconcile recordings/supervisions (drop or trim mismatched items).
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)
        if output_dir is not None:
            supervisions.to_file(output_dir / f"supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"recordings_{sub}.jsonl.gz")
        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2). Here, bin `1`
    means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken.
    It contains noise tags and phone transcription in case the pronunciation
    differed from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources
        (default = True).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir
    # Accept either the LDC package root or the `cmu_kids` subdirectory itself.
    corpus_dir = corpus_dir.parent if corpus_dir.stem == "cmu_kids" else corpus_dir

    recordings = []
    supervisions = []

    # Get prompt transcripts for all utterances.
    utterances = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'sentence.tbl', 'r') as f:
        for line in f:
            # Columns: utterance id, count (unused here), prompt text.
            utt, _, text = line.strip().split('\t')
            utterances[utt] = text

    # Get speaker metadata.
    speaker_info = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'speaker.tbl', 'r') as f:
        # Skip the 2-line header.
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT  BIN2
            # fabm  SUM95   3/9     100  62
            # facs  SUM95   2/8     90   55
            spk, pop, gr_age, _, _ = line.strip().split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    # Iterate through all transcriptions and add to supervisions.
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'transcrp.tbl', 'r') as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            # Transcription id encodes speaker, utterance, and bin:
            # e.g. "fabm1aa1" -> speaker "fabm", utterance "1aa", bin 1.
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            # Renamed from `bin` to avoid shadowing the builtin.
            trn_bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (
                corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' / f'{trn_id}.sph'
            )
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3
            )
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        # "NA" means the metadata table had no value.
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': trn_bin,
                        'spoken_transcript': transcript,
                    },
                )
            )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset
        part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: number of parallel workers used to parse utterance lines.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # Resolve the requested parts: either the mini corpus, auto-detection
    # against the known LibriSpeech/mini split names, or an explicit selection.
    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*"))
    elif dataset_parts == "auto":
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(
                    f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob("*.trans.txt"),
                                   desc="Distributing tasks", leave=False):
                # Alignments are optional; when the sibling *.alignment.txt file
                # exists, its contents are attached to the parsed utterances.
                alignments = {}
                ali_path = trans_path.parent / (trans_path.stem.split(".")[0] +
                                                ".alignment.txt")
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line,
                                      alignments))

            # Collect results in submission order; None means the line
            # could not be parsed into a (Recording, SupervisionSegment) pair.
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the DIHARD III corpus.
    We create two manifests: one with recordings, and the other one with
    supervisions containing speaker id and timestamps.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12),
        e.g. /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02),
        e.g. /data/corpora/LDC/LDC2021E02
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM
        segments (see use in dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory
        for recordings
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = dev_audio_dir if part == "dev" else eval_audio_dir
        # Skip parts whose corpus directory was not supplied or does not exist.
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue

        rttm_paths = list(check_and_rglob(audio_dir, "*.rttm"))
        uem_paths = list(check_and_rglob(audio_dir, "*.uem"))

        recordings = RecordingSet.from_dir(audio_dir, "*.flac", num_jobs=num_jobs)

        # Read metadata for recordings
        metadata = parse_metadata(
            list(check_and_rglob(audio_dir, "recordings.tbl"))[0])

        # Index RTTM/UEM files by recording id once; the original performed a
        # linear scan over all files for every recording (O(recordings * files)).
        rttm_by_id = {p.stem: p for p in rttm_paths}
        uem_by_id = {p.stem: p for p in uem_paths}

        supervisions = SupervisionSet.from_segments(
            chain.from_iterable(
                make_rttm_segments(
                    rttm_path=rttm_by_id[recording.id],
                    recording=recording,
                    metadata=metadata[recording.id],
                )
                for recording in recordings))
        if uem_manifest:
            uem = SupervisionSet.from_segments(
                chain.from_iterable(
                    make_uem_segments(
                        uem_path=uem_by_id[recording.id],
                        recording=recording,
                    )
                    for recording in recordings))

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }
        if uem_manifest:
            manifests[part].update({"uem": uem})
    return manifests
def prepare_aishell(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        # Each line is: "<utt_id> <word1> <word2> ...".
        # Iterate the file lazily instead of materializing it with readlines().
        for line in f:
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])

    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        # rglob() already implies a recursive '**/' prefix, so the pattern is
        # just '*.wav' (the original '**/*.wav' duplicated the recursion).
        for audio_path in wav_path.rglob('*.wav'):
            idx = audio_path.stem
            # Directory layout is .../<speaker>/<utt_id>.wav
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language='Chinese',
                speaker=speaker,
                text=text.strip(),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests