def test_mix_same_recording_channels():
    recording = Recording(
        'rec', sampling_rate=8000, num_samples=30 * 8000, duration=30,
        sources=[
            AudioSource('file', channels=[0], source='irrelevant1.wav'),
            AudioSource('file', channels=[1], source='irrelevant2.wav')
        ]
    )
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, recording=recording),
        Cut('cut2', start=0, duration=30, channel=1, recording=recording)
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
def test_serialization(format, compressed):
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='text/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat text/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            recording_set.to_yaml(f.name)
            deserialized = RecordingSet.from_yaml(f.name)
        if format == 'json':
            recording_set.to_json(f.name)
            deserialized = RecordingSet.from_json(f.name)
    assert deserialized == recording_set
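# The `format` and `compressed` arguments above are supplied by pytest
# parametrization. A minimal sketch of the decorators such a test typically
# carries (the value lists below are assumptions, not taken from the
# original source):
import pytest

@pytest.mark.parametrize('format', ['yaml', 'json'])
@pytest.mark.parametrize('compressed', [False, True])
def test_serialization_sketch(format, compressed):
    ...  # body as in test_serialization above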
def test_serialization():
    audio_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channel_ids=[0],
                    source='text/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channel_ids=[1],
                    source='cat text/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration_seconds=0.5
        )
    ])
    with NamedTemporaryFile() as f:
        audio_set.to_yaml(f.name)
        deserialized = RecordingSet.from_yaml(f.name)
    assert deserialized == audio_set
def test_audio_url_downloading():
    audio_source = AudioSource(
        type="url",
        channels=[0],
        source="https://github.com/lhotse-speech/lhotse/blob/master/test/fixtures/mono_c0.wav?raw=true",
    )
    audio_source.load_audio()
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    sampling_rate = int(audio["sample_rate"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=Seconds(audio["duration"]), sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=Seconds(audio["duration"]),
    )
    segments = []
    for seg in audio["segments"]:
        segments.append(
            SupervisionSegment(
                id=seg["sid"],
                recording_id=audio["aid"],
                start=Seconds(seg["begin_time"]),
                duration=round(Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8),
                channel=0,
                language="English",
                speaker=seg["speaker"],
                text=seg["text_tn"],
            )
        )
    return recording, segments
def libri_cut():
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage/fc37eb69-43a8-4e6f-a302-646a76606b38.llc',
            storage_type='lilcom',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            duration=16.04,  # 256640 samples / 16000 Hz
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Preparing audio"):
        session_name = audio_path.parts[-2]
        if audio_path.suffix == ".wav":
            audio_sf = sf.SoundFile(str(audio_path))
            num_frames = audio_sf.frames
            num_channels = audio_sf.channels
            samplerate = audio_sf.samplerate
        else:
            audio_sf, samplerate = read_sph(audio_path)
            num_channels, num_frames = audio_sf.shape
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(num_channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=samplerate,
                num_samples=num_frames,
                duration=num_frames / samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
def dummy_recording():
    return Recording(
        id='irrelevant',
        sources=[AudioSource(type='file', channels=[0], source='irrelevant')],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0
    )
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)
    recordings = []
    for session_name, channel_paths in channel_wavs.items():
        audio_sf = sf.SoundFile(str(channel_paths[0]))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type="file", channels=[idx], source=str(audio_path))
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
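# A minimal usage sketch for the helper above (the corpus root and glob are
# hypothetical; the grouping key `p.parts[-3]` assumes a layout like
# <root>/<session>/<subdir>/<channel>.wav):
from pathlib import Path

corpus_root = Path('/data/my_corpus')  # hypothetical location
audio_paths = sorted(corpus_root.rglob('*.wav'))
recordings = prepare_audio_grouped(audio_paths)
recordings.to_file('recordings.jsonl.gz')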
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    # Opus-format audio is force-decoded at 48kHz; the original sampling rate is ignored.
    opus_decoding_sample_rate = 48000
    recording = Recording(
        id=audio['aid'],
        sources=[
            AudioSource(
                type='file',
                channels=list(range(int(audio['channels']))),
                source=f'{root_path}/{audio["path"]}'
            )
        ],
        num_samples=round(opus_decoding_sample_rate * Seconds(audio['duration']), ndigits=8),
        sampling_rate=opus_decoding_sample_rate,
        duration=Seconds(audio['duration'])
    ).resample(int(audio['sample_rate']))
    segments = []
    for seg in audio['segments']:
        segments.append(
            SupervisionSegment(
                id=seg['sid'],
                recording_id=audio['aid'],
                start=Seconds(seg['begin_time']),
                duration=round(Seconds(seg['end_time'] - seg['begin_time']), ndigits=8),
                channel=0,
                language='English',
                speaker=seg['speaker'],
                text=seg['text_tn']
            )
        )
    return recording, segments
def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=audio["duration"], sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    segments = defaultdict(dict)
    for sub in subsets:
        segments[sub] = []
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            # sampling_rate is keyword-only in lhotse's add_durations
            duration=add_durations(seg["end_time"], -seg["begin_time"], sampling_rate=sampling_rate),
            language="Chinese",
            text=seg["text"].strip(),
        )
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
def libri_cut():
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            frame_shift=0.01,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage',
            storage_key='30c2440c-93cb-4e83-b382-f2a59b3859b4.llc',
            storage_type='lilcom_files',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            duration=16.04,  # 256640 samples / 16000 Hz
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
def dummy_recording():
    return Recording(
        id="irrelevant",
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
def recording():
    return Recording(
        id='rec',
        sources=[
            AudioSource(type='file', channels=[0, 1], source='test/fixtures/stereo.wav')
        ],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0
    )
def cut_with_relative_paths():
    return Cut(
        'cut', 0, 10, 0,
        features=Features(
            type='fbank',
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type='lilcom_files',
            storage_path='storage_dir',
            storage_key='feats.llc',
            start=0,
            duration=10,
        ),
        recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0),
    )
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id="rec1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10,
            sources=[
                AudioSource(type="file", channels=[0], source="dummy.wav")
            ],
        )
    ])
def recording():
    return Recording(
        id="rec",
        sources=[
            AudioSource(type="file", channels=[0, 1], source="test/fixtures/stereo.wav")
        ],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
    )
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id='rec1',
            sampling_rate=16000,
            num_samples=160000,
            duration=10,
            sources=[
                AudioSource(type='file', channels=[0], source='dummy.wav')
            ]
        )
    ])
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    rec = Recording.from_sphere(
        'test/fixtures/stereo.sph',
        relative_path_depth=relative_path_depth
    )
    assert rec == Recording(
        id='stereo',
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[
            AudioSource(type='file', channels=[0, 1], source=expected_source_path)
        ]
    )
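# The arguments above come from pytest parametrization. A sketch of how the
# pairing is typically declared (the value pairs below are assumptions based
# on the fixture path, not taken from the original source):
import pytest

@pytest.mark.parametrize(
    ['relative_path_depth', 'expected_source_path'],
    [
        (None, 'test/fixtures/stereo.sph'),  # keep the path as given
        (1, 'stereo.sph'),                   # keep only the last component
        (2, 'fixtures/stereo.sph'),          # keep the last two components
    ],
)
def test_recording_from_sphere_sketch(relative_path_depth, expected_source_path):
    ...  # body as in test_recording_from_sphere above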
def test_mix_same_recording_channels():
    recording = Recording(
        "rec",
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=[
            AudioSource("file", channels=[0], source="irrelevant1.wav"),
            AudioSource("file", channels=[1], source="irrelevant2.wav"),
        ],
    )
    cut_set = CutSet.from_cuts([
        MonoCut("cut1", start=0, duration=30, channel=0, recording=recording),
        MonoCut("cut2", start=0, duration=30, channel=1, recording=recording),
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
def dummy_recording_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        recs = RecordingSet.from_recordings([
            Recording(
                id="rec1",
                sampling_rate=16000,
                num_samples=160000,
                duration=10,
                sources=[
                    AudioSource(type="file", channels=[0], source="dummy.wav")
                ],
            )
        ])
        recs.to_file(f.name)
        f.flush()
        yield RecordingSet.from_jsonl_lazy(f.name)
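# Usage sketch: a lazy RecordingSet streams items from the JSONL manifest
# instead of holding them all in memory, but it iterates just like the eager
# variant (hypothetical consumer code, for illustration):
def total_hours(recs: RecordingSet) -> float:
    # Works for both eager and lazy sets, since both are iterable.
    return sum(r.duration for r in recs) / 3600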
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)
    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(), desc="Processing audio files"):
        audio_sf = sf.SoundFile(str(channel_paths[0]))
        sources = []
        all_mono = True
        for idx, audio_path in enumerate(sorted(channel_paths)):
            audio = sf.SoundFile(str(audio_path))
            if audio.channels > 1:
                logging.warning(
                    f"Skipping recording {session_name} since it has a stereo channel"
                )
                all_mono = False
                break
            sources.append(
                AudioSource(type="file", channels=[idx], source=str(audio_path))
            )
        if not all_mono:
            continue
        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
def prepare_audio_grouped(
    audio_paths: List[Pathlike],
    channel_to_idx_map: Dict[str, Dict[str, int]] = None,
) -> RecordingSet:
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-2], audio_paths)

    if channel_to_idx_map is None:
        channel_to_idx_map = defaultdict(dict)
    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(), desc="Preparing audio"):
        if session_name not in channel_to_idx_map:
            channel_to_idx_map[session_name] = {
                c: idx for idx, c in enumerate(["chanE", "chanF", "chan6", "chan7"])
            }
        audio_sf, samplerate = read_sph(channel_paths[0])
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=[channel_to_idx_map[session_name][audio_path.stem]],
                        source=str(audio_path),
                    )
                    for audio_path in sorted(channel_paths)
                    if audio_path.stem in channel_to_idx_map[session_name]
                ],
                sampling_rate=samplerate,
                num_samples=audio_sf.shape[1],
                duration=audio_sf.shape[1] / samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
def cut_with_relative_paths():
    return MonoCut(
        "cut",
        0,
        10,
        0,
        features=Features(
            type="fbank",
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type="lilcom_files",
            storage_path="storage_dir",
            storage_key="feats.llc",
            start=0,
            duration=10,
            frame_shift=0.01,
        ),
        recording=Recording("rec", [AudioSource("file", [0], "audio.wav")], 8000, 80000, 10.0),
    )
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in audio_paths:
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type='file',
                        channels=list(range(audio_sf.channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Processing audio files"):
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(audio_sf.channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            )
        )
    return RecordingSet.from_recordings(recordings)
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ...)
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=metadata[idx].audio_info.length / metadata[idx].audio_info.rate
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    A SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them
    might not be handled yet. A feats.scp file is imported only when kaldiio
    is available and frame_shift is provided.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id]
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=segment_id,
                recording_id=recording_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                gender=genders[speakers[segment_id]]
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(
                    type='kaldiio',
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id if supervision_set is not None else utt_id,
                    channels=0
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must not be None. Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
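# A minimal usage sketch for load_kaldi_data_dir, assuming a standard Kaldi
# data directory containing wav.scp, reco2dur and (optionally) segments,
# text, utt2spk, etc. (the directory path below is hypothetical):
recording_set, supervision_set, feature_set = load_kaldi_data_dir(
    'data/train',         # hypothetical Kaldi data dir
    sampling_rate=16000,  # wav.scp does not store it, so it must be given
    frame_shift=0.01,     # only needed to import feats.scp via kaldiio
)
recording_set.to_file('recordings.jsonl.gz')
if supervision_set is not None:
    supervision_set.to_file('supervisions.jsonl.gz')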
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)

    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir, heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='answers',
                utterance_id=utt_id,
                transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # audio metadata: channels, sample rate, frames, duration ...
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join([uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path))
                ],
                sampling_rate=int(metadata[idx].audio_info.samplerate),
                num_samples=metadata[idx].audio_info.frames,
                duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='Spanish',
                speaker=idx.split('-')[-2],
                text=metadata[idx].text)
            for idx in audio.recordings)

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
def prepare_librimix(
    librimix_csv: Pathlike,
    output_dir: Optional[Pathlike] = None,
    with_precomputed_mixtures: bool = False,
    sampling_rate: int = 16000,
    min_segment_seconds: Seconds = 3.0,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    import pandas as pd

    assert Path(librimix_csv).is_file(), f"No such file: {librimix_csv}"
    df = pd.read_csv(librimix_csv)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    # First, create the audio manifest that specifies the pairs of source recordings
    # to be mixed together.
    audio_sources = RecordingSet.from_recordings(
        Recording(
            id=row["mixture_ID"],
            sources=[
                AudioSource(type="file", channels=[0], source=row["source_1_path"]),
                AudioSource(type="file", channels=[1], source=row["source_2_path"]),
            ],
            sampling_rate=sampling_rate,
            num_samples=int(row["length"]),
            duration=row["length"] / sampling_rate,
        )
        for idx, row in df.iterrows()
        if row["length"] / sampling_rate > min_segment_seconds
    )
    supervision_sources = make_corresponding_supervisions(audio_sources)
    validate_recordings_and_supervisions(audio_sources, supervision_sources)
    if output_dir is not None:
        audio_sources.to_file(output_dir / "librimix_recordings_sources.jsonl.gz")
        supervision_sources.to_file(output_dir / "librimix_supervisions_sources.jsonl.gz")
    manifests["sources"] = {
        "recordings": audio_sources,
        "supervisions": supervision_sources,
    }

    # When requested, create an audio manifest for the pre-computed mixtures.
    # A different way of performing the mix would be using Lhotse's on-the-fly
    # overlaying of audio Cuts.
    if with_precomputed_mixtures:
        audio_mix = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(type="file", channels=[0], source=row["mixture_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            )
            for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds
        )
        supervision_mix = make_corresponding_supervisions(audio_mix)
        validate_recordings_and_supervisions(audio_mix, supervision_mix)
        if output_dir is not None:
            audio_mix.to_file(output_dir / "librimix_recordings_mix.jsonl.gz")
            supervision_mix.to_file(output_dir / "librimix_supervisions_mix.jsonl.gz")
        manifests["premixed"] = {
            "recordings": audio_mix,
            "supervisions": supervision_mix,
        }

    # When the LibriMix CSV specifies noises, we create a separate RecordingSet for them,
    # so that we can extract their features and overlay them as Cuts later.
    if "noise_path" in df:
        audio_noise = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(type="file", channels=[0], source=row["noise_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            )
            for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds
        )
        supervision_noise = make_corresponding_supervisions(audio_noise)
        validate_recordings_and_supervisions(audio_noise, supervision_noise)
        if output_dir is not None:
            audio_noise.to_file(output_dir / "librimix_recordings_noise.jsonl.gz")
            supervision_noise.to_file(output_dir / "librimix_supervisions_noise.jsonl.gz")
        manifests["noise"] = {
            "recordings": audio_noise,
            "supervisions": supervision_noise,
        }

    return manifests
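# A usage sketch for the LibriMix preparation above (the CSV path and output
# directory are hypothetical):
manifests = prepare_librimix(
    'Libri2Mix/metadata/mixture_train-100_mix_both.csv',  # hypothetical CSV
    output_dir='manifests/librimix',
    with_precomputed_mixtures=True,
)
source_recordings = manifests['sources']['recordings']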