def test_serialization():
    """Round-trip a two-source RecordingSet through YAML and expect equality."""
    file_source = AudioSource(
        type='file',
        channel_ids=[0],
        source='text/fixtures/mono_c0.wav',
    )
    command_source = AudioSource(
        type='command',
        channel_ids=[1],
        source='cat text/fixtures/mono_c1.wav',
    )
    recording = Recording(
        id='x',
        sources=[file_source, command_source],
        sampling_rate=8000,
        num_samples=4000,
        duration_seconds=0.5,
    )
    audio_set = RecordingSet.from_recordings([recording])

    # The temporary file is only used as a path to write to and read back from.
    with NamedTemporaryFile() as tmp:
        audio_set.to_yaml(tmp.name)
        deserialized = RecordingSet.from_yaml(tmp.name)

    assert deserialized == audio_set
def test_serialization(format, compressed):
    """
    Round-trip a two-source RecordingSet through the requested ``format``
    ('yaml' or 'json') and expect the deserialized set to equal the original.
    When ``compressed`` is truthy, the temporary file gets a '.gz' suffix.
    """
    sources = [
        AudioSource(
            type='file',
            channels=[0],
            source='text/fixtures/mono_c0.wav',
        ),
        AudioSource(
            type='command',
            channels=[1],
            source='cat text/fixtures/mono_c1.wav',
        ),
    ]
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=sources,
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5,
        )
    ])

    suffix = '.gz' if compressed else ''
    with NamedTemporaryFile(suffix=suffix) as tmp:
        if format == 'yaml':
            recording_set.to_yaml(tmp.name)
            deserialized = RecordingSet.from_yaml(tmp.name)
        elif format == 'json':
            recording_set.to_json(tmp.name)
            deserialized = RecordingSet.from_json(tmp.name)

    assert deserialized == recording_set
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Build recording and supervision manifests for one CommonVoice TSV split.

    Each TSV row is parsed with ``parse_utterance``; rows for which it returns
    ``None`` are skipped, and rows that raise are logged and skipped.
    The manifests are streamed to ``cv_recordings_{lang}_{part}.jsonl.gz`` and
    ``cv_supervisions_{lang}_{part}.jsonl.gz`` inside ``output_dir``.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects re-opened lazily
        from the files that were just written.
    :raises ValueError: when pandas is not installed.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")

    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)

    # Read the metadata
    metadata = pd.read_csv(lang_path / f"{part}.tsv", sep="\t")

    recordings_path = output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz"
    supervisions_path = output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz"

    # Scan all the audio files
    with RecordingSet.open_writer(recordings_path, overwrite=False) as recs_writer:
        with SupervisionSet.open_writer(supervisions_path, overwrite=False) as sups_writer:
            rows = tqdm(
                metadata.iterrows(),
                desc="Processing audio files",
                total=len(metadata),
            )
            for idx, row in rows:
                try:
                    parsed = parse_utterance(row, lang_path, lang)
                    if parsed is None:
                        continue
                    recording, segment = parsed
                    validate_recordings_and_supervisions(recording, segment)
                    recs_writer.write(recording)
                    sups_writer.write(segment)
                except Exception as e:
                    # Best effort: a single bad row must not abort the whole split.
                    logging.error(
                        f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                        f"Original error type: '{type(e)}' and message: {e}")
                    continue

    # Both writers are closed at this point; re-open what they wrote.
    return (
        RecordingSet.from_jsonl_lazy(recs_writer.path),
        SupervisionSet.from_jsonl_lazy(sups_writer.path),
    )
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate both manifests on their own and then cross-check them.

    Single ``Recording`` / ``SupervisionSegment`` objects are wrapped into
    one-element sets, and lazy sets are materialized before validation.

    Hard failures (``AssertionError``): a supervision that falls outside
    ``[0, recording.duration]`` (with a 1e-3 tolerance), or that refers to
    a channel its recording does not have.

    Soft failures (logged warnings): recording ids that appear in only one
    of the two manifests.

    :param recordings: a recording manifest or a single recording.
    :param supervisions: a supervision manifest or a single segment.
    :param read_data: forwarded to ``validate`` for the recordings.
    """
    # Promote single items to one-element manifests.
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])

    # Lazy manifests are rebuilt from their iterators before use.
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))
    if supervisions.is_lazy:
        supervisions = SupervisionSet.from_segments(iter(supervisions))

    validate(recordings, read_data=read_data)
    validate(supervisions)

    # Errors
    tolerance = 1e-3
    for sup in supervisions:
        # NOTE(review): presumably this lookup raises for an unknown recording id,
        # which would pre-empt the "missing recordings" warning below - confirm.
        rec = recordings[sup.recording_id]
        assert -tolerance <= sup.start <= sup.end <= rec.duration + tolerance, (
            f"Supervision {sup.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{sup.start}, {sup.end}]; recording spans [0, {rec.duration}])"
        )
        assert sup.channel in rec.channel_ids, (
            f"Supervision {sup.id}: channel {sup.channel} does not exist in its corresponding Recording "
            f"(recording channels: {rec.channel_ids})"
        )

    # Warnings
    ids_in_recordings = frozenset(rec.id for rec in recordings)
    ids_in_supervisions = frozenset(sup.recording_id for sup in supervisions)

    unmatched_recordings = ids_in_recordings - ids_in_supervisions
    if unmatched_recordings:
        logging.warning(
            f"There are {len(unmatched_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )

    unmatched_supervisions = ids_in_supervisions - ids_in_recordings
    if unmatched_supervisions:
        logging.warning(
            f"There are {len(unmatched_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet."
        )
def prepare_audio_grouped(audio_paths: List[Pathlike], ) -> RecordingSet:
    """
    Build one multi-source Recording per session from single-channel files.

    Files are grouped by the third-from-last component of their path, which is
    used as the recording id. Within a session, the files are sorted and the
    position in that order becomes the channel index of the source.
    Sampling rate, number of samples and duration are read from the header of
    the first file of each group.

    :param audio_paths: paths of the audio files (objects exposing ``.parts``).
    :return: a RecordingSet with one Recording per session.
    """
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in channel_wavs.items():
        # Only header metadata is needed; the context manager closes the file
        # handle instead of leaving one open per session.
        with sf.SoundFile(str(channel_paths[0])) as audio_sf:
            sampling_rate = audio_sf.samplerate
            num_samples = audio_sf.frames

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type="file", channels=[idx], source=str(audio_path))
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=sampling_rate,
                num_samples=num_samples,
                duration=num_samples / sampling_rate,
            ))
    return RecordingSet.from_recordings(recordings)
def trim_supervisions_to_recordings(
    recordings: RecordingSet, supervisions: SupervisionSet
) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` in which no supervision
    exceeds the duration of its corresponding :class:`~lhotse.audio.Recording`.

    Supervisions that start after the end of the recording are dropped;
    supervisions that merely end after it are trimmed to the recording duration.
    A warning with the respective count is logged for each of the two cases.
    """
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))

    kept = []
    num_removed = 0
    num_trimmed = 0
    for sup in supervisions:
        recording_end = recordings[sup.recording_id].duration
        if sup.start > recording_end:
            num_removed += 1
            continue
        if sup.end > recording_end:
            num_trimmed += 1
            sup = sup.trim(recording_end)
        kept.append(sup)

    if num_removed:
        logging.warning(
            f"Removed {num_removed} supervisions starting after the end of the recording."
        )
    if num_trimmed:
        logging.warning(
            f"Trimmed {num_trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(kept)
def dummy_recording_set_lazy():
    """
    Yield a lazily-opened RecordingSet holding a single dummy recording.

    The set is first written to a temporary ``.jsonl.gz`` file and then
    re-opened with ``RecordingSet.from_jsonl_lazy``.
    """
    source = AudioSource(type="file", channels=[0], source="dummy.wav")
    recording = Recording(
        id="rec1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10,
        sources=[source],
    )
    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp:
        RecordingSet.from_recordings([recording]).to_file(tmp.name)
        tmp.flush()
        # Yield inside the `with` so the backing file still exists
        # while the consumer uses the lazy manifest.
        yield RecordingSet.from_jsonl_lazy(tmp.name)
def prepare_audio_single(audio_paths: List[Pathlike], ) -> RecordingSet:
    """
    Build one single-source Recording per audio file.

    The recording id is the name of the file's parent directory. ``.wav`` files
    are inspected through their soundfile header; any other file is read with
    ``read_sph``, and its array shape is unpacked as (channels, frames).

    :param audio_paths: paths of the audio files (objects exposing ``.parts``
        and ``.suffix``).
    :return: a RecordingSet with one Recording per input path.
    """
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Preparing audio"):
        session_name = audio_path.parts[-2]
        if audio_path.suffix == ".wav":
            # Only header metadata is needed; close the handle right away
            # instead of leaking one open file per recording.
            with sf.SoundFile(str(audio_path)) as audio_sf:
                num_frames = audio_sf.frames
                num_channels = audio_sf.channels
                samplerate = audio_sf.samplerate
        else:
            audio, samplerate = read_sph(audio_path)
            num_channels, num_frames = audio.shape

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(num_channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=samplerate,
                num_samples=num_frames,
                duration=num_frames / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
def remove_missing_recordings_and_supervisions(
    recordings: RecordingSet,
    supervisions: SupervisionSet,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Drop every manifest entry whose counterpart is missing in the other manifest.

    Recordings without any supervision, and supervisions pointing to a recording id
    that is not in ``recordings``, are filtered out. A warning with the number of
    affected recording ids is logged for each side that needed fixing.

    :param recordings: a :class:`RecordingSet` object.
    :param supervisions: a :class:`SupervisionSet` object.
    :return: A pair of :class:`RecordingSet` and :class:`SupervisionSet` with removed entries.
    """
    ids_in_recordings = frozenset(rec.id for rec in recordings)
    ids_in_supervisions = frozenset(sup.recording_id for sup in supervisions)

    only_in_recordings = ids_in_recordings - ids_in_supervisions
    only_in_supervisions = ids_in_supervisions - ids_in_recordings

    if only_in_recordings:
        recordings = recordings.filter(
            lambda r: r.id not in only_in_recordings)
        logging.warning(
            f"Removed {len(only_in_recordings)} recordings with no corresponding supervisions."
        )

    if only_in_supervisions:
        supervisions = supervisions.filter(
            lambda s: s.recording_id not in only_in_supervisions)
        logging.warning(
            f"Removed {len(only_in_supervisions)} supervisions with no corresponding recordings."
        )

    return recordings, supervisions
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Create the manifest matching the type of the items in the iterable.

    The type of the first item decides which manifest is built:
    Recording -> RecordingSet, SupervisionSegment -> SupervisionSet,
    Cut / MixedCut -> CutSet.

    :param items: an iterable of Lhotse manifest items.
    :return: the manifest, or ``None`` when the iterable is empty.
    :raises ValueError: for Features items (a FeatureSet cannot be rebuilt
        generically) and for any unrecognized item type.
    """
    iterator = iter(items)
    try:
        head = next(iterator)
    except StopIteration:
        return None

    # Put the peeked element back in front of the remaining ones.
    all_items = chain([head], iterator)

    if isinstance(head, Recording):
        return RecordingSet.from_recordings(all_items)
    if isinstance(head, SupervisionSegment):
        return SupervisionSet.from_segments(all_items)
    if isinstance(head, (Cut, MixedCut)):
        return CutSet.from_cuts(all_items)
    if isinstance(head, Features):
        raise ValueError(
            "FeatureSet generic construction from iterable is not possible, as the config information "
            "would have been lost. Call FeatureSet.from_features() directly instead."
        )
    raise ValueError(f"Unknown type of manifest item: {head}")
def extract(
        recording_manifest: Pathlike,
        output_dir: Pathlike,
        feature_manifest: Optional[Pathlike],
        storage_type: str,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    # Fall back to a default Fbank extractor when no extractor config is given.
    if feature_manifest is not None:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)
    else:
        feature_extractor = Fbank()

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Storage types containing 'hdf5' get a 'feats.h5' path; all others get 'storage'.
    storage_name = 'feats.h5' if 'hdf5' in storage_type else 'storage'
    storage_path = output_dir / storage_name

    writer_type = get_writer(storage_type)
    with writer_type(storage_path, tick_power=lilcom_tick_power) as storage:
        builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs
        )
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    Two manifests are created: one with recordings, and one with text supervisions
    (one set of segments per channel, built from the per-side transcript files).

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments. When given, ``parse_and_add_sentiment_labels`` is called
        with it and the supervisions.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When False, recordings are created with ``relative_path_depth=3``;
        when True, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    # Transcript files are looked up by the part of their stem before the first dash.
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}

    def make_group(audio_path):
        # Audio stems use the 'sw0' prefix where the transcript keys use 'sw'.
        name = audio_path.stem.replace('sw0', 'sw')
        return {
            'audio': audio_path,
            'text-0': name_to_text[f'{name}A'],
            'text-1': name_to_text[f'{name}B'],
        }

    groups = [make_group(audio_path) for audio_path in audio_paths]

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(
            group['audio'],
            relative_path_depth=None if absolute_paths else 3,
        )
        for group in groups
    )

    # Groups and recordings are paired positionally; each pair yields segments for both channels.
    segments_per_channel = (
        make_segments(
            transcript_path=group[f'text-{channel}'],
            recording=recording,
            channel=channel,
            omit_silence=omit_silence
        )
        for group, recording in zip(groups, recordings)
        for channel in [0, 1]
    )
    supervisions = SupervisionSet.from_segments(chain.from_iterable(segments_per_channel))

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
def test_feature_set_builder():
    """Extract features for the YAML fixture and check the resulting metadata."""
    audio_set = RecordingSet.from_yaml('test/fixtures/audio.yml')
    with TemporaryDirectory() as output_dir:
        builder = FeatureSetBuilder(
            feature_extractor=FeatureExtractor(),
            output_dir=output_dir,
        )
        feature_set = builder.process_and_store_recordings(recordings=audio_set)

        assert len(feature_set) == 4

        feature_infos = list(feature_set)

        # Properties shared by all features
        expected_num_features = builder.feature_extractor.mfcc_fbank_common_config.num_mel_bins
        for features in feature_infos:
            # fbank is the default feature type
            assert features.type == 'fbank'
            # duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration / features.frame_shift)
            # num_features is preserved
            assert features.num_features == expected_num_features
            # lilcom is the default storage type
            assert features.storage_type == 'lilcom'

        # The first two recordings last 0.5 seconds, the remaining ones 1.0 second
        expectations = (
            (feature_infos[:2], 50, 0.5),
            (feature_infos[2:], 100, 1.0),
        )
        for subset, num_frames, duration in expectations:
            for features in subset:
                assert features.num_frames == num_frames
                assert features.duration == duration
def test_cut_set_reverb_rir_doesnt_duplicate_transforms(cut_with_supervision, rir):
    """Each reverberated cut must carry exactly one transform on its recording."""
    same_recording_cuts = [
        cut_with_supervision,
        cut_with_supervision.with_id("other-id"),
    ]
    cuts = CutSet.from_cuts(same_recording_cuts)
    rir_recordings = RecordingSet.from_recordings([rir])

    reverberated = cuts.reverb_rir(rir_recordings=rir_recordings)

    # Regression guard: multiple cuts referencing the same recording used to
    # attach their transforms to the same manifest.
    for cut in reverberated:
        assert len(cut.recording.transforms) == 1
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    Three manifests are created: one with recordings, one with segment supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When False, recordings are created with ``relative_path_depth=3``;
        when True, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            audio_path,
            relative_path_depth=None if absolute_paths else 3,
        )
        for audio_path in audio_paths
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        # Transcripts and recordings are paired positionally.
        supervisions_list = [
            make_supervisions(sgml_path, recording)
            for sgml_path, recording in zip(sgml_paths, recordings)
        ]

    def collect(key):
        # Flatten the per-transcript supervisions stored under ``key``.
        return SupervisionSet.from_segments(
            chain.from_iterable(sups[key] for sups in supervisions_list)
        )

    section_supervisions = collect("sections")
    segment_supervisions = collect("segments")

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    """
    Build a dummy manifest of the requested type with one item per id
    in ``range(begin_id, end_id)``.

    :param type_: one of RecordingSet, SupervisionSet or FeatureSet.
    :param begin_id: first item index (inclusive).
    :param end_id: last item index (exclusive).
    :return: the dummy manifest; ``None`` for any other ``type_``.
    """
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(
            map(dummy_recording, range(begin_id, end_id)))
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(
            map(dummy_supervision, range(begin_id, end_id)))
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(
            map(dummy_features, range(begin_id, end_id)))
    # Unsupported manifest types fall through without building anything.
    return None
def dummy_recording_set():
    """Return a RecordingSet with one mono, 10-second, 16 kHz dummy recording."""
    source = AudioSource(type='file', channels=[0], source='dummy.wav')
    recording = Recording(
        id='rec1',
        sampling_rate=16000,
        num_samples=160000,
        duration=10,
        sources=[source],
    )
    return RecordingSet.from_recordings([recording])
def prepare_ljspeech(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    Every line of ``metadata.csv`` is split on ``|`` into three fields; the first
    is used as the recording id and the second as the supervision text.
    Entries whose ``wavs/<id>.wav`` file does not exist are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with the RecordingSet and SupervisionSet under the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"

    recordings = []
    supervisions = []
    # The transcripts contain non-ASCII characters; decode explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(metadata_csv_path, encoding="utf-8") as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
def dummy_recording_set():
    """Return a one-element RecordingSet describing a dummy mono recording."""
    dummy_sources = [AudioSource(type="file", channels=[0], source="dummy.wav")]
    dummy = Recording(
        id="rec1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10,
        sources=dummy_sources,
    )
    recordings = [dummy]
    return RecordingSet.from_recordings(recordings)
def test_cut_set_reverb_rir(libri_cut_set, rir, affix_id):
    """
    Reverberating a CutSet keeps sampling rate, sample count and audio shape,
    and changes the cut ids (adding the '_rvb' suffix) only when ``affix_id`` is set.
    """
    rir_recordings = RecordingSet.from_recordings([rir])
    reverberated_cuts = libri_cut_set.reverb_rir(rir_recordings, affix_id=affix_id)

    for original, perturbed_rvb in zip(libri_cut_set, reverberated_cuts):
        if not affix_id:
            assert original.id == perturbed_rvb.id
        else:
            assert original.id != perturbed_rvb.id
            assert perturbed_rvb.id.endswith("_rvb")

        assert original.sampling_rate == perturbed_rvb.sampling_rate
        assert original.num_samples == perturbed_rvb.num_samples
        assert original.load_audio().shape == perturbed_rvb.load_audio().shape
def prepare_yesno(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The 60 wave files are sorted by path; the even-indexed ones form the "train"
    split and the odd-indexed ones the "test" split (30 files each).

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = sorted(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    # Alternate files between the two splits.
    train_set = wave_files[::2]
    test_set = wave_files[1::2]
    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in (("train", train_set), ("test", test_set)):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def split(manifest: Manifest, num_splits: int, randomize: bool = False) -> List[Manifest]:
    """
    Split a manifest into `num_splits` parts of (nearly) equal size.
    The element order can be randomized.

    The sizes of the parts differ by at most one item: when the number of items
    is not divisible by ``num_splits``, the leading parts receive one extra item.
    No part is ever empty.

    :param manifest: a RecordingSet, SupervisionSet, FeatureSet or CutSet.
    :param num_splits: number of parts to create; must be between 1 and ``len(manifest)``.
    :param randomize: when True, the items are shuffled before being split.
    :return: a list with ``num_splits`` manifests of the same type as ``manifest``.
    :raises ValueError: when ``num_splits`` is smaller than 1 or larger than the number
        of items, or when the manifest type is not recognized.
    """
    num_items = len(manifest)
    if num_splits < 1:
        raise ValueError(f"The number of splits must be positive (got {num_splits})")
    if num_splits > num_items:
        raise ValueError(
            f"Cannot split manifest into more chunks ({num_splits}) than its number of items {num_items}"
        )

    # Ceil-sized chunks can leave the trailing splits empty (e.g. 10 items into
    # 7 splits); spread the remainder over the leading splits instead.
    base_size, remainder = divmod(num_items, num_splits)
    split_indices = []
    begin = 0
    for split_idx in range(num_splits):
        end = begin + base_size + (1 if split_idx < remainder else 0)
        split_indices.append((begin, end))
        begin = end

    def maybe_randomize(items: Iterable[Any]) -> List[Any]:
        items = list(items)
        if randomize:
            random.shuffle(items)
        return items

    if isinstance(manifest, RecordingSet):
        contents = maybe_randomize(manifest.recordings.items())
        return [
            RecordingSet(recordings=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, SupervisionSet):
        contents = maybe_randomize(manifest.segments.items())
        return [
            SupervisionSet(segments=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    if isinstance(manifest, FeatureSet):
        contents = maybe_randomize(manifest.features)
        return [
            FeatureSet(
                features=contents[begin:end],
                feature_extractor=manifest.feature_extractor
            )
            for begin, end in split_indices
        ]

    if isinstance(manifest, CutSet):
        contents = maybe_randomize(manifest.cuts.items())
        return [
            CutSet(cuts=dict(contents[begin:end]))
            for begin, end in split_indices
        ]

    raise ValueError(f"Unknown type of manifest: {type(manifest)}")
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare recording and supervision manifests for parts of the GigaSpeech corpus.

    :param gigaspeech: an object exposing ``audios(part)`` and ``root_path``
        (presumably a ``speechcolab`` ``GigaSpeech`` instance - confirm with callers).
    :param dataset_parts: ``'auto'`` to prepare ``{XL}``, ``{DEV}`` and ``{TEST}``,
        a single part name, or a sequence of part names.
    :param output_dir: directory where the manifests are written as
        ``recordings_{part}.jsonl`` / ``supervisions_{part}.jsonl``. When given, previously
        written manifests are looked up first and returned if found. Can be omitted to avoid writing.
    :param num_jobs: number of threads used to parse the utterances.
    :return: a dict mapping each part name to a dict with the keys
        'recordings' and 'supervisions'.
    :raises ImportError: when the optional ``speechcolab`` dependency is missing.
    """
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech  # noqa: F401
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    if dataset_parts == 'auto':
        subsets = ('{XL}', '{DEV}', '{TEST}')
    elif isinstance(dataset_parts, str):
        # A bare string would otherwise be iterated character by character below.
        subsets = (dataset_parts,)
    else:
        subsets = dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # The resolved part names are passed (not the raw 'auto' sentinel) so that the
        # lookup refers to the same names used when writing the files below.
        maybe_manifests = read_manifests_if_cached(dataset_parts=subsets, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = [
                ex.submit(parse_utterance, audio, gigaspeech.root_path)
                for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False)
            ]

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
def test_feature_set_builder(storage_fn):
    """
    Extract features for the JSON fixture with an 8 kHz Fbank extractor and check
    both the stored metadata and the stored arrays.
    """
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Properties shared by all features
        for features in feature_infos:
            # the feature type name of the extractor
            assert features.type == "kaldi-fbank"
            # duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration / features.frame_shift)
            # num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_filters
            # the storage type metadata matches
            assert features.storage_type == storage.name

            # the metadata is consistent with the data shapes
            stored = features.load()
            num_rows, num_cols = stored.shape[0], stored.shape[1]
            assert num_rows == features.num_frames
            assert num_cols == features.num_features

            # the stored features are the same as the "freshly extracted" features
            recording = recordings[features.recording_id]
            samples = recording.load_audio(channels=features.channels)
            expected = extractor.extract(
                samples=samples,
                sampling_rate=recording.sampling_rate,
            )
            np.testing.assert_almost_equal(stored, expected, decimal=2)

        # The first two feature sets belong to 0.5 second recordings,
        # the remaining ones to 1.0 second recordings.
        expectations = (
            (feature_infos[:2], 50, 0.5),
            (feature_infos[2:], 100, 1.0),
        )
        for subset, num_frames, duration in expectations:
            for features in subset:
                assert features.num_frames == num_frames
                assert features.duration == duration
def load_audio(self, recording_set: RecordingSet, root_dir: Optional[Pathlike] = None) -> np.ndarray:
    """
    Load the audio of this Cut from the matching recording in the supplied RecordingSet.
    The request is limited to this Cut's channel and to the time span that starts
    at ``self.start`` and lasts ``self.duration`` seconds.

    :param recording_set: RecordingSet object containing the Recording pointed to by
        the ``recording_id`` member of this Cut.
    :param root_dir: optional Path prefix to find the recording in the filesystem.
    :return: a numpy ndarray with audio samples, with shape (1 <channel>, N <samples>)
    """
    load = recording_set.load_audio
    recording_id = self.recording_id
    channel = self.channel
    offset = self.start
    duration = self.duration
    return load(
        recording_id,
        channels=channel,
        offset_seconds=offset,
        duration_seconds=duration,
        root_dir=root_dir,
    )
def test_feature_set_builder_with_augmentation():
    """
    Extract features for the JSON fixture with a predefined waveform augmentation
    and check that the stored metadata matches the stored arrays.
    """
    recordings: RecordingSet = RecordingSet.from_json(
        'test/fixtures/audio.json')
    augment_fn = WavAugmenter.create_predefined('pitch_reverb_tdrop', sampling_rate=8000)
    extractor = Fbank()
    with TemporaryDirectory() as d, LilcomFilesWriter(d) as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
            augment_fn=augment_fn,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Properties shared by all features
        for features in feature_infos:
            # fbank is the default feature type
            assert features.type == 'fbank'
            # duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration / features.frame_shift)
            # num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_mel_bins
            # the storage type metadata matches
            assert features.storage_type == storage.name
            # the metadata is consistent with the data shapes
            stored = features.load()
            assert stored.shape[0] == features.num_frames
            assert stored.shape[1] == features.num_features

        # The first two feature sets belong to 0.5 second recordings,
        # the remaining ones to 1.0 second recordings.
        expectations = (
            (feature_infos[:2], 50, 0.5),
            (feature_infos[2:], 100, 1.0),
        )
        for subset, num_frames, duration in expectations:
            for features in subset:
                assert features.num_frames == num_frames
                assert features.duration == duration
def make_feats(
        audio_manifest: Pathlike,
        output_dir: Pathlike,
        segmentation_manifest: Optional[Pathlike],
        # TODO: augmentation manifest should specify a number of transforms and probability of their application
        # e.g.:
        # "add_noise", "prob": 0.5, "noise_recordings": ["path1.wav", "path2.wav"]
        # "reverberate", "prob": 0.2, "rirs": ["rir1.wav", "rir2.wav"] (or however the RIRs are stored like... can be params for simulation)
        augmentation_manifest: Optional[Pathlike],
        feature_manifest: Optional[Pathlike],
        compressed: bool,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    audio_set = RecordingSet.from_yaml(audio_manifest)

    # Without an extractor config, a default-constructed FeatureExtractor is used.
    if feature_manifest is not None:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)
    else:
        feature_extractor = FeatureExtractor()

    # TODO: to be used (actually, only the segmentation info will be used, and all supervision info will be ignored)
    if segmentation_manifest is not None:
        supervision_set = SupervisionSet.from_yaml(segmentation_manifest)
    else:
        supervision_set = None

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    builder = FeatureSetBuilder(
        feature_extractor=feature_extractor,
        output_dir=output_dir,
        root_dir=root_dir,
        augmentation_manifest=augmentation_manifest
    )
    builder.process_and_store_recordings(
        recordings=audio_set,
        segmentation=None,  # TODO: implement and use
        compressed=compressed,
        lilcom_tick_power=lilcom_tick_power,
        num_jobs=num_jobs
    )
def prepare_audio_grouped(
    audio_paths: List[Pathlike],
    channel_to_idx_map: Dict[str, Dict[str, int]] = None,
) -> RecordingSet:
    """
    Build one multi-source Recording per session from per-channel audio files.

    Files are grouped by the name of their parent directory, which is used as the
    recording id. A file becomes a source only when its stem has an entry in the
    session's channel map; that entry is the channel index of the source.
    Sampling rate and number of samples are taken from the first file of the group,
    as read by ``read_sph``.

    :param audio_paths: paths of the audio files (objects exposing ``.parts`` and ``.stem``).
    :param channel_to_idx_map: per-session mapping from file stem to channel index.
        Sessions missing from it are assigned the default
        ``chanE, chanF, chan6, chan7 -> 0..3`` mapping. Note that a supplied mapping
        is updated in place with these defaults.
    :return: a RecordingSet with one Recording per session.
    """
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    sessions = groupby(lambda p: p.parts[-2], audio_paths)

    if channel_to_idx_map is None:
        channel_to_idx_map = defaultdict(dict)

    default_channels = ["chanE", "chanF", "chan6", "chan7"]

    recordings = []
    for session_name, channel_paths in tqdm(sessions.items(),
                                            desc="Preparing audio"):
        if session_name not in channel_to_idx_map:
            # A fresh dict per session, so sessions never share a mapping object.
            channel_to_idx_map[session_name] = {
                channel: position
                for position, channel in enumerate(default_channels)
            }

        samples, samplerate = read_sph(channel_paths[0])

        sources = []
        for audio_path in sorted(channel_paths):
            if audio_path.stem not in channel_to_idx_map[session_name]:
                continue
            sources.append(
                AudioSource(
                    type="file",
                    channels=[
                        channel_to_idx_map[session_name][audio_path.stem]
                    ],
                    source=str(audio_path),
                ))

        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=samplerate,
                num_samples=samples.shape[1],
                duration=samples.shape[1] / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
def prepare_audio_grouped(audio_paths: List[Pathlike], ) -> RecordingSet:
    """
    Build one multi-source Recording per session from single-channel files.

    Files are grouped by the third-from-last component of their path, which is
    used as the recording id. Within a session, the files are sorted and the
    position in that order becomes the channel index of the source.
    A session is skipped entirely (with a warning) as soon as one of its files
    turns out to have more than one channel.
    Sampling rate, number of samples and duration are read from the header of
    the first file of each group.

    :param audio_paths: paths of the audio files (objects exposing ``.parts``).
    :return: a RecordingSet with one Recording per retained session.
    """
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Processing audio files"):
        # Only header metadata is needed; context managers close each handle
        # right away instead of leaking one open file per inspected path.
        with sf.SoundFile(str(channel_paths[0])) as audio_sf:
            sampling_rate = audio_sf.samplerate
            num_samples = audio_sf.frames

        sources = []
        all_mono = True
        for idx, audio_path in enumerate(sorted(channel_paths)):
            with sf.SoundFile(str(audio_path)) as audio:
                num_channels = audio.channels
            if num_channels > 1:
                logging.warning(
                    f"Skipping recording {session_name} since it has a stereo channel"
                )
                all_mono = False
                break
            sources.append(
                AudioSource(type="file", channels=[idx],
                            source=str(audio_path)))

        if not all_mono:
            continue

        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=sampling_rate,
                num_samples=num_samples,
                duration=num_samples / sampling_rate,
            ))
    return RecordingSet.from_recordings(recordings)
def __init__(
    self,
    cuts: CutSet,
    uem: Optional[SupervisionSet] = None,
    min_speaker_dim: Optional[int] = None,
    global_speaker_ids: bool = False,
) -> None:
    """
    Validate the cuts and optionally restrict their supervisions to the UEM regions.

    :param cuts: the CutSet to use; it is passed through ``validate``.
    :param uem: optional SupervisionSet with the scored regions. When it is given
        (and truthy), ``self.cuts`` is rebuilt from the recordings of ``cuts`` and
        from supervisions trimmed to their overlap with the UEM segments; supervisions
        of cuts that have no UEM entry are kept as they are.
    :param min_speaker_dim: stored as ``self.min_speaker_dim``.
    :param global_speaker_ids: when True, ``self.speakers`` maps every speaker of
        ``self.cuts`` to an index; otherwise it is ``None``.
    """
    super().__init__()
    validate(cuts)

    if uem:
        # We use the `overlap` method in intervaltree to get overlapping regions
        # between the supervision segments and the UEM segments
        recordings = RecordingSet(
            {cut.recording.id: cut.recording for cut in cuts if cut.has_recording})
        uem_cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=uem,
        )
        uem_intervals = uem_cuts.index_supervisions()

        supervisions = []
        for cut_id, tree in cuts.index_supervisions().items():
            if cut_id not in uem_intervals:
                # No UEM for this cut: keep all of its supervisions untouched.
                supervisions.extend([interval.data for interval in tree])
                continue
            # A set, so that identical trimmed segments are collected only once.
            trimmed = set()
            for uem_interval in uem_intervals[cut_id]:
                for interval in tree.overlap(begin=uem_interval.begin, end=uem_interval.end):
                    trimmed.add(interval.data.trim(interval.end, start=interval.begin))
            supervisions.extend(trimmed)

        self.cuts = CutSet.from_manifests(
            recordings=recordings,
            supervisions=SupervisionSet.from_segments(supervisions),
        )
    else:
        self.cuts = cuts

    if global_speaker_ids:
        self.speakers = {
            spk: idx
            for idx, spk in enumerate(self.cuts.speakers)
        }
    else:
        self.speakers = None
    self.min_speaker_dim = min_speaker_dim