def libri_cut():
    """
    Build a single-channel ``Cut`` of 16.04 seconds that starts at 0.0 and
    has no supervisions.

    The cut carries both a ``Features`` manifest (40-dim fbank, 1604 frames,
    10 ms frame shift, stored under ``test/fixtures/libri/storage``) and a
    ``Recording`` manifest pointing at
    ``test/fixtures/libri/libri-1088-134315-0000.wav``.

    :return: the constructed ``Cut``.
    """
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            frame_shift=0.01,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage',
            storage_key='30c2440c-93cb-4e83-b382-f2a59b3859b4.llc',
            storage_type='lilcom_files',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            # Must agree with num_samples / sampling_rate:
            # 256640 / 16000 = 16.04 seconds (previously mistyped as 1604).
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
def validate_features(f: Features, read_data: bool = False, feats_data: Optional[np.ndarray] = None) -> None:
    """
    Check a ``Features`` manifest for internal consistency.

    The checks run in this order, each raising ``AssertionError`` on failure:

    1. ``start`` is non-negative; ``duration``, ``num_frames``,
       ``num_features``, ``sampling_rate`` and ``frame_shift`` are positive.
    2. ``frame_shift * sampling_rate`` (rounded to 12 decimal digits) is a
       whole number of samples.
    3. ``num_frames`` equals what ``compute_num_frames`` returns for the
       manifest's duration, frame shift and sampling rate.
    4. When a feature matrix is available, its 2-D shape matches
       ``(num_frames, num_features)``.

    Note: the checks are ``assert`` statements, so they are skipped when
    Python runs with optimizations enabled (``-O``).

    :param f: the manifest to validate.
    :param read_data: when true, the matrix is obtained with ``f.load()``
        (replacing any ``feats_data`` passed in) and its shape is checked.
    :param feats_data: an already-loaded matrix to check the shape of; used
        only when ``read_data`` is false.
    """
    # --- Basic range checks on the scalar fields. ---
    assert f.start >= 0, (
        f'Features: start has to be greater than 0 (is {f.start})'
    )
    assert f.duration > 0, (
        f'Features: duration has to be greater than 0 (is {f.duration})'
    )
    assert f.num_frames > 0, (
        f'Features: num_frames has to be greater than 0 (is {f.num_frames})'
    )
    assert f.num_features > 0, (
        f'Features: num_features has to be greater than 0 (is {f.num_features})'
    )
    assert f.sampling_rate > 0, (
        f'Features: sampling_rate has to be greater than 0 (is {f.sampling_rate})'
    )
    assert f.frame_shift > 0, (
        f'Features: frame_shift has to be greater than 0 (is {f.frame_shift})'
    )

    # --- The hop between frames must be a whole number of samples. ---
    # Rounding to 12 digits absorbs floating-point noise in the product.
    window_hop = round(f.frame_shift * f.sampling_rate, ndigits=12)
    assert window_hop == float(int(window_hop)), (
        f'Features: frame_shift of {f.frame_shift} is incorrect because it is physically impossible; '
        f'multiplying it by a sampling rate of {f.sampling_rate} results in a fractional window hop '
        f'of {window_hop} samples.'
    )

    # --- Declared frame count must match the one derived from duration. ---
    expected_num_frames = compute_num_frames(
        duration=f.duration,
        frame_shift=f.frame_shift,
        sampling_rate=f.sampling_rate,
    )
    assert f.num_frames == expected_num_frames, (
        f'Features: manifest is inconsistent: declared num_frames is {f.num_frames}, '
        f'but duration ({f.duration}s) / frame_shift ({f.frame_shift}s) results in {expected_num_frames} frames. '
        f'If you\'re using a custom feature extractor, you might need to ensure that it preserves '
        f'this relationship between duration, frame_shift and num_frames (use rounding up if needed - '
        f'see lhotse.utils.compute_num_frames).'
    )

    # --- Optional shape check against the actual matrix. ---
    if not read_data and feats_data is None:
        return
    if read_data:
        feats_data = f.load()
    n_fr, n_ft = feats_data.shape
    assert n_fr == f.num_frames, f'Features: expected num_frames: {f.num_frames}, actual: {n_fr}'
    assert n_ft == f.num_features, f'Features: expected num_features: {f.num_features}, actual: {n_ft}'
def overlapping_supervisions_cut():
    """
    Build a 0.5 s ``MonoCut`` ("cut-1") with four supervisions of 0.2 s each,
    starting at 0.0, 0.1, 0.2 and 0.3 s, so that neighbouring supervisions
    overlap in time.

    :return: the constructed ``MonoCut``.
    """
    feats = Features(
        recording_id="recording-1",
        channels=0,
        start=0,
        duration=0.5,
        type="fbank",
        num_frames=50,
        num_features=80,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type="lilcom",
        storage_path="test/fixtures/dummy_feats/storage/",
        storage_key="e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc",
    )
    # Start times are spelled out literally (not computed) to keep the exact
    # float values.
    segment_specs = (("s1", 0.0), ("s2", 0.1), ("s3", 0.2), ("s4", 0.3))
    segments = [
        SupervisionSegment(
            id=seg_id, recording_id="recording-1", start=seg_start, duration=0.2
        )
        for seg_id, seg_start in segment_specs
    ]
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=0.5,
        channel=0,
        features=feats,
        supervisions=segments,
    )
def libri_cut():
    """
    Build a single-channel ``Cut`` of 16.04 seconds that starts at 0.0 and
    has no supervisions.

    The cut carries a ``Features`` manifest (40-dim fbank, 1604 frames, whose
    ``storage_path`` points directly at the ``.llc`` file) and a ``Recording``
    manifest pointing at ``test/fixtures/libri/libri-1088-134315-0000.wav``.

    :return: the constructed ``Cut``.
    """
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage/fc37eb69-43a8-4e6f-a302-646a76606b38.llc',
            storage_type='lilcom',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            # Must agree with num_samples / sampling_rate:
            # 256640 / 16000 = 16.04 seconds (previously mistyped as 1604).
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
def overlapping_supervisions_cut():
    """
    Build a 0.5 s ``Cut`` ('cut-1') with four supervisions of 0.2 s each,
    starting at 0.0, 0.1, 0.2 and 0.3 s, so that neighbouring supervisions
    overlap in time.

    :return: the constructed ``Cut``.
    """
    rec_id = 'recording-1'

    def _segment(seg_id, seg_start):
        # Every supervision shares the recording and the 0.2 s duration.
        return SupervisionSegment(id=seg_id, recording_id=rec_id, start=seg_start, duration=0.2)

    feats = Features(
        recording_id=rec_id,
        channels=0,
        start=0,
        duration=0.5,
        type='fbank',
        num_frames=50,
        num_features=80,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='test/fixtures/dummy_feats/storage/',
        storage_key='e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc',
    )
    overlapping = [
        _segment('s1', 0.0),
        _segment('s2', 0.1),
        _segment('s3', 0.2),
        _segment('s4', 0.3),
    ]
    return Cut(
        id='cut-1',
        start=0.0,
        duration=0.5,
        channel=0,
        features=feats,
        supervisions=overlapping,
    )
def test_feature_set_serialization(format, compressed):
    """
    Round-trip a one-element ``FeatureSet`` through a temporary file and
    check that the deserialized set equals the original.

    :param format: ``'json'`` or ``'yaml'``; selects the writer/reader pair.
    :param compressed: when truthy, the temporary file gets a ``.gz`` suffix.
    """
    entry = Features(
        recording_id='irrelevant',
        channels=0,
        start=0.0,
        duration=20.0,
        type='fbank',
        num_frames=2000,
        num_features=20,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='/irrelevant/',
        storage_key='path.llc',
    )
    feature_set = FeatureSet(features=[entry])

    file_suffix = '.gz' if compressed else ''
    with NamedTemporaryFile(suffix=file_suffix) as f:
        if format == 'json':
            feature_set.to_json(f.name)
            feature_set_deserialized = FeatureSet.from_json(f.name)
        elif format == 'yaml':
            feature_set.to_yaml(f.name)
            feature_set_deserialized = FeatureSet.from_yaml(f.name)
    assert feature_set_deserialized == feature_set
def from_dict(data: dict) -> 'Cut':
    """
    Build a ``Cut`` from its dictionary representation.

    The ``'features'`` entry is converted with ``Features.from_dict`` and each
    element of ``'supervisions'`` with ``SupervisionSegment.from_dict``; every
    other entry is forwarded to the ``Cut`` constructor as a keyword argument.

    The input dictionary is not modified: the nested entries are popped from a
    shallow copy, so the same ``data`` can be deserialized again.

    :param data: mapping with at least the ``'features'`` and
        ``'supervisions'`` keys.
    :return: the constructed ``Cut``.
    :raises KeyError: when ``'features'`` or ``'supervisions'`` is missing.
    """
    # Shallow copy: protects the caller's top-level keys from the pops below.
    fields = dict(data)
    feature_info = fields.pop('features')
    supervision_infos = fields.pop('supervisions')
    return Cut(
        **fields,
        features=Features.from_dict(feature_info),
        supervisions=[SupervisionSegment.from_dict(s) for s in supervision_infos],
    )
def dummy_features():
    """
    Return a placeholder ``Features`` manifest: 10 s of 80-dim fbank with
    1000 frames, whose recording id and storage fields are all the string
    'irrelevant'.
    """
    placeholder = 'irrelevant'
    manifest = Features(
        recording_id=placeholder,
        channel_id=0,
        start=0.0,
        duration=10.0,
        type='fbank',
        num_frames=1000,
        num_features=80,
        storage_type=placeholder,
        storage_path=placeholder,
    )
    return manifest
def dummy_features(unique_id: int) -> Features:
    """
    Return a 1 s, 100-frame, 20-dim fbank ``Features`` manifest whose
    recording id is ``dummy-recording-`` followed by ``unique_id`` zero-padded
    to four digits.

    :param unique_id: integer used to make the recording id distinct.
    """
    rec_id = f'dummy-recording-{unique_id:04d}'
    return Features(
        recording_id=rec_id,
        channel_id=0,
        start=0.0,
        duration=1.0,
        type='fbank',
        num_frames=100,
        num_features=20,
        storage_type='lilcom',
        storage_path='irrelevant',
    )
def features(rec_id, start, duration):
    """
    Shorthand for building a ``Features`` manifest where only the recording
    id, start and duration matter; everything else is filled with fixed
    placeholder values.

    :param rec_id: passed as the first positional argument of ``Features``.
    :param start: start time of the features.
    :param duration: duration of the features; the frame count is derived
        from it as ``round(duration / 0.01)``.
    """
    frame_count = round(duration / 0.01)
    filler = 'irrelevant'
    return Features(
        rec_id,
        channel_id=0,
        start=start,
        duration=duration,
        type=filler,
        num_frames=frame_count,
        num_features=23,
        storage_type=filler,
        storage_path=filler,
    )
def dummy_features(unique_id: int) -> Features:
    """
    Return a 1 s, 100-frame, 23-dim fbank ``Features`` manifest stored under
    ``test/fixtures/dummy_feats/storage``; its recording id is
    ``dummy-recording-`` followed by ``unique_id`` zero-padded to four digits.

    :param unique_id: integer used to make the recording id distinct.
    """
    manifest_fields = {
        'recording_id': f'dummy-recording-{unique_id:04d}',
        'channels': 0,
        'start': 0.0,
        'duration': 1.0,
        'type': 'fbank',
        'num_frames': 100,
        'num_features': 23,
        'sampling_rate': 16000,
        'storage_type': 'lilcom_files',
        'storage_path': 'test/fixtures/dummy_feats/storage',
        'storage_key': 'dbf9a0ec-f79d-4eb8-ae83-143a6d5de64d.llc',
    }
    return Features(**manifest_fields)
def dummy_feature_set():
    """
    Return a ``FeatureSet`` holding a single 10 s, 1000-frame, 23-dim fbank
    manifest for recording 'rec1', with ``storage_path`` 'dummy.llc'.
    """
    only_entry = Features(
        recording_id='rec1',
        channels=0,
        start=0,
        duration=10,
        type='fbank',
        num_frames=1000,
        num_features=23,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='dummy.llc',
    )
    return FeatureSet.from_features([only_entry])
def test_feature_set_prefix_path():
    """
    ``FeatureSet.with_path_prefix('/data')`` must turn the relative storage
    path 'feats/12345.llc' into '/data/feats/12345.llc'.
    """
    relative_entry = Features(
        type='fbank',
        num_frames=1000,
        num_features=40,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='feats/12345.llc',
        start=0,
        duration=10,
    )
    features = FeatureSet.from_features([relative_entry])

    prefixed = features.with_path_prefix('/data')
    for feat in prefixed:
        assert feat.storage_path == '/data/feats/12345.llc'
def dummy_features():
    """
    Return a placeholder ``Features`` manifest: 10 s of 80-dim fbank with
    1000 frames at a 10 ms frame shift and 16 kHz sampling rate, whose
    recording id and storage fields are all the string 'irrelevant'.
    """
    placeholder = 'irrelevant'
    manifest = Features(
        recording_id=placeholder,
        channels=0,
        start=0.0,
        duration=10.0,
        type='fbank',
        num_frames=1000,
        num_features=80,
        sampling_rate=16000,
        storage_type=placeholder,
        storage_path=placeholder,
        storage_key=placeholder,
        frame_shift=0.01,
    )
    return manifest
def features(rec_id, start, duration):
    """
    Shorthand for building a ``Features`` manifest where only the recording
    id, start and duration matter; everything else is filled with fixed
    placeholder values.

    :param rec_id: recording id of the manifest.
    :param start: start time of the features.
    :param duration: duration of the features; the frame count is derived
        from it as ``round(duration / 0.01)``.
    """
    frame_count = round(duration / 0.01)
    filler = 'irrelevant'
    return Features(
        recording_id=rec_id,
        channels=0,
        start=start,
        duration=duration,
        sampling_rate=16000,
        type=filler,
        num_frames=frame_count,
        num_features=23,
        storage_type=filler,
        storage_path=filler,
        storage_key=filler,
        frame_shift=0.01,
    )
def test_feature_set_serialization():
    """
    Round-trip a one-element ``FeatureSet`` (built with a default
    ``FeatureExtractor``) through a temporary YAML file and check that the
    deserialized set equals the original.
    """
    entry = Features(
        recording_id='irrelevant',
        channel_id=0,
        start=0.0,
        duration=20.0,
        type='fbank',
        num_frames=2000,
        num_features=20,
        storage_type='lilcom',
        storage_path='/irrelevant/path.llc',
    )
    feature_set = FeatureSet(feature_extractor=FeatureExtractor(), features=[entry])

    with NamedTemporaryFile() as f:
        yaml_path = f.name
        feature_set.to_yaml(yaml_path)
        feature_set_deserialized = FeatureSet.from_yaml(yaml_path)
    assert feature_set_deserialized == feature_set
def test_feature_set_prefix_path():
    """
    ``FeatureSet.with_path_prefix("/data")`` must turn the relative storage
    path "feats/" into "/data/feats" (without the trailing slash).
    """
    relative_entry = Features(
        type="fbank",
        num_frames=1000,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type="lilcom",
        storage_path="feats/",
        storage_key="12345.llc",
        start=0,
        duration=10,
    )
    features = FeatureSet.from_features([relative_entry])

    prefixed = features.with_path_prefix("/data")
    for feat in prefixed:
        assert feat.storage_path == "/data/feats"
def dummy_feature_set():
    """
    Return a ``FeatureSet`` holding a single 10 s, 1000-frame, 23-dim fbank
    manifest for recording "rec1", stored as key "dummy.llc" under "feats".
    """
    only_entry = Features(
        recording_id="rec1",
        channels=0,
        start=0,
        duration=10,
        type="fbank",
        num_frames=1000,
        num_features=23,
        sampling_rate=16000,
        storage_type="lilcom_files",
        storage_path="feats",
        storage_key="dummy.llc",
        frame_shift=0.01,
    )
    return FeatureSet.from_features([only_entry])
def dummy_feature_set_lazy():
    """
    Generator that writes a one-element ``FeatureSet`` to a temporary
    ``.jsonl.gz`` file and yields the set re-opened with
    ``FeatureSet.from_jsonl_lazy``.

    The temporary file stays open (and therefore present on disk) while the
    generator is suspended at the ``yield``; it is cleaned up when the
    generator is resumed or closed.
    """
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        only_entry = Features(
            recording_id="rec1",
            channels=0,
            start=0,
            duration=10,
            type="fbank",
            num_frames=1000,
            num_features=23,
            sampling_rate=16000,
            storage_type="lilcom_files",
            storage_path="feats",
            storage_key="dummy.llc",
            frame_shift=0.01,
        )
        feats = FeatureSet.from_features([only_entry])
        manifest_path = f.name
        feats.to_file(manifest_path)
        f.flush()
        yield FeatureSet.from_jsonl_lazy(manifest_path)
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse RecordingSet,
    SupervisionSet and FeatureSet manifests.

    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    FeatureSet is created only when a feats.scp file exists, the
    ``kaldi_native_io`` module is available, and ``frame_shift`` is given;
    if the first two hold but ``frame_shift`` is None, a warning is issued
    and the features are skipped.
    The text, utt2spk, spk2gender and utt2lang files are read when building
    supervisions; other Kaldi files might not be handled yet.

    :param path: path to the Kaldi data directory; must be an existing directory.
    :param sampling_rate: sampling rate stored in every Recording and Features
        manifest, and used to convert durations to sample counts.
    :param frame_shift: frame shift (in seconds) of the features listed in
        feats.scp; required for the features to be imported.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param num_jobs: number of worker processes used to compute recording durations.
    :return: a tuple of ``(recording_set, supervision_set, feature_set)``, where
        the last two items are None when they could not be created.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        # Replace the user-chosen placeholder string with underscores
        # (no-op when no placeholder was given).
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    # Durations are computed in a process pool, one task per wav.scp entry;
    # the results come back in the same order as the inputs.
    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    # A wav.scp entry ending with "|" is a command whose
                    # output is the audio; the trailing pipe is stripped.
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        # Each line is split on whitespace and later unpacked as
        # (segment_id, recording_id, start, end).
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        # NOTE(review): these are loaded without must_exist, and then indexed
        # directly below - presumably the helper returns a mapping that
        # tolerates missing files/keys; confirm against load_kaldi_text_mapping.
        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                # duration = end - start, computed via add_durations
                duration=add_durations(float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                # gender is looked up by the (unmodified) speaker ID
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            # Imported lazily: kaldi_native_io is an optional dependency.
            import kaldi_native_io
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    # With supervisions available, the utterance ID is mapped
                    # to its recording; otherwise it is used as the recording ID.
                    recording_id=supervision_set[fix_id(utt_id)].recording_id if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(f"scp:{feats_scp}"))
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")
    return recording_set, supervision_set, feature_set