def test_load_features(
    recording_id: str,
    channel: int,
    start: float,
    duration: float,
    exception_expectation,
    expected_num_frames: Optional[float],
):
    """
    Smoke-test loading a feature region: it should either raise the expected
    exception or yield a 2-D, time-major matrix with the expected frame count.
    """
    feature_set = FeatureSet.from_json('test/fixtures/dummy_feats/feature_manifest.json')
    with exception_expectation:
        loaded = feature_set.load(
            recording_id,
            channel_id=channel,
            start=start,
            duration=duration,
        )
        # A loaded feature region is a matrix...
        assert len(loaded.shape) == 2
        # ...whose leading axis is time (frames).
        assert loaded.shape[0] == expected_num_frames
def test_compute_global_stats():
    """
    Validate FeatureSet.compute_global_stats against three post-conditions:
    consistent feature dimension, agreement with a direct (non-iterative)
    mean/std computation, and round-trip pickle serialization.
    """
    feature_set = FeatureSet.from_json('test/fixtures/dummy_feats/feature_manifest.json')
    with NamedTemporaryFile() as f:
        # compute_global_stats pickles the stats to storage_path as a side effect.
        stats = feature_set.compute_global_stats(storage_path=f.name)
        f.flush()
        read_stats = pickle.load(f)
        # Post-condition 1: feature dim is consistent
        feat_dim = feature_set[0].num_features
        assert stats['norm_means'].shape == (feat_dim,)
        assert stats['norm_stds'].shape == (feat_dim,)
        # Post-condition 2: the iterative method yields very close results to
        # the "standard" method.
        all_feats = np.concatenate([feats.load() for feats in feature_set])
        np.testing.assert_almost_equal(stats['norm_means'], np.mean(all_feats, axis=0), decimal=5)
        np.testing.assert_almost_equal(stats['norm_stds'], np.std(all_feats, axis=0), decimal=5)
        # Post-condition 3: the serialization works correctly
        assert (read_stats['norm_means'] == stats['norm_means']).all()
        assert (read_stats['norm_stds'] == stats['norm_stds']).all()
def windowed(
    feature_manifest: Pathlike,
    output_cut_manifest: Pathlike,
    cut_duration: float,
    cut_shift: Optional[float],
    keep_shorter_windows: bool,
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST from feature regions in FEATURE_MANIFEST. The feature matrices are traversed in windows with CUT_SHIFT increments, creating cuts of constant CUT_DURATION.
    """
    # Load the features, slide fixed-duration windows over them, and persist
    # the resulting cuts.
    features = FeatureSet.from_json(feature_manifest)
    cuts = make_windowed_cuts_from_features(
        feature_set=features,
        cut_duration=cut_duration,
        cut_shift=cut_shift,
        keep_shorter_windows=keep_shorter_windows,
    )
    cuts.to_file(output_cut_manifest)
def test_feature_set_serialization(format, compressed):
    """
    Round-trip a single-entry FeatureSet through the given serialization
    format (optionally gzip-compressed) and check for equality.
    """
    original = FeatureSet(features=[
        Features(
            recording_id='irrelevant',
            channels=0,
            start=0.0,
            duration=20.0,
            type='fbank',
            num_frames=2000,
            num_features=20,
            sampling_rate=16000,
            storage_type='lilcom',
            storage_path='/irrelevant/path.llc',
        )
    ])
    suffix = '.gz' if compressed else ''
    with NamedTemporaryFile(suffix=suffix) as f:
        if format == 'json':
            original.to_json(f.name)
            restored = FeatureSet.from_json(f.name)
        if format == 'yaml':
            original.to_yaml(f.name)
            restored = FeatureSet.from_yaml(f.name)
        assert restored == original
def random_mixed(
    supervision_manifest: Pathlike,
    feature_manifest: Pathlike,
    output_cut_manifest: Pathlike,
    snr_range: Tuple[float, float],
    offset_range: Tuple[float, float],
):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains supervision regions from SUPERVISION_MANIFEST and features supplied by FEATURE_MANIFEST. It first creates a trivial CutSet, splits it into two equal, randomized parts and mixes their features. The parameters of the mix are controlled via SNR_RANGE and OFFSET_RANGE.
    """
    import numpy as np
    from lhotse.supervision import SupervisionSet
    from lhotse.features import FeatureSet
    # Build the trivial source CutSet from the two manifests.
    supervisions = SupervisionSet.from_json(supervision_manifest)
    features = FeatureSet.from_json(feature_manifest)
    cuts = CutSet.from_manifests(supervisions=supervisions, features=features)
    # Shuffle and split in two halves; each left cut is mixed with one right cut.
    left_half, right_half = cuts.split(num_splits=2, shuffle=True)
    num_pairs = len(left_half)
    snrs = np.random.uniform(*snr_range, size=num_pairs).tolist()
    relative_offsets = np.random.uniform(*offset_range, size=num_pairs).tolist()

    def mixes():
        # Offsets are relative to the left cut's duration.
        for left, right, snr, rel_offset in zip(left_half, right_half, snrs, relative_offsets):
            yield left.mix(right, offset_other_by=left.duration * rel_offset, snr=snr)

    CutSet.from_cuts(mixes()).to_file(output_cut_manifest)
def test_load_features_with_default_arguments():
    """Loading a whole recording's features (no start/duration) yields the full matrix."""
    fs = FeatureSet.from_json('test/fixtures/dummy_feats/feature_manifest.json')
    mat = fs.load("recording-1")
    # Full fixture matrix: 50 frames x 23 feature bins.
    assert mat.shape == (50, 23)
def libri_features_set():
    """Fixture: the LibriSpeech feature manifest bundled with the test data."""
    manifest_path = 'test/fixtures/libri/feature_manifest.json.gz'
    return FeatureSet.from_json(manifest_path)