예제 #1
0
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            feature_manifest: Optional[Pathlike], storage_type: str,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
예제 #2
0
def test_serialization(format, compressed):
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='text/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat text/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            recording_set.to_yaml(f.name)
            deserialized = RecordingSet.from_yaml(f.name)
        if format == 'json':
            recording_set.to_json(f.name)
            deserialized = RecordingSet.from_json(f.name)
    assert deserialized == recording_set
예제 #3
0
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == "kaldi-fbank"
        # assert that duration is always a multiple of frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_filters
        # assert that the storage type metadata matches
        assert features.storage_type == storage.name
        # assert that the metadata is consistent with the data shapes
        arr = features.load()
        assert arr.shape[0] == features.num_frames
        assert arr.shape[1] == features.num_features
        # assert that the stored features are the same as the "freshly extracted" features
        recording = recordings[features.recording_id]
        expected = extractor.extract(
            samples=recording.load_audio(channels=features.channels),
            sampling_rate=recording.sampling_rate,
        )
        np.testing.assert_almost_equal(arr, expected, decimal=2)

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
예제 #4
0
def test_feature_set_builder_with_augmentation():
    recordings: RecordingSet = RecordingSet.from_json(
        'test/fixtures/audio.json')
    augment_fn = WavAugmenter.create_predefined('pitch_reverb_tdrop',
                                                sampling_rate=8000)
    extractor = Fbank()
    with TemporaryDirectory() as d, LilcomFilesWriter(d) as storage:
        builder = FeatureSetBuilder(feature_extractor=extractor,
                                    storage=storage,
                                    augment_fn=augment_fn)
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Assert the properties shared by all features
        for features in feature_infos:
            # assert that fbank is the default feature type
            assert features.type == 'fbank'
            # assert that duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration /
                                                features.frame_shift)
            # assert that num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_mel_bins
            # assert that the storage type metadata matches
            assert features.storage_type == storage.name
            # assert that the metadata is consistent with the data shapes
            arr = features.load()
            assert arr.shape[0] == features.num_frames
            assert arr.shape[1] == features.num_features

        # Assert the properties for recordings of duration 0.5 seconds
        for features in feature_infos[:2]:
            assert features.num_frames == 50
            assert features.duration == 0.5

        # Assert the properties for recordings of duration 1.0 seconds
        for features in feature_infos[2:]:
            assert features.num_frames == 100
            assert features.duration == 1.0
예제 #5
0
def extract(audio_manifest: Pathlike, output_dir: Pathlike,
            segmentation_manifest: Optional[Pathlike], augmentation: str,
            feature_manifest: Optional[Pathlike], compressed: bool,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    audio_set = RecordingSet.from_json(audio_manifest)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    # TODO: to be used (actually, only the segmentation info will be used, and all supervision info will be ignored)
    supervision_set = (SupervisionSet.from_json(segmentation_manifest)
                       if segmentation_manifest is not None else None)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    augmenter = None
    if augmentation is not None:
        sampling_rate = next(iter(audio_set)).sampling_rate
        assert all(rec.sampling_rate == sampling_rate for rec in audio_set), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation,
                                                   sampling_rate=sampling_rate)

    feature_set_builder = FeatureSetBuilder(
        feature_extractor=feature_extractor,
        output_dir=output_dir,
        root_dir=root_dir,
        augmenter=augmenter)
    feature_set_builder.process_and_store_recordings(
        recordings=audio_set,
        segmentation=None,  # TODO: implement and use
        compressed=compressed,
        lilcom_tick_power=lilcom_tick_power,
        num_jobs=num_jobs)
예제 #6
0
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            augmentation: str, feature_manifest: Optional[Pathlike],
            storage_type: str, lilcom_tick_power: int,
            root_dir: Optional[Pathlike], num_jobs: int):
    """
    Extract features for recordings in a given AUDIO_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    augmenter = None
    if augmentation is not None:
        sampling_rate = next(iter(recordings)).sampling_rate
        assert all(rec.sampling_rate == sampling_rate for rec in recordings), \
            "Wav augmentation effect chains expect all the recordings to have the same sampling rate at this time."
        augmenter = WavAugmenter.create_predefined(name=augmentation,
                                                   sampling_rate=sampling_rate)

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
            augmenter=augmenter)
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
예제 #7
0
def test_feature_set_builder(augmentation):
    audio_set = RecordingSet.from_json('test/fixtures/audio.json')
    augmenter = WavAugmenter.create_predefined(
        augmentation, sampling_rate=8000) if augmentation is not None else None
    with TemporaryDirectory() as output_dir:
        builder = FeatureSetBuilder(feature_extractor=Fbank(),
                                    output_dir=output_dir,
                                    augmenter=augmenter)
        feature_set = builder.process_and_store_recordings(
            recordings=audio_set)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == 'fbank'
        # assert that duration is always a multiple of frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_mel_bins
        # assert that lilcom is the default storate type
        assert features.storage_type == 'lilcom'

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
예제 #8
0
def libri_recording_set():
    return RecordingSet.from_json('test/fixtures/libri/audio.json')
예제 #9
0
def recording_set() -> RecordingSet:
    return RecordingSet.from_json('test/fixtures/audio.json')