示例#1
0
def libri_cut():
    """
    Build a single-channel, 16.04 s ``Cut`` with 40-dimensional fbank
    ``Features``, its source ``Recording`` and no supervisions.
    """
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            frame_shift=0.01,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage',
            storage_key='30c2440c-93cb-4e83-b382-f2a59b3859b4.llc',
            storage_type='lilcom_files',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )],
            sampling_rate=16000,
            num_samples=256640,
            # 256640 samples / 16000 Hz = 16.04 s (was 1604, which is the
            # frame count, not the duration in seconds).
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
示例#2
0
def validate_features(f: Features,
                      read_data: bool = False,
                      feats_data: Optional[np.ndarray] = None) -> None:
    """
    Check that a ``Features`` manifest is internally consistent.

    The following properties are asserted, in order:

    - ``start`` is non-negative;
    - ``duration``, ``num_frames``, ``num_features``, ``sampling_rate`` and
      ``frame_shift`` are strictly positive;
    - ``frame_shift * sampling_rate`` is a whole number of samples;
    - ``num_frames`` equals ``compute_num_frames(duration, frame_shift, sampling_rate)``;
    - optionally, the shape of the actual feature matrix matches
      ``(num_frames, num_features)``.

    :param f: the manifest to validate.
    :param read_data: when ``True``, the feature matrix is loaded with ``f.load()``
        (replacing any ``feats_data`` passed in) and its shape is checked.
    :param feats_data: an already loaded feature matrix to check the shape of;
        it is unpacked as a 2-D ``(frames, features)`` array.
    :raises AssertionError: when any of the checks fails.
    """
    # Zero is a legal start time, so the message must say "or equal".
    assert f.start >= 0, \
        f'Features: start has to be greater than or equal to 0 (is {f.start})'
    assert f.duration > 0, \
        f'Features: duration has to be greater than 0 (is {f.duration})'
    assert f.num_frames > 0, \
        f'Features: num_frames has to be greater than 0 (is {f.num_frames})'
    assert f.num_features > 0, \
        f'Features: num_features has to be greater than 0 (is {f.num_features})'
    assert f.sampling_rate > 0, \
        f'Features: sampling_rate has to be greater than 0 (is {f.sampling_rate})'
    assert f.frame_shift > 0, \
        f'Features: frame_shift has to be greater than 0 (is {f.frame_shift})'
    # Rounding to 12 digits absorbs floating point noise (e.g. 0.01 * 16000)
    # before testing whether the hop is a whole number of samples.
    window_hop = round(f.frame_shift * f.sampling_rate, ndigits=12)
    assert float(int(window_hop)) == window_hop, \
        f'Features: frame_shift of {f.frame_shift} is incorrect because it is physically impossible; ' \
        f'multiplying it by a sampling rate of {f.sampling_rate} results in a fractional window hop ' \
        f'of {window_hop} samples.'
    expected_num_frames = compute_num_frames(duration=f.duration,
                                             frame_shift=f.frame_shift,
                                             sampling_rate=f.sampling_rate)
    assert expected_num_frames == f.num_frames, \
        f'Features: manifest is inconsistent: declared num_frames is {f.num_frames}, ' \
        f'but duration ({f.duration}s) / frame_shift ({f.frame_shift}s) results in {expected_num_frames} frames. ' \
        f'If you\'re using a custom feature extractor, you might need to ensure that it preserves ' \
        f'this relationship between duration, frame_shift and num_frames (use rounding up if needed - ' \
        f'see lhotse.utils.compute_num_frames).'
    if read_data or feats_data is not None:
        if read_data:
            feats_data = f.load()
        n_fr, n_ft = feats_data.shape
        assert f.num_frames == n_fr, f'Features: expected num_frames: {f.num_frames}, actual: {n_fr}'
        assert f.num_features == n_ft, f'Features: expected num_features: {f.num_features}, actual: {n_ft}'
示例#3
0
def overlapping_supervisions_cut():
    """
    Return a 0.5 s ``MonoCut`` with four 0.2 s supervisions (``s1``..``s4``)
    starting 0.1 s apart, so that consecutive supervisions overlap.
    """
    fbank = Features(
        recording_id="recording-1",
        channels=0,
        start=0,
        duration=0.5,
        type="fbank",
        num_frames=50,
        num_features=80,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type="lilcom",
        storage_path="test/fixtures/dummy_feats/storage/",
        storage_key="e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc",
    )
    # Each supervision lasts 0.2 s but the onsets are only 0.1 s apart.
    onsets = (0.0, 0.1, 0.2, 0.3)
    segments = [
        SupervisionSegment(
            id=f"s{idx}", recording_id="recording-1", start=onset, duration=0.2
        )
        for idx, onset in enumerate(onsets, start=1)
    ]
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=0.5,
        channel=0,
        features=fbank,
        supervisions=segments,
    )
示例#4
0
def libri_cut():
    """
    Build a single-channel, 16.04 s ``Cut`` with 40-dimensional fbank
    ``Features``, its source ``Recording`` and no supervisions.
    """
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path=
            'test/fixtures/libri/storage/fc37eb69-43a8-4e6f-a302-646a76606b38.llc',
            storage_type='lilcom',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            # 256640 samples / 16000 Hz = 16.04 s (was 1604, which is the
            # frame count, not the duration in seconds).
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
示例#5
0
def overlapping_supervisions_cut():
    """
    Return a 0.5 s ``Cut`` with four 0.2 s supervisions (``s1``..``s4``)
    starting 0.1 s apart, so that consecutive supervisions overlap.
    """
    feature_kwargs = dict(
        recording_id='recording-1',
        channels=0,
        start=0,
        duration=0.5,
        type='fbank',
        num_frames=50,
        num_features=80,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='test/fixtures/dummy_feats/storage/',
        storage_key='e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc',
    )
    fbank = Features(**feature_kwargs)

    # (id, start) pairs; every segment lasts 0.2 s on 'recording-1'.
    layout = (('s1', 0.0), ('s2', 0.1), ('s3', 0.2), ('s4', 0.3))
    segments = []
    for seg_id, onset in layout:
        segments.append(
            SupervisionSegment(id=seg_id, recording_id='recording-1', start=onset, duration=0.2)
        )

    return Cut(id='cut-1', start=0.0, duration=0.5, channel=0,
               features=fbank, supervisions=segments)
示例#6
0
def test_feature_set_serialization(format, compressed):
    """
    Round-trip a one-element ``FeatureSet`` through a temporary file and
    check that the deserialized manifest equals the original.

    :param format: serialization format, either ``'json'`` or ``'yaml'``.
    :param compressed: when truthy, the temporary file gets a ``.gz`` suffix.
    :raises ValueError: when ``format`` is not one of the supported values.
    """
    feature_set = FeatureSet(
        features=[
            Features(
                recording_id='irrelevant',
                channels=0,
                start=0.0,
                duration=20.0,
                type='fbank',
                num_frames=2000,
                num_features=20,
                sampling_rate=16000,
                storage_type='lilcom',
                storage_path='/irrelevant/',
                storage_key='path.llc'
            )
        ]
    )
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'json':
            feature_set.to_json(f.name)
            feature_set_deserialized = FeatureSet.from_json(f.name)
        elif format == 'yaml':
            feature_set.to_yaml(f.name)
            feature_set_deserialized = FeatureSet.from_yaml(f.name)
        else:
            # Fail with a clear message instead of an UnboundLocalError
            # at the final comparison.
            raise ValueError(f'Unsupported serialization format: {format!r}')
    assert feature_set_deserialized == feature_set
示例#7
0
文件: cut.py 项目: popcornell/lhotse
 def from_dict(data: dict) -> 'Cut':
     """
     Build a ``Cut`` from its dictionary representation.

     The ``'features'`` entry is converted with ``Features.from_dict`` and each
     item of ``'supervisions'`` with ``SupervisionSegment.from_dict``; all the
     remaining entries are passed to the ``Cut`` constructor as keyword arguments.

     :param data: the dictionary to convert; it is not modified.
     :raises KeyError: when ``'features'`` or ``'supervisions'`` is missing.
     """
     # Work on a shallow copy so that popping keys does not mutate the
     # caller's dictionary.
     data = dict(data)
     feature_info = data.pop('features')
     supervision_infos = data.pop('supervisions')
     return Cut(**data,
                features=Features.from_dict(feature_info),
                supervisions=[
                    SupervisionSegment.from_dict(s)
                    for s in supervision_infos
                ])
示例#8
0
def dummy_features():
    """Return a placeholder 10 s, 1000-frame, 80-dimensional fbank ``Features`` manifest."""
    spec = dict(
        recording_id='irrelevant',
        channel_id=0,
        start=0.0,
        duration=10.0,
        type='fbank',
        num_frames=1000,
        num_features=80,
        storage_type='irrelevant',
        storage_path='irrelevant',
    )
    return Features(**spec)
示例#9
0
def dummy_features(unique_id: int) -> Features:
    """
    Return a 1 s, 100-frame fbank ``Features`` manifest whose recording id is
    ``dummy-recording-NNNN``, with ``unique_id`` zero-padded to four digits.
    """
    rec_name = f'dummy-recording-{unique_id:04d}'
    manifest = Features(
        recording_id=rec_name,
        channel_id=0,
        start=0.0,
        duration=1.0,
        type='fbank',
        num_frames=100,
        num_features=20,
        storage_type='lilcom',
        storage_path='irrelevant',
    )
    return manifest
示例#10
0
def features(rec_id, start, duration):
    """
    Helper for fixture readability: only the recording id, start and duration
    vary; the frame count is derived from the duration with a 0.01 s step.
    """
    frame_count = round(duration / 0.01)
    placeholder = 'irrelevant'
    return Features(
        rec_id,
        channel_id=0,
        start=start,
        duration=duration,
        type=placeholder,
        num_frames=frame_count,
        num_features=23,
        storage_type=placeholder,
        storage_path=placeholder,
    )
示例#11
0
def dummy_features(unique_id: int) -> Features:
    """
    Return a 1 s, 100-frame, 23-dimensional fbank ``Features`` manifest whose
    recording id is ``dummy-recording-NNNN`` (``unique_id`` zero-padded to
    four digits).
    """
    storage = dict(
        storage_type='lilcom_files',
        storage_path='test/fixtures/dummy_feats/storage',
        storage_key='dbf9a0ec-f79d-4eb8-ae83-143a6d5de64d.llc',
    )
    return Features(
        recording_id=f'dummy-recording-{unique_id:04d}',
        channels=0,
        start=0.0,
        duration=1.0,
        type='fbank',
        num_frames=100,
        num_features=23,
        sampling_rate=16000,
        **storage,
    )
示例#12
0
def dummy_feature_set():
    """Return a ``FeatureSet`` holding one 10 s fbank manifest for recording ``rec1``."""
    rec1_feats = Features(
        recording_id='rec1',
        channels=0,
        start=0,
        duration=10,
        type='fbank',
        num_frames=1000,
        num_features=23,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='dummy.llc',
    )
    return FeatureSet.from_features([rec1_feats])
示例#13
0
def test_feature_set_prefix_path():
    """``with_path_prefix('/data')`` must prepend the prefix to every ``storage_path``."""
    manifest = Features(
        type='fbank',
        num_frames=1000,
        num_features=40,
        sampling_rate=16000,
        storage_type='lilcom',
        storage_path='feats/12345.llc',
        start=0,
        duration=10,
    )
    feature_set = FeatureSet.from_features([manifest])
    prefixed = feature_set.with_path_prefix('/data')
    for item in prefixed:
        assert item.storage_path == '/data/feats/12345.llc'
示例#14
0
def dummy_features():
    """Return a placeholder 10 s, 1000-frame, 80-dimensional fbank ``Features`` manifest."""
    placeholder = 'irrelevant'
    return Features(
        recording_id=placeholder,
        channels=0,
        start=0.0,
        duration=10.0,
        type='fbank',
        num_frames=1000,
        num_features=80,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type=placeholder,
        storage_path=placeholder,
        storage_key=placeholder,
    )
示例#15
0
def features(rec_id, start, duration):
    """
    Helper for fixture readability: only the recording id, start and duration
    vary; the frame count is derived from the duration with a 0.01 s step.
    """
    frame_count = round(duration / 0.01)
    # Everything below is fixed filler that callers of this helper do not set.
    fixed = dict(
        channels=0,
        sampling_rate=16000,
        type='irrelevant',
        num_features=23,
        storage_type='irrelevant',
        storage_path='irrelevant',
        storage_key='irrelevant',
        frame_shift=0.01,
    )
    return Features(
        recording_id=rec_id,
        start=start,
        duration=duration,
        num_frames=frame_count,
        **fixed,
    )
示例#16
0
def test_feature_set_serialization():
    """A ``FeatureSet`` written to YAML and read back must equal the original."""
    extractor = FeatureExtractor()
    manifest = Features(
        recording_id='irrelevant',
        channel_id=0,
        start=0.0,
        duration=20.0,
        type='fbank',
        num_frames=2000,
        num_features=20,
        storage_type='lilcom',
        storage_path='/irrelevant/path.llc',
    )
    feature_set = FeatureSet(feature_extractor=extractor, features=[manifest])

    with NamedTemporaryFile() as tmp:
        yaml_path = tmp.name
        feature_set.to_yaml(yaml_path)
        feature_set_deserialized = FeatureSet.from_yaml(yaml_path)

    assert feature_set_deserialized == feature_set
示例#17
0
def test_feature_set_prefix_path():
    """``with_path_prefix("/data")`` on a ``"feats/"`` storage path must give ``"/data/feats"``."""
    manifest = Features(
        type="fbank",
        num_frames=1000,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        storage_type="lilcom",
        storage_path="feats/",
        storage_key="12345.llc",
        start=0,
        duration=10,
    )
    feature_set = FeatureSet.from_features([manifest])
    prefixed = feature_set.with_path_prefix("/data")
    for item in prefixed:
        assert item.storage_path == "/data/feats"
示例#18
0
def dummy_feature_set():
    """Return a ``FeatureSet`` holding one 10 s fbank manifest for recording ``rec1``."""
    spec = {
        "recording_id": "rec1",
        "channels": 0,
        "start": 0,
        "duration": 10,
        "type": "fbank",
        "num_frames": 1000,
        "num_features": 23,
        "sampling_rate": 16000,
        "storage_type": "lilcom_files",
        "storage_path": "feats",
        "storage_key": "dummy.llc",
        "frame_shift": 0.01,
    }
    return FeatureSet.from_features([Features(**spec)])
示例#19
0
def dummy_feature_set_lazy():
    """
    Yield a ``FeatureSet`` opened with ``from_jsonl_lazy`` from a temporary
    ``.jsonl.gz`` file containing one manifest for recording ``rec1``.

    The temporary file stays open while the generator is suspended at the
    ``yield`` and is closed once the generator is resumed or closed.
    """
    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp:
        rec1_feats = Features(
            recording_id="rec1",
            channels=0,
            start=0,
            duration=10,
            type="fbank",
            num_frames=1000,
            num_features=23,
            sampling_rate=16000,
            storage_type="lilcom_files",
            storage_path="feats",
            storage_key="dummy.llc",
            frame_shift=0.01,
        )
        eager_set = FeatureSet.from_features([rec1_feats])
        eager_set.to_file(tmp.name)
        tmp.flush()
        yield FeatureSet.from_jsonl_lazy(tmp.name)
示例#20
0
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse RecordingSet,
    SupervisionSet and FeatureSet manifests.

    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    FeatureSet is created only when a feats.scp file exists, the
    ``kaldi_native_io`` module is available and ``frame_shift`` is specified.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet.

    :param path: the Kaldi data directory; it has to be an existing directory.
    :param sampling_rate: the sampling rate assigned to every recording
        (and to every imported feature manifest).
    :param frame_shift: the frame shift used to describe the entries of feats.scp.
        When feats.scp could be imported but ``frame_shift`` is ``None``,
        a warning is issued and the features are omitted.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param num_jobs: the number of worker processes used to compute
        the durations of the recordings.
    :return: a tuple of ``(recording_set, supervision_set, feature_set)``,
        where the last two items are ``None`` when they were not created.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    # A trailing pipe marks a command whose output is the audio;
                    # the pipe itself is stripped from the stored source.
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=(path_or_cmd[:-1]
                            if path_or_cmd.endswith("|") else path_or_cmd),
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id],
                                            sampling_rate),
            duration=durations[recording_id],
        ) for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise break the 4-way unpacking below.
            supervision_segments = [
                fields for fields in (sup_string.split() for sup_string in f)
                if fields
            ]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            ) for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io

            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    # With supervisions available, map the utterance back to
                    # its recording; otherwise use the utterance ID itself.
                    recording_id=(supervision_set[fix_id(utt_id)].recording_id
                                  if supervision_set is not None else utt_id),
                    channels=0,
                ) for utt_id, mat_shape in
                kaldi_native_io.SequentialMatrixShapeReader(f"scp:{feats_scp}"))
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set