Example #1
def test_overlay_cut_duration_and_supervisions(offset, allow_padding,
                                               expected_duration,
                                               exception_expectation, cut1,
                                               cut2):
    with exception_expectation:
        mixed_cut = cut1.mix(cut2,
                             offset_other_by=offset,
                             allow_padding=allow_padding)

        assert isinstance(mixed_cut, MixedCut)
        assert mixed_cut.duration == expected_duration
        assert mixed_cut.supervisions == [
            SupervisionSegment(id="sup-1",
                               recording_id="irrelevant",
                               start=0.5,
                               duration=6.0),
            SupervisionSegment(id="sup-2",
                               recording_id="irrelevant",
                               start=7.0,
                               duration=2.0),
            SupervisionSegment(id="sup-3",
                               recording_id="irrelevant",
                               start=3.0 + offset,
                               duration=2.5),
        ]
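A minimal follow-up sketch, assuming the same cut1/cut2 fixtures (cut1 is 10.0 s long and sup-3 starts at 3.0 s inside cut2): mixing with the offset set to the first cut's duration reproduces append from Example #7.
# Sketch only: mix with offset_other_by=cut1.duration behaves like append,
# so sup-3 (start=3.0 inside cut2) lands at 10.0 + 3.0 = 13.0.
appended = cut1.mix(cut2, offset_other_by=cut1.duration)
assert appended.supervisions[-1].start == 13.0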
Example #2
def search_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='s1', recording_id='r1', start=0, duration=5.0, channel=0),
        SupervisionSegment(id='s2', recording_id='r1', start=4.5, duration=2.0, channel=1),
        SupervisionSegment(id='s3', recording_id='r1', start=8.0, duration=3.0, channel=0),
        SupervisionSegment(id='s4', recording_id='r2', start=1, duration=5.0, channel=0),
    ])
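This fixture looks like it backs tests of SupervisionSet.find(); a hedged lookup sketch (find() and its recording_id keyword are assumed from lhotse and may vary by version):
# Sketch only: fetch the segments attached to recording 'r1'
# (three of the four segments above: s1, s2, s3).
matches = list(search_supervision_set().find(recording_id='r1'))
assert len(matches) == 3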
Example #3
def overlapping_supervisions_cut():
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=0.5,
        channel=0,
        features=Features(
            recording_id="recording-1",
            channels=0,
            start=0,
            duration=0.5,
            type="fbank",
            num_frames=50,
            num_features=80,
            frame_shift=0.01,
            sampling_rate=16000,
            storage_type="lilcom",
            storage_path="test/fixtures/dummy_feats/storage/",
            storage_key="e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc",
        ),
        supervisions=[
            SupervisionSegment(
                id="s1", recording_id="recording-1", start=0.0, duration=0.2
            ),
            SupervisionSegment(
                id="s2", recording_id="recording-1", start=0.1, duration=0.2
            ),
            SupervisionSegment(
                id="s3", recording_id="recording-1", start=0.2, duration=0.2
            ),
            SupervisionSegment(
                id="s4", recording_id="recording-1", start=0.3, duration=0.2
            ),
        ],
    )
Example #4
def overlapping_supervisions_cut():
    return Cut(
        id='cut-1',
        start=0.0,
        duration=0.5,
        channel=0,
        features=Features(
            recording_id='recording-1',
            channels=0,
            start=0,
            duration=0.5,
            type='fbank',
            num_frames=50,
            num_features=80,
            frame_shift=0.01,
            sampling_rate=16000,
            storage_type='lilcom',
            storage_path='test/fixtures/dummy_feats/storage/',
            storage_key='e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc'
        ),
        supervisions=[
            SupervisionSegment(id='s1', recording_id='recording-1', start=0.0, duration=0.2),
            SupervisionSegment(id='s2', recording_id='recording-1', start=0.1, duration=0.2),
            SupervisionSegment(id='s3', recording_id='recording-1', start=0.2, duration=0.2),
            SupervisionSegment(id='s4', recording_id='recording-1', start=0.3, duration=0.2)
        ]
    )
Example #5
File: test_cut.py Project: glynpu/lhotse
def dummy_supervision_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        sups = SupervisionSet.from_segments(
            [
                SupervisionSegment(
                    id="sup1",
                    recording_id="rec1",
                    start=3,
                    duration=4,
                    channel=0,
                    text="dummy text",
                ),
                SupervisionSegment(
                    id="sup2",
                    recording_id="rec1",
                    start=7,
                    duration=2,
                    channel=0,
                    text="dummy text",
                ),
            ]
        )
        sups.to_file(f.name)
        f.flush()
        yield SupervisionSet.from_jsonl_lazy(f.name)
Example #6
def test_supervision_set_iteration():
    supervision_set = SupervisionSet(
        segments={
            'X': SupervisionSegment(id='X', recording_id='X', channel=0, start=2.0, duration=2.5),
            'Y': SupervisionSegment(id='Y', recording_id='X', channel=0, start=5.0, duration=5.0),
        }
    )
    assert 2 == len(supervision_set)
    assert 2 == len(list(supervision_set))
Example #7
def test_append_cut_duration_and_supervisions(cut1, cut2):
    appended_cut = cut1.append(cut2)

    assert isinstance(appended_cut, MixedCut)
    assert appended_cut.duration == 20.0
    assert appended_cut.supervisions == [
        SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
        SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
        SupervisionSegment(id='sup-3', recording_id='irrelevant', start=13.0, duration=2.5)
    ]
Example #8
def test_supervision_custom_attributes():
    sup = SupervisionSegment(id="X", recording_id="X", start=0.0, duration=0.1)
    sup.eye_color = "green"
    sup.wer = 0.41

    assert sup.eye_color == "green"
    assert sup.custom["eye_color"] == "green"

    assert sup.wer == 0.41
    assert sup.custom["wer"] == 0.41

    with pytest.raises(AttributeError):
        sup.nonexistent_attr
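The same custom fields can also be supplied up front via the custom argument, as Example #18 does; a minimal sketch:
# Sketch: fields passed through custom= are readable as attributes too.
sup2 = SupervisionSegment(id="Y", recording_id="Y", start=0.0, duration=0.1,
                          custom={"wer": 0.41})
assert sup2.wer == 0.41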
Example #9
def test_known_issue_with_overlap():
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first. Both have the
    # same recording_id as the single entry in rec.
    sup = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )

    cuts = CutSet.from_manifests(recordings=rec, supervisions=sup)
    assert len(cuts) == 1

    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 1
    assert sup.text == "Hello"

    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 0.5
    assert sup.text == "World"
Example #10
def cut1(dummy_features):
    return Cut(id='cut-1',
               start=0.0,
               duration=10.0,
               features=dummy_features,
               supervisions=[
                   SupervisionSegment(id='sup-1',
                                      recording_id='irrelevant',
                                      start=0.5,
                                      duration=6.0),
                   SupervisionSegment(id='sup-2',
                                      recording_id='irrelevant',
                                      start=7.0,
                                      duration=2.0)
               ])
Example #11
def prepare_supervision_other(
        audio: RecordingSet,
        annotations: Dict[str, List[AmiSegmentAnnotation]]) -> SupervisionSet:
    annotation_by_id = {(key[0]): annot for key, annot in annotations.items()}

    segments = []
    for recording in audio:
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm, 16 for mdm)
        # will share the supervision.
        source = recording.sources[0]
        if annotation is None:
            logging.warning(
                f'No annotation found for recording {recording.id}')
            continue

        if len(source.channels) > 1:
            logging.warning(
                f'More than 1 channel in recording {recording.id}. '
                f'Creating supervision for channel 0 only.')

        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.begin_time
            if duration > 0:
                segments.append(
                    SupervisionSegment(id=f'{recording.id}-{seg_idx}',
                                       recording_id=recording.id,
                                       start=seg_info.begin_time,
                                       duration=duration,
                                       channel=0,
                                       language='English',
                                       speaker=seg_info.speaker,
                                       gender=seg_info.gender,
                                       text=seg_info.text))
    return SupervisionSet.from_segments(segments)
Example #12
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    sampling_rate = int(audio["sample_rate"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=Seconds(audio["duration"]), sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=Seconds(audio["duration"]),
    )
    segments = []
    for seg in audio["segments"]:
        segments.append(
            SupervisionSegment(
                id=seg["sid"],
                recording_id=audio["aid"],
                start=Seconds(seg["begin_time"]),
                duration=round(Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8),
                channel=0,
                language="English",
                speaker=seg["speaker"],
                text=seg["text_tn"],
            )
        )
    return recording, segments
Example #13
def create_supervision(
    sessions_and_transcript_path: Tuple[Dict[str, Dict[str, str]], Pathlike]
) -> List[SupervisionSegment]:

    sessions, transcript_path = sessions_and_transcript_path
    transcript_path = Path(transcript_path)
    with codecs.open(transcript_path, "r", "utf8") as trans_f:

        lines = [l.rstrip("\n") for l in trans_f.readlines()][3:]
        lines = [l.split("\t") for l in lines if l.strip() != ""]
        lines = [
            [
                float(l[2]),
                float(l[3]),
                int(l[1]),
                " ".join([w for w in l[7].split() if w.strip() != ""]),
            ]
            for l in lines
        ]

        segments = [
            SupervisionSegment(
                id=transcript_path.stem + "-" + str(k).zfill(len(str(len(lines)))),
                recording_id=transcript_path.stem,
                start=round(l[0], 10),
                duration=round(l[1] - l[0], 10),
                channel=l[2],
                text=l[3],
                language="Spanish",
                speaker=sessions[transcript_path.stem.split("_")[2]][l[2]],
            )
            for k, l in enumerate(lines)
        ]

    return segments
Example #14
def parse_utterance(
        audio: Any,
        root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    # Opus-format audio is always decoded at 48kHz, regardless of the original sampling rate.
    opus_decoding_sample_rate = 48000

    recording = Recording(id=audio['aid'],
                          sources=[AudioSource(type='file',
                                               channels=list(range(int(audio['channels']))),
                                               source=f'{root_path}/{audio["path"]}')],
                          num_samples=round(opus_decoding_sample_rate * Seconds(audio['duration']), ndigits=8),
                          sampling_rate=opus_decoding_sample_rate,
                          duration=Seconds(audio['duration'])).resample(int(audio['sample_rate']))
    segments = []
    for seg in audio['segments']:
        segments.append(SupervisionSegment(id=seg['sid'],
                                           recording_id=audio['aid'],
                                           start=Seconds(seg['begin_time']),
                                           duration=round(Seconds(seg['end_time'] - seg['begin_time']), ndigits=8),
                                           channel=0,
                                           language='English',
                                           speaker=seg['speaker'],
                                           text=seg['text_tn']))
    return recording, segments
Example #15
def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(duration=audio["duration"],
                                        sampling_rate=sampling_rate),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    segments = {sub: [] for sub in subsets}
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            duration=add_durations(seg["end_time"], -seg["begin_time"],
                                   sampling_rate),
            language="Chinese",
            text=seg["text"].strip(),
        )
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
Example #16
def dummy_supervision(unique_id: int,
                      start: float = 0.0,
                      duration: float = 1.0) -> SupervisionSegment:
    return SupervisionSegment(id=f'dummy-segment-{unique_id:04d}',
                              recording_id='dummy-recording',
                              start=start,
                              duration=duration)
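A minimal sketch of this factory in use; from_segments accepts any iterable of segments (cf. Example #30):
# Sketch: assemble a few consecutive dummy segments into a SupervisionSet.
sups = SupervisionSet.from_segments(
    dummy_supervision(i, start=float(i)) for i in range(3))
assert len(sups) == 3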
Example #17
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    recording_id, text = line.strip().split(maxsplit=1)
    # Create the Recording first
    audio_path = (dataset_split_path /
                  Path(recording_id.replace("-", "/")).parent /
                  f"{recording_id}.flac")
    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments else None,
    )
    return recording, segment
Example #18
def parse_utterance(row: Any, lang_path: Path,
                    language: str) -> Tuple[Recording, SupervisionSegment]:
    # Create the Recording first
    audio_path = lang_path / "clips" / row.path
    if not audio_path.is_file():
        raise ValueError(f"No such file: {audio_path}")
    recording_id = Path(row.path).stem
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        # Look up the language code => language name mapping (empty at the time of writing);
        # if the language code is unknown, fall back to the code itself.
        language=COMMONVOICE_CODE2LANG.get(language, language),
        speaker=row.client_id,
        text=row.sentence.strip(),
        gender=row.gender if row.gender != "nan" else None,
        custom={
            "age": row.age if row.age != "nan" else None,
            "accent": row.accent if row.accent != "nan" else None,
        },
    )
    return recording, segment
Example #19
def dummy_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='sup1',
                           recording_id='rec1',
                           start=3,
                           duration=4,
                           channel=0,
                           text='dummy text')
    ])
Example #20
File: cut.py Project: popcornell/lhotse
def from_dict(data: dict) -> 'Cut':
    feature_info = data.pop('features')
    supervision_infos = data.pop('supervisions')
    return Cut(**data,
               features=Features.from_dict(feature_info),
               supervisions=[
                   SupervisionSegment.from_dict(s)
                   for s in supervision_infos
               ])
Example #21
File: dummies.py Project: m-wiesner/lhotse
def dummy_supervision(unique_id: int,
                      start: float = 0.0,
                      duration: float = 1.0,
                      text: str = "irrelevant") -> SupervisionSegment:
    return SupervisionSegment(id=f'dummy-segment-{unique_id:04d}',
                              recording_id=f'dummy-recording-{unique_id:04d}',
                              start=start,
                              duration=duration,
                              text=text)
Example #22
def cut1(dummy_features, dummy_recording):
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=dummy_features,
        recording=dummy_recording,
        supervisions=[
            SupervisionSegment(id="sup-1",
                               recording_id="irrelevant",
                               start=0.5,
                               duration=6.0),
            SupervisionSegment(id="sup-2",
                               recording_id="irrelevant",
                               start=7.0,
                               duration=2.0),
        ],
    )
Example #23
def test_create_supervision_segment_with_all_metadata():
    SupervisionSegment(id='X',
                       recording_id='X',
                       start=0.0,
                       duration=0.1,
                       channel=0,
                       text='wysokie szczyty',
                       language='polish',
                       speaker='Janusz',
                       gender='male')
Example #24
def cut2(dummy_features):
    return Cut(id='cut-2',
               start=180.0,
               duration=10.0,
               features=dummy_features,
               supervisions=[
                   SupervisionSegment(id='sup-3',
                                      recording_id='irrelevant',
                                      start=3.0,
                                      duration=2.5)
               ])
Example #25
def dummy_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id="sup1",
            recording_id="rec1",
            start=3,
            duration=4,
            channel=0,
            text="dummy text",
        )
    ])
Example #26
def make_uem_segments(uem_path: Pathlike,
                      recording: Recording) -> List[SupervisionSegment]:
    lines = uem_path.read_text().splitlines()
    return [
        SupervisionSegment(
            id=f"{recording.id}-{int(100*float(start)):06d}-{int(100*float(end)):06d}",
            recording_id=recording.id,
            start=float(start),
            duration=round(float(end) - float(start), ndigits=8),
        ) for _, _, start, end in map(str.split, lines)
    ]
Example #27
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
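A hedged usage sketch; the corpus and output paths below are hypothetical:
# Sketch: prepare the LJSpeech manifests and unpack the returned dict.
manifests = prepare_ljspeech("corpora/LJSpeech-1.1", output_dir="data/manifests")
recording_set = manifests["recordings"]
supervision_set = manifests["supervisions"]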
Example #28
def test_overlay_cut_duration_and_supervisions(offset, expected_duration,
                                               exception_expectation, cut1,
                                               cut2):
    with exception_expectation:
        mixed_cut = cut1.overlay(cut2, offset_other_by=offset)

        assert isinstance(mixed_cut, MixedCut)
        assert mixed_cut.duration == expected_duration
        assert mixed_cut.supervisions == [
            SupervisionSegment(id='sup-1',
                               recording_id='irrelevant',
                               start=0.5,
                               duration=6.0),
            SupervisionSegment(id='sup-2',
                               recording_id='irrelevant',
                               start=7.0,
                               duration=2.0),
            SupervisionSegment(id='sup-3',
                               recording_id='irrelevant',
                               start=3.0 + offset,
                               duration=2.5)
        ]
Example #29
def dummy_supervision(
    unique_id: int,
    start: float = 0.0,
    duration: float = 1.0,
    text: str = "irrelevant",
    alignment: Optional[Dict[str, List[AlignmentItem]]] = dummy_alignment()
) -> SupervisionSegment:
    return SupervisionSegment(id=f'dummy-segment-{unique_id:04d}',
                              recording_id=f'dummy-recording-{unique_id:04d}',
                              start=start,
                              duration=duration,
                              text=text,
                              alignment=alignment)
Example #30
def make_corresponding_supervisions(audio: RecordingSet) -> SupervisionSet:
    """
    Prepare a supervision set - in this case it just describes
    which segments are available in the corpus, as the actual supervisions for
    speech separation come from the source recordings.
    """
    return SupervisionSet.from_segments(
        SupervisionSegment(
            id=f"{recording.id}-c{source.channels[0]}",
            recording_id=recording.id,
            start=0.0,
            duration=recording.duration,
            channel=source.channels[0],
        ) for recording in audio for source in recording.sources)
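A hedged sketch of how this pairs with the rest of the pipeline; audio is an existing RecordingSet, and CutSet.from_manifests is used as in Example #9:
# Sketch: describe every source channel, then build cuts over the corpus.
supervisions = make_corresponding_supervisions(audio)
cuts = CutSet.from_manifests(recordings=audio, supervisions=supervisions)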