Example #1
def test_extend_by_cut_with_supervision(
    cut_start,
    cut_duration,
    extend_duration,
    extend_direction,
    supervision_start,
    supervision_duration,
    expected_start,
    expected_end,
):
    recording = dummy_recording(int(uuid4()), duration=1.0)
    supervisions = SupervisionSet.from_segments([
        SupervisionSegment(
            id=int(uuid4()),
            recording_id=recording.id,
            start=supervision_start,
            duration=supervision_duration,
        )
    ])
    cut = dummy_cut(int(uuid4()),
                    start=cut_start,
                    duration=cut_duration,
                    supervisions=supervisions)
    extended_cut = cut.extend_by(duration=extend_duration,
                                 direction=extend_direction)
    assert isclose(extended_cut.supervisions[0].start, expected_start)
    assert isclose(extended_cut.supervisions[0].end, expected_end)
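The parametrize decorator is omitted from this excerpt; a hedged sketch of plausible cases follows (the values are illustrative, not from the source). Extending a cut to the left shifts its supervisions later relative to the new cut start, while extending to the right leaves their offsets unchanged.

# Illustrative parametrization; field order matches the test signature above.
EXTEND_BY_CASES = [
    # (cut_start, cut_duration, extend_duration, extend_direction,
    #  supervision_start, supervision_duration, expected_start, expected_end)
    (0.3, 0.5, 0.2, "left", 0.0, 0.3, 0.2, 0.5),   # left-extend shifts the supervision later
    (0.3, 0.5, 0.2, "right", 0.0, 0.3, 0.0, 0.3),  # right-extend leaves its offsets alone
]
# Applied with:
# @pytest.mark.parametrize(
#     "cut_start, cut_duration, extend_duration, extend_direction, "
#     "supervision_start, supervision_duration, expected_start, expected_end",
#     EXTEND_BY_CASES)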
Example #2
def supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id="segment-1",
            recording_id="recording-1",
            channel=0,
            start=0.1,
            duration=0.3,
            text="transcript of the first segment",
            language="english",
            speaker="Norman Dyhrentfurth",
            gender="male",
            alignment={
                "word": [
                    AlignmentItem(symbol="transcript",
                                  start=0.1,
                                  duration=0.08),
                    AlignmentItem(symbol="of", start=0.18, duration=0.02),
                    AlignmentItem(symbol="the", start=0.2, duration=0.03),
                    AlignmentItem(symbol="first", start=0.23, duration=0.07),
                    AlignmentItem(symbol="segment", start=0.3, duration=0.1),
                ]
            },
        )
    ])
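A short usage sketch for the fixture above: a SupervisionSet is iterable, a segment's end is derived from start + duration, and the word alignment is a plain list of AlignmentItem objects.

from math import isclose

sups = supervision_set()
(seg,) = sups                 # the set holds exactly one segment
assert isclose(seg.end, 0.4)  # end = start (0.1) + duration (0.3)
words = [ali.symbol for ali in seg.alignment["word"]]
assert words == ["transcript", "of", "the", "first", "segment"]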
Example #3
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "spontaneous".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #4
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
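For reference, the STM parser above assumes the standard field order <rec_id> <channel> <speaker> <start> <end> <label> <words...>; a hypothetical line (not from the corpus) and the split used above:

line = "TalkA_2011 1 TalkA_2011 15.29 17.71 <o,f0,male> but this is {NOISE} fine"
rec_id, _, _, start, end, _, *words = line.split()
assert (rec_id, float(start), float(end)) == ("TalkA_2011", 15.29, 17.71)
assert " ".join(words).replace("{NOISE}", "[NOISE]") == "but this is [NOISE] fine"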
Example #5
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
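Each line of raw_manifest_path is assumed to be a JSON object carrying at least the three fields the loop reads; a hypothetical entry (keys from the code, values illustrative):

import json

entry = json.loads(
    '{"audio_filepath": "wavs/0001.wav", '
    '"text": "hello world", '
    '"text_normalized": "Hello, world."}'
)
assert {"audio_filepath", "text", "text_normalized"} <= entry.keys()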
Example #6
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), (
            f'Mismatch: found {len(recordings)} '
            f'sphere files and {len(stms)} STM files. '
            f'You might be missing some parts of TEDLIUM...'
        )
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
Example #7
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            if split == 'training':
                manifests['train'] = manifests.pop('training')
                split = 'train'
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{split}.json')

    return manifests
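The transcript files are assumed to alternate "[<seconds>]" timestamp lines with text lines; a sketch of the pairing logic above (sliding_window is assumed to be the toolz/cytoolz helper the recipe imports):

from toolz import sliding_window

lines = ["[0.000]", "first utterance", "[2.500]", "second utterance", "[4.100]"]
lines += [None]  # the final timestamp only closes the last segment
pairs = list(sliding_window(2, zip(lines[::2], lines[1::2])))
# pairs[0] -> (("[0.000]", "first utterance"), ("[2.500]", "second utterance"))
# pairs[1] -> (("[2.500]", "second utterance"), ("[4.100]", None))
start = float(pairs[0][0][0][1:-1])  # strip the brackets -> 0.0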
Example #8
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #9
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f:
                idx_transcript = line.split()
                if len(idx_transcript) < 2:
                    logging.info(f"Skipping malformed transcript line: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])
        wav_scp_path = corpus_dir / f"{part}/wav.scp"
        with open(wav_scp_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormCN wav.scp entries",
                leave=False,
            ):
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
    return manifests
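The two inputs are assumed to look as follows: text.txt carries "<utterance-id> <transcript>" lines, while this recipe's wav.scp holds bare audio paths, one per line (not Kaldi's usual "<id> <path>" form, since each line is passed to process_file directly). A sketch of the transcript parsing:

line = "utt_0001 今天 天气 很 好"  # hypothetical text.txt line
idx_transcript = line.split()
utt_id, transcript = idx_transcript[0], " ".join(idx_transcript[1:])
assert (utt_id, transcript) == ("utt_0001", "今天 天气 很 好")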
Example #10
def supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='segment-1',
                           recording_id='recording-1',
                           channel=0,
                           start=0.1,
                           duration=0.3,
                           text='transcript of the first segment',
                           language='english',
                           speaker='Norman Dyhrentfurth',
                           gender='male')
    ])
Example #11
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
Example #12
def _prepare_voxceleb_v1(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    2 splits: train ("dev") and test.
    """
    speaker_metadata = {}
    with open(corpus_path / "vox1_meta.csv", "r") as f:
        next(f)
        for line in f:
            spkid, name, gender, nationality, split = line.strip().split("\t")
            speaker_metadata[spkid] = SpeakerMetadata(
                id=spkid, name=name, gender=gender, nationality=nationality, split=split
            )
    with ProcessPoolExecutor(num_jobs) as ex:
        recordings = []
        supervisions = []
        futures = []
        for p in (corpus_path / "wav").rglob("*.wav"):
            futures.append(ex.submit(_process_file, p, speaker_metadata))
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Processing VoxCeleb1",
            leave=False,
        ):
            recording, supervision = future.result()
            recordings.append(recording)
            supervisions.append(supervision)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = defaultdict(dict)
    # Split into dev and test sets based on the split of the speakers.
    for split in ("dev", "test"):
        manifests[split]["supervisions"] = supervision_set.filter(
            lambda s: s.custom["split"] == split
        )
        split_ids = [s.recording_id for s in manifests[split]["supervisions"]]
        manifests[split]["recordings"] = recording_set.filter(
            lambda r: r.id in split_ids
        )
    manifests["train"] = manifests.pop("dev")
    return manifests
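vox1_meta.csv is assumed to be tab-separated with a header row (skipped by the next(f) call); a hypothetical data row and the unpacking used above:

line = "id10001\tA.J._Buckley\tm\tIreland\tdev\n"  # illustrative row
spkid, name, gender, nationality, split = line.strip().split("\t")
assert (spkid, gender, split) == ("id10001", "m", "dev")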
Example #13
def _prepare_voxceleb_v2(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb2 corpus. The manifests are created the same dict
    without any splits since the whole data is used in the final "train" split.
    """
    # Read the speaker metadata.
    speaker_metadata = {}
    with open(corpus_path / "vox2_meta.csv", "r") as f:
        next(f)
        for line in f:
            spkid, _, gender, split = map(str.strip, line.split(","))
            speaker_metadata[spkid] = SpeakerMetadata(id=spkid,
                                                      name="",
                                                      gender=gender,
                                                      nationality="",
                                                      split=split)
    # Read the wav files and prepare manifests. VoxCeleb2 ships "dev" and "test"
    # portions; both end up in the final "train" split here.
    recordings = []
    supervisions = []
    with ProcessPoolExecutor(num_jobs) as ex:
        for split in ("dev", "test"):
            futures = []
            for p in (corpus_path / split).glob("*.wav"):
                futures.append(
                    ex.submit(_process_file, p, speaker_metadata, type="command"))
            for future in tqdm(
                    futures,
                    total=len(futures),
                    desc=f"Processing VoxCeleb2 {split} split...",
                    leave=False,
            ):
                recording, supervision = future.result()
                recordings.append(recording)
                supervisions.append(supervision)
    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = {
        "recordings": recording_set,
        "supervisions": supervision_set,
    }
    return manifests
Example #14
def prepare_music(
    corpus_dir: Path, use_vocals: bool = True
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    music_dir = corpus_dir / "music"
    recordings = scan_recordings(music_dir)
    supervisions = SupervisionSet.from_segments(
        SupervisionSegment(
            id=utt,
            recording_id=utt,
            start=0,
            duration=recordings.duration(utt),
            speaker=musician,
            custom={"genres": genres.split(","), "vocals": vocals == "Y"},
        )
        for file in music_dir.rglob("ANNOTATIONS")
        for utt, genres, vocals, musician in read_annotations(file, max_fields=4)
    )
    if not use_vocals:
        supervisions = supervisions.filter(lambda s: s.custom["vocals"] is False)
    return {"recordings": recordings, "supervisions": supervisions}
Example #15
def prepare_separate_phone_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('**/*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8), recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            supervisions.extend(segments)
            recordings.append(recording)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Example #16
def read_rttm(path: Pathlike) -> SupervisionSet:
    lines = Path(path).read_text().splitlines()
    sups = []
    rec_cntr = Counter()
    for line in lines:
        _, recording_id, channel, start, duration, _, _, speaker, _, _ = line.split()
        start, duration, channel = float(start), float(duration), int(channel)
        if duration == 0.0:
            continue
        rec_cntr[recording_id] += 1
        sups.append(
            SupervisionSegment(
                id=f"{recording_id}_{rec_cntr[recording_id]}",
                recording_id=recording_id,
                start=start,
                duration=duration,
                channel=channel,
                speaker=f"{recording_id}_{speaker}",
                language="English",
            ))
    return SupervisionSet.from_segments(sups)
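A minimal usage sketch: write one standard 10-field RTTM "SPEAKER" line (values illustrative) and read it back; note how the speaker label gets prefixed with the recording id.

from pathlib import Path

Path("example.rttm").write_text(
    "SPEAKER rec1 1 10.50 2.30 <NA> <NA> spkA <NA> <NA>\n"
)
sups = read_rttm("example.rttm")
seg = next(iter(sups))
assert seg.speaker == "rec1_spkA" and seg.channel == 1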
Example #17
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSameCloseMic').glob('*.wav'),
            desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_wav(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Example #18
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            age = None
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #19
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - First it looks for the audio directory in the data/wav where the .flac
            files are stored.
        - Then, it looks for the vtt directory in data/{train,dev,test}/vtt
            which contains the segmentation and transcripts for the audio.
        - The transcripts undergo some basic text normalization

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return:
    """
    if isinstance(corpus_dir, str):
        corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            supervisions = []
            text_dir = corpus_dir / f"data/{split}/vtt"
            futures = []
            for p in text_dir.glob("*"):
                futures.append(ex.submit(_filename_to_supervisions, p, language))

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                for sup in result:
                    supervisions.append(sup)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                if isinstance(output_dir, str):
                    output_dir = Path(output_dir)
                output_dir.mkdir(parents=True, exist_ok=True)
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
Example #20
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "spontaneous".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = (f'suitcase_corpus-{speaker}'
                  if is_suitcase_corpus else f'{speaker}-{path.stem}')
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings': recordings.filter(
                lambda r: 'suitcase_corpus' not in r.id),
            'supervisions': supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings': recordings.filter(
                lambda r: 'suitcase_corpus' in r.id),
            'supervisions': supervisions.filter(
                lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
Example #21
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'sdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes.
    :return: a dict with keys 'recordings' and 'supervisions'.

    """
    assert type in ["mdm", "sdm", "ihm-mix", "ihm"]

    manifests = {}

    corpus_dir = Path(corpus_dir)
    corpus_dir = (corpus_dir / "for_release"
                  if corpus_dir.stem != "for_release" else corpus_dir)

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:  # "mdm"/"sdm": the raw multi-channel recording
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))
            for idx, seg in enumerate(
                    parse_transcript(session / "transcription" /
                                     "meeting_info.txt")):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm" else 0,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
Example #22
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict keyed by split ('train', 'devtest', 'evaltest'), each holding
        ``{'recordings': ..., 'supervisions': ...}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
Example #23
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                                   output_dir=output_dir,
                                                   prefix='libritts')
        if maybe_manifests is not None:
            return maybe_manifests

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split('|')
            for line in (corpus_dir / 'SPEAKERS.txt').read_text().splitlines()
            if not line.startswith(';')
        )
    }

    manifests = defaultdict(dict)
    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           '*.wav',
                                           num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
                part_path.rglob('*.trans.tsv'),
                desc='Scanning transcript files (progbar per speaker)',
                leave=False):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = {
                rec_id: float(snr)
                for rec_id, *_, snr in map(str.split, (
                    trans_path.parent /
                    trans_path.name.replace('.trans.tsv', '.book.tsv')
                ).read_text().splitlines())
            }
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split('\t')
                spk_id = rec_id.split('_')[0]
                supervisions.append(
                    SupervisionSegment(id=rec_id,
                                       recording_id=rec_id,
                                       start=0.0,
                                       duration=recordings[rec_id].duration,
                                       channel=0,
                                       text=norm_text,
                                       language='English',
                                       speaker=spk_id,
                                       gender=spk2gender[spk_id],
                                       custom={
                                           'orig_text': orig_text,
                                           'snr': utt2snr[rec_id]
                                       }))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir /
                                 f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return dict(manifests)  # Convert to normal dict
Example #24
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
    :param buck_walter: Bool, use BuckWalter transliteration
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Read the recordings and write them into manifest. We additionally store the
        # duration of the recordings in a dict which will be used later to create the
        # supervisions.

        if part in ("test", "dev"):
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    # ``line`` keeps its trailing newline, so no extra write is needed.
                    f_out.write(line.replace("wav/", f"{corpus_dir}/{part}/wav/"))

            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Read supervisions and write them to manifest
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            if text_cleaning:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # Save the manifests to disk.
        recordings.to_file(output_dir / f"mgb2_recordings_{part}.jsonl.gz")
        supervisions.to_file(output_dir / f"mgb2_supervisions_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
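
# A minimal usage sketch for the recipe above (the paths are hypothetical;
# corpus_dir must contain the dev/train/test layout the function expects):
mgb2_manifests = prepare_mgb2(
    corpus_dir="/data/mgb2",
    output_dir="data/manifests/mgb2",
    text_cleaning=True,
    buck_walter=False,
    num_jobs=4,
    mer_thresh=80,
)
print(sorted(mgb2_manifests))  # ['dev', 'test', 'train']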
Example #27
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: a Dict whose key is the data split ('dev', 'eval', 'training'), and the value is a Dict with the keys 'recordings' and 'supervisions'.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    candidates = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not candidates:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(candidates) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({candidates[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = candidates[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Error while parsing segments (file: '{p}'). "
                        f"Please check your data.") from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have the corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir /
                               f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir /
                                 f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
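
# A usage sketch for the BABEL recipe above (hypothetical LDC package path);
# no_eval_ok=True suppresses the warning when the eval split has no audio:
babel_manifests = prepare_single_babel_language(
    corpus_dir="/data/IARPA_BABEL_BP_101",
    output_dir="data/manifests/babel",
    no_eval_ok=True,
)
for split in ("dev", "eval", "training"):
    if split in babel_manifests:
        print(split, len(babel_manifests[split]["supervisions"]))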
Example #28
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            postprocessed_lines = list()
            # First pass: a line that fails to parse as "<start> <end> <spk> <text>"
            # is a continuation of the previous utterance - glue it back on.
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    postprocessed_lines[-1] += " " + line

            for line in postprocessed_lines:
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
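
# A usage sketch for the recipe above (hypothetical mount points for the two
# LDC packages - the audio and the transcripts ship separately):
callhome_manifests = prepare_callhome_english_asr(
    audio_dir="/data/LDC97S42",
    transcript_dir="/data/LDC97T14",
    output_dir="data/manifests/callhome",
    absolute_paths=True,
)
print(sorted(callhome_manifests))  # ['devtest', 'evaltest', 'train']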
Example #29
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (main1_sets.exists() and main1_wav.exists()
            ), f"Have you run data preparation in {phase1_main}?"
    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(), f"Out of domain track dir is missing {phase1_ood}"
    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (ood1_sets.exists() and ood1_wav.exists()
            ), f"Have you run data preparation in {phase1_ood}?"
    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav,
                                       pattern="*.wav",
                                       num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    main1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_devp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
    manifests["main1_dev"] = {
        "recordings": main1_dev_recs,
        "supervisions": main1_dev_sup,
    }

    logging.info("Preparing main1_train")
    main1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_trainp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
    manifests["main1_train"] = {
        "recordings": main1_train_recs,
        "supervisions": main1_train_sup,
    }

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip() for name in open(ood1_unlabeled).readlines()
    ]
    manifests["ood1_unlabeled"] = {
        "recordings":
        RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths)
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav,
                                      pattern="*.wav",
                                      num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    ood1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_devp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
    manifests["ood1_dev"] = {
        "recordings": ood1_dev_recs,
        "supervisions": ood1_dev_sup,
    }

    logging.info("Preparing ood1_train")
    ood1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_trainp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
    manifests["ood1_train"] = {
        "recordings": ood1_train_recs,
        "supervisions": ood1_train_sup,
    }

    # Optionally serialize the manifests to disk
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir / f"recordings_{part}.jsonl.gz")
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir /
                                          f"supervisions_{part}.jsonl.gz")

    return manifests
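
# A usage sketch for the BVCC recipe above (hypothetical corpus path that
# contains the "phase1-main" and "phase1-ood" directories checked above).
# Note that "ood1_unlabeled" holds recordings only, with no supervisions:
bvcc_manifests = prepare_bvcc("/data/bvcc", output_dir="data/manifests/bvcc")
print(sorted(bvcc_manifests))
# ['main1_dev', 'main1_train', 'ood1_dev', 'ood1_train', 'ood1_unlabeled']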
Example #30
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-360', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If True, adds the previous utterance id to supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If the previous utterance was skipped in the LibriTTS dataset, the 'prev_utt' label is None.
    :return: a Dict whose key is the dataset part, and the value is a Dict with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix="libritts")

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (line.split("|") for line in (
            corpus_dir / "SPEAKERS.txt").read_text().splitlines()
                                   if not line.startswith(";"))
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir,
                           prefix="libritts"):
            logging.info(
                f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           "*.wav",
                                           num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
                part_path.rglob("*.trans.tsv"),
                desc="Scanning transcript files (progbar per speaker)",
                leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = [(rec_id, float(snr)) for rec_id, *_, snr in map(
                str.split,
                (trans_path.parent /
                 trans_path.name.replace(".trans.tsv", ".book.tsv")
                 ).read_text().splitlines(),
            )]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Using the property of sorted keys to find the previous utterance.
                # The keys have the structure speaker_book_x_y, e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                customd = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # All recording ids should be in the book.tsv,
                    # but some are missing, e.g. 446_123502_000030_000003
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # The previous utterance has to be present in trans.tsv - otherwise it was skipped.
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    ))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"libritts_supervisions_{part}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
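
# A usage sketch for the LibriTTS recipe above (hypothetical corpus path),
# preparing two parts and linking each utterance to the previous one:
libritts_manifests = prepare_libritts(
    corpus_dir="/data/LibriTTS",
    dataset_parts=["dev-clean", "test-clean"],
    output_dir="data/manifests/libritts",
    num_jobs=4,
    link_previous_utt=True,
)
first_sup = next(iter(libritts_manifests["dev-clean"]["supervisions"]))
print(first_sup.custom["prev_utt"])  # None for the first utterance in a chain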