Example #1
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #2
def parse_icsi_annotations(
    transcripts_dir: Pathlike,
    normalize: str = "upper"
) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
    """
    Parse the ICSI .mrt transcript files.

    :param transcripts_dir: Pathlike, the path which holds the transcripts data.
    :param normalize: str, text normalization mode passed to normalize_text.
    :return: a tuple of (annotations, channel_to_idx_map), where annotations maps
        (meeting_id, speaker, channel) to lists of IcsiSegmentAnnotation, and
        channel_to_idx_map maps each meeting's channel names to integer indices.
    """
    annotations = defaultdict(list)
    # In Lhotse, channels are integers, so we map channel ids to integers for each session
    channel_to_idx_map = defaultdict(dict)
    spk_to_channel_map = defaultdict(dict)

    # First we get global speaker ids and channels
    for meeting_file in tqdm(transcripts_dir.rglob("./*.mrt"),
                             desc="Parsing ICSI mrt files"):
        if meeting_file.stem == "preambles":
            continue
        with open(meeting_file) as f:
            meeting_id = meeting_file.stem
            root = ET.parse(f).getroot()  # <Meeting>
            for child in root:
                if child.tag == "Preamble":
                    for grandchild in child:
                        if grandchild.tag == "Channels":
                            channel_to_idx_map[meeting_id] = {
                                channel.attrib["Name"]: idx
                                for idx, channel in enumerate(grandchild)
                            }
                        elif grandchild.tag == "Participants":
                            for speaker in grandchild:
                                # some speakers may not have an associated channel in some meetings, so we
                                # assign them the SDM channel
                                spk_to_channel_map[meeting_id][speaker.attrib["Name"]] = (
                                    speaker.attrib["Channel"]
                                    if "Channel" in speaker.attrib
                                    else "chan6"
                                )
                elif child.tag == "Transcript":
                    for segment in child:
                        if len(list(segment)) == 0 and "Participant" in segment.attrib:
                            start_time = float(segment.attrib["StartTime"])
                            end_time = float(segment.attrib["EndTime"])
                            speaker = segment.attrib["Participant"]
                            channel = spk_to_channel_map[meeting_id][speaker]
                            text = normalize_text(segment.text.strip(),
                                                  normalize=normalize)
                            annotations[(meeting_id, speaker, channel)].append(
                                IcsiSegmentAnnotation(
                                    text,
                                    speaker,
                                    channel,
                                    speaker[0],
                                    start_time,
                                    end_time,
                                ))
    return annotations, channel_to_idx_map
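
A sketch of how the returned structures might be consumed; the transcripts path is hypothetical, and the key layout follows the code above ((meeting_id, speaker, channel) tuples mapping to lists of IcsiSegmentAnnotation):

from pathlib import Path

# Hypothetical location of the unpacked ICSI .mrt transcripts.
annotations, channel_to_idx_map = parse_icsi_annotations(
    Path("/data/icsi/transcripts"), normalize="upper"
)

for (meeting_id, speaker, channel), segments in annotations.items():
    # Map the channel name (e.g. "chan6") to the integer index used by Lhotse.
    channel_idx = channel_to_idx_map[meeting_id].get(channel)
    print(meeting_id, speaker, channel, channel_idx, len(segments))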
Example #3
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            age = None
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #4
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
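
A usage sketch for the two splits returned above ("read" and "suitcase"); the paths are hypothetical:

# Hypothetical corpus and output locations.
splits = prepare_l2_arctic(
    corpus_dir="/data/l2_arctic",
    output_dir="/data/manifests/l2_arctic",
)

for name, manifests in splits.items():
    recs = manifests["recordings"]
    sups = manifests["supervisions"]
    hours = sum(r.duration for r in recs) / 3600
    print(f"{name}: {len(recs)} recordings, {len(sups)} supervisions, {hours:.1f}h")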
Example #5
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile('Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile('Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile('Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile('usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)

    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'Unrecognized file, skipping: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # soundfile.info returns the sample rate, number of frames, duration, etc.
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.samplerate),
                      num_samples=metadata[idx].audio_info.frames,
                      duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
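
A sketch of calling the recipe above; both directory paths are hypothetical, and the returned dict is keyed by fold ('train', 'devtest', 'test', following the function body):

# Hypothetical paths to the Heroico speech and transcript directories.
manifests = prepare_heroico(
    speech_dir="/data/heroico/speech",
    transcript_dir="/data/heroico/transcripts",
    output_dir="/data/manifests/heroico",
)

# One entry per fold, each holding a RecordingSet and a SupervisionSet.
for fold, fold_manifests in manifests.items():
    print(fold,
          len(fold_manifests["recordings"]),
          len(fold_manifests["supervisions"]))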
Example #6
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: a dict of {split: {'recordings': ..., 'supervisions': ...}} for the
        'dev', 'eval', and 'training' splits that were found.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold.")
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir /
                               f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir /
                                 f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
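
A sketch of preparing a single BABEL language with the function above; the package path is hypothetical:

# Hypothetical path to one BABEL LDC package (a single language).
manifests = prepare_single_babel_language(
    corpus_dir="/data/babel/101",
    output_dir="/data/manifests/babel",
    no_eval_ok=True,  # don't warn if the eval transcripts are missing
)

# Keys follow the package layout: 'dev', 'eval' (if present), and 'training'.
for split, split_manifests in manifests.items():
    print(split,
          len(split_manifests["recordings"]),
          len(split_manifests["supervisions"]))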
Example #7
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param audio_dir: Pathlike, the path which holds the audio data
    :param transcripts_dir: Pathlike, the path which holds the transcripts data
    :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")

    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None)
    elif mic == "sdm" or mic == "ihm-mix":
        audio_paths = (audio_dir.rglob(f"chan[{channels}].sph")
                       if len(channels) else audio_dir.rglob("*.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (
        prepare_supervision_ihm(audio, annotations, channel_to_idx_map)
        if mic == "ihm"
        else prepare_supervision_other(audio, annotations)
    )

    manifests = defaultdict(dict)

    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in PARTITIONS[part])

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
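
A sketch of running the ICSI recipe above for individual headset microphones ('ihm'); all paths are hypothetical:

# Hypothetical locations of the ICSI audio and .mrt transcripts.
manifests = prepare_icsi(
    audio_dir="/data/icsi/audio",
    transcripts_dir="/data/icsi/transcripts",
    output_dir="/data/manifests/icsi",
    mic="ihm",
    normalize_text="kaldi",
)

# The result is keyed by partition, following the PARTITIONS mapping used above.
for part in ("train", "dev", "test"):
    print(part,
          len(manifests[part]["recordings"]),
          len(manifests[part]["supervisions"]))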
Example #8
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f"{wav.parent.parent.name.lower()}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = "suitcase_corpus" in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = (f"suitcase_corpus-{speaker}"
                  if is_suitcase_corpus else f"{speaker}-{path.stem}")
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        "read": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" not in r.id),
            "supervisions":
            supervisions.filter(
                lambda s: "suitcase_corpus" not in s.recording_id),
        },
        "suitcase": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" in r.id),
            "supervisions":
            supervisions.filter(lambda s: "suitcase_corpus" in s.recording_id),
        },
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{key}.jsonl.gz")
            manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{key}.jsonl.gz")

    return splits
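
A follow-up sketch that turns one of the returned splits into a CutSet for downstream processing; CutSet.from_manifests is Lhotse's standard way to combine recordings and supervisions, and the corpus path is hypothetical:

from lhotse import CutSet

# Hypothetical corpus location; reuses the recipe defined above.
splits = prepare_l2_arctic(corpus_dir="/data/l2_arctic")

# Combine the "read" split's manifests into cuts (one cut per recording).
cuts = CutSet.from_manifests(
    recordings=splits["read"]["recordings"],
    supervisions=splits["read"]["supervisions"],
)
print(cuts)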