예제 #1
0
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the recording and supervision manifests for the Switchboard corpus.

    Every ``*.sph`` file found under ``audio_dir`` is paired with the two ``*trans.text``
    transcripts of the same conversation: side ``A`` becomes channel 0 and side ``B`` channel 1.
    When ``sentiment_dir`` is given, it is handed to ``parse_and_add_sentiment_labels`` together
    with the supervisions (presumably the labels are attached in place -- the call's return
    value is not used here).

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When ``False``, recordings are created with ``relative_path_depth=3``;
        when ``True``, no relative path depth is applied.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()

    sph_files = check_and_rglob(audio_dir, '*.sph')
    transcript_files = check_and_rglob(transcripts_dir, '*trans.text')

    # Index transcripts by the part of the file name that precedes the first dash.
    transcript_by_name = {}
    for transcript in transcript_files:
        transcript_by_name[transcript.stem.split('-')[0]] = transcript

    def _group_for(sph_path):
        # Audio files are named "sw0NNNN" whereas transcripts use "swNNNN{A,B}".
        conversation = sph_path.stem.replace('sw0', 'sw')
        return {
            'audio': sph_path,
            'text-0': transcript_by_name[f'{conversation}A'],
            'text-1': transcript_by_name[f'{conversation}B'],
        }

    groups = [_group_for(sph_path) for sph_path in sph_files]

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=path_depth)
        for group in groups
    )

    def _all_segments():
        # One pass per (conversation, channel) pair, in the same order as the recordings.
        for group, recording in zip(groups, recordings):
            for channel in [0, 1]:
                yield from make_segments(
                    transcript_path=group[f'text-{channel}'],
                    recording=recording,
                    channel=channel,
                    omit_silence=omit_silence
                )

    supervisions = SupervisionSet.from_segments(_all_segments())

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(target_dir / 'recordings.json')
        supervisions.to_json(target_dir / 'supervisions.json')

    return {'recordings': recordings, 'supervisions': supervisions}
예제 #2
0
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the 1997 English Broadcast News corpus.

    Three manifests are produced: the recordings, the segment-level supervisions, and the
    section-level supervisions (the latter can be used e.g. for topic segmentation).
    SGML transcripts are paired with recordings positionally, i.e. the n-th ``*.sgml`` file
    is processed together with the n-th recording.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When ``False``, recordings are created with ``relative_path_depth=3``;
        when ``True``, no relative path depth is applied.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    sph_files = check_and_rglob(audio_dir, "*.sph")
    sgml_files = check_and_rglob(transcripts_dir, "*.sgml")

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(sph, relative_path_depth=path_depth) for sph in sph_files
    )

    # BeautifulSoup renders tags to strings recursively, which is quite inefficient;
    # on some systems the recursion limit has to be raised for this to work.
    parsed_files = []
    with recursion_limit(5000):
        for sgml, recording in zip(sgml_files, recordings):
            parsed_files.append(make_supervisions(sgml, recording))

    def _collect(kind):
        # Flatten the per-file supervisions of the requested kind into one set.
        return SupervisionSet.from_segments(
            supervision for parsed in parsed_files for supervision in parsed[kind]
        )

    section_supervisions = _collect("sections")
    segment_supervisions = _collect("segments")

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(target_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(target_dir / "broadcast-news_sections_all.jsonl.gz")
        segment_supervisions.to_file(target_dir / "broadcast-news_segments_all.jsonl.gz")

    manifests = {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
    return manifests
예제 #3
0
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome English corpus.
    We create two manifests: one with recordings (built from every ``*.sph`` file under
    ``audio_dir``), and one with the supervisions read from the ``fullref.rttm`` file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory that contains ``fullref.rttm``.
        If not provided, it is obtained from ``download_callhome_metadata()``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``Pathlike`` admits plain strings, which do not support the ``/`` operator.
    rttm_path = Path(rttm_dir) / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path) for p in tqdm(audio_paths)
    )

    # Reconcile the two manifests before validating them
    # (NOTE(review): exact semantics live in the helpers, which are defined elsewhere).
    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
예제 #4
0
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome American English portion.
    We create two manifests: one with recordings, and the other one with the
    supervisions read from the ``fullref.rttm`` file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory that contains ``fullref.rttm``.
        If not provided, it is obtained from ``download_callhome_metadata()``.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``Pathlike`` admits plain strings, which do not support the ``/`` operator.
    rttm_path = Path(rttm_dir) / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    relative_path_depth = None if absolute_paths else 4
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=relative_path_depth)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
예제 #5
0
def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the DIHARD III corpus, one group per part (``dev`` and ``eval``).

    For each part whose directory exists we collect the ``*.flac`` recordings, create
    supervisions from the ``*.rttm`` file whose stem equals the recording id (combined with
    that recording's entry in ``recordings.tbl``), and optionally create UEM supervisions
    from the matching ``*.uem`` file. Parts whose directory is missing are skipped with a warning.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12), e.g.
        /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02), e.g.
        /data/corpora/LDC/LDC2021E02
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM segments (see use in
        dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory for recordings
    :return: A dict keyed by part name (``'dev'``, ``'eval'``). Each value is a dict with the keys
        ``{'recordings', 'supervisions'}`` plus ``'uem'`` when ``uem_manifest`` is set.
    """

    def _annotation_for(candidates, recording_id):
        # First annotation file whose stem equals the recording id.
        return [path for path in candidates if path.stem == recording_id][0]

    audio_dir_by_part = {"dev": dev_audio_dir, "eval": eval_audio_dir}

    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = audio_dir_by_part[part]
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue

        rttm_paths = list(check_and_rglob(audio_dir, "*.rttm"))
        uem_paths = list(check_and_rglob(audio_dir, "*.uem"))

        recordings = RecordingSet.from_dir(audio_dir, "*.flac", num_jobs=num_jobs)

        # Per-recording metadata comes from the first "recordings.tbl" found.
        metadata_table = list(check_and_rglob(audio_dir, "recordings.tbl"))[0]
        metadata = parse_metadata(metadata_table)

        def _rttm_segments():
            for recording in recordings:
                yield from make_rttm_segments(
                    rttm_path=_annotation_for(rttm_paths, recording.id),
                    recording=recording,
                    metadata=metadata[recording.id],
                )

        supervisions = SupervisionSet.from_segments(_rttm_segments())

        if uem_manifest:

            def _uem_segments():
                for recording in recordings:
                    yield from make_uem_segments(
                        uem_path=_annotation_for(uem_paths, recording.id),
                        recording=recording,
                    )

            uem = SupervisionSet.from_segments(_uem_segments())

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")

        part_manifests = {"recordings": recordings, "supervisions": supervisions}
        if uem_manifest:
            part_manifests["uem"] = uem
        manifests[part] = part_manifests

    return manifests
예제 #6
0
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    Recordings whose id is listed in the files behind ``TEST_FILE_URLS`` (fetched over the
    network) form the ``dev`` part; all remaining recordings form the ``train`` part.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora; must have as many
        entries as ``audio_dirs``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
        When false, recordings are created with ``relative_path_depth=3``.
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict keyed by part (``'train'``, ``'dev'``); each value is a dict with
        the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we key them by stem to avoid adding
    # them twice (the last occurrence wins).
    audio_paths = {
        p.stem: p
        for audio_dir in audio_dirs
        for ext in ["*.wav", "*.flac"]
        for p in check_and_rglob(audio_dir, ext, strict=False)
    }
    transcript_paths = chain.from_iterable(
        [check_and_rglob(transcript_dir, "*.tdf") for transcript_dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")

    relative_path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=relative_path_depth)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # A set gives O(1) membership tests for the per-item filters below, and the
    # ``with`` block makes sure every HTTP response is closed after reading.
    test_ids = set()
    for url in TEST_FILE_URLS:
        with urlopen(url) as response:
            test_ids.update(line.decode("utf-8").strip() for line in response)

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id in test_ids),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in test_ids),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
예제 #7
0
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    Audio files whose directory layout marks them as neither ``scripted`` nor
    ``spontaneous`` are skipped with a warning, so that recordings and supervisions
    stay in one-to-one correspondence.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
        When false, recordings are created with ``relative_path_depth=3``.
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir)

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels: each line is "<path> <label>", keyed by the path's stem.
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, "r") as f:
            for line in f:
                path, label = line.strip().split()
                verification[Path(path).stem] = int(label)

    # Read prompted transcriptions: each non-empty line is '<prompt id> "<text>"'.
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            line = line.strip()
            if line:
                prompt, text = line.split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    relative_path_depth = None if absolute_paths else 3

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):

        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        if utt_type not in ("scripted", "spontaneous"):
            # Without this guard the supervision would silently reuse the text of
            # the previous utterance (or fail with an unbound name on the first file).
            logging.warning(
                f"Skipping {p}: unexpected utterance type '{utt_type}' "
                f"(expected 'scripted' or 'spontaneous')."
            )
            continue

        recording = Recording.from_file(p, relative_path_depth=relative_path_depth)
        recordings.append(recording)

        if utt_type == "scripted":
            text = prompts[prompt]
            custom = {
                "type": utt_type,
                # None when the utterance has no entry in the verification files.
                "verification_label": verification.get(uttid),
            }
        else:
            text = read_text(
                corpus_dir / "trans" / utt_type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": utt_type}

        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / "recordings.json")
        manifests["supervisions"].to_json(output_dir / "supervisions.json")

    return manifests
예제 #8
0
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the Callhome Egyptian Arabic Corpus.

    For each of the ``train``, ``devtest`` and ``evaltest`` splits we create a recording
    manifest from the ``*.sph`` files and a supervision manifest from the romanized
    ``*.txt`` transcripts, where each non-empty line has the form
    ``<start> <end> <speaker>: <text>``.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_root = Path(audio_dir)
    transcript_root = Path(transcript_dir)
    path_depth = None if absolute_paths else 4

    def _segments_from(transcript_path):
        # Yield one supervision per transcript line with a positive duration.
        recording_id = transcript_path.stem
        idx = 0
        for raw_line in transcript_path.read_text().splitlines():
            line = raw_line.strip()
            if not line:
                continue
            # example line:
            # 19.33 21.18 B: %ah Tayyib
            start, end, spk, text = line.split(maxsplit=3)
            spk = spk.replace(":", "")
            duration = float(Decimal(end) - Decimal(start))
            if duration <= 0:
                continue
            yield SupervisionSegment(
                id=f"{recording_id}_{idx}",
                recording_id=recording_id,
                start=float(start),
                duration=duration,
                speaker=f"{recording_id}_{spk}",
                text=text,
            )
            idx += 1

    manifests = {}
    for split in ["train", "devtest", "evaltest"]:
        # The LDC distribution has a typo in the audio directory name.
        audio_split_dir = audio_root / "callhome/arabic" / split.replace("evaltest", "evltest")
        audio_paths = check_and_rglob(audio_split_dir, "*.sph")
        recordings = RecordingSet.from_recordings(
            Recording.from_file(sph, relative_path_depth=path_depth)
            for sph in tqdm(audio_paths))

        transcript_split_dir = (
            transcript_root / f"callhome_arabic_trans_970711/transcrp/{split}/roman"
        )
        transcript_paths = check_and_rglob(transcript_split_dir, "*.txt")

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        segments = [
            segment
            for transcript_path in transcript_paths
            for segment in _segments_from(transcript_path)
        ]
        supervisions = SupervisionSet.from_segments(segments)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
예제 #9
0
def get_paths(fold_path_and_pattern: Tuple[Pathlike, str]) -> List[Path]:
    """
    Call ``check_and_rglob`` with the arguments packed in a single tuple.

    Taking one tuple argument lets the function be used where only a single
    argument per item can be passed.

    :param fold_path_and_pattern: The positional arguments for ``check_and_rglob``,
        i.e. a ``(path, pattern)`` pair.
    :return: Whatever ``check_and_rglob`` returns for those arguments.
    """
    matched_paths = check_and_rglob(*fold_path_and_pattern)
    return matched_paths
예제 #10
0
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_dir.
    We create two manifests: one with recordings, and the other one with text supervisions.

    Intermediate manifests are written to ``recordings_notfixed.jsonl.gz`` and
    ``supervisions_notfixed.jsonl.gz`` inside ``output_dir``; when one of these files already
    exists it is reused instead of being rebuilt. The fixed and validated manifests are
    written to ``recordings.jsonl.gz`` and ``supervisions.jsonl.gz``.

    :param corpus_dir: Path to Fisher corpus
    :param output_dir: Directory where the manifests are written (created if missing).
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :param num_jobs: Number of worker processes used to create the recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: When one of ``audio_dirs`` / ``transcript_dirs`` is not a directory
        inside ``corpus_dir``.
    """

    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for workdir in audio_dirs + transcript_dirs:
        if not (corpus_dir / workdir).is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'.")

    # ``Path.iterdir()`` already yields paths prefixed with the directory being listed,
    # so they are used as-is; re-joining them onto their parent would duplicate the
    # prefix whenever ``corpus_dir`` is a relative path.
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_dir / audio_dir).iterdir():
            audio_subdir_paths.extend((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_subdir_paths.extend(
            (corpus_dir / transcript_dir / "data" / "trans").iterdir())

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    # Map each session (first column of the call-data table) to the values found
    # in columns 5 and 10 for sides A and B respectively.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            # The first row is skipped (header).
            rows = [
                line.rstrip("\n").split(",")
                for line in sessions_data_f.readlines()
            ][1:]
        sessions.update({row[0]: {"A": row[5], "B": row[10]} for row in rows})

    assert len(transcript_paths) == len(
        audio_paths), f"{len(transcript_paths)} == {len(audio_paths)}"
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files indicates there are only {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info("Building fresh recording manifest")
        create_recordings_input = [(p, None if absolute_paths else 5)
                                   for p in audio_paths]
        err_recos = 0
        with ProcessPoolExecutor(num_jobs) as executor, \
                RecordingSet.open_writer(recs_path) as writer:
            with tqdm(total=len(create_recordings_input),
                      desc="Collect recordings") as pbar:
                for reco in executor.map(create_recording,
                                         create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(f"Out of {len(create_recordings_input)} recordings, "
                          f"{err_recos} had errors and were omitted.")
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        logging.info(f"Using existing supervision manifest at {sups_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info("Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        # ``os.cpu_count()`` may return None when the count cannot be determined.
        num_threads = (os.cpu_count() or 1) * 4
        with ThreadPoolExecutor(num_threads) as executor, \
                SupervisionSet.open_writer(sups_path) as writer:
            with tqdm(total=len(create_supervisions_input),
                      desc="Create supervisions") as pbar:
                for tmp_supervisions in executor.map(
                        create_supervision, create_supervisions_input):
                    if tmp_supervisions:
                        for s in tmp_supervisions:
                            writer.write(s)
                    else:
                        # An empty (or missing) result counts as a failed transcript.
                        err_sups += 1
                    pbar.update()
        supervisions = writer.open_manifest()
        # Report supervision errors based on the supervision counter; the recording
        # counter does not even exist when the recording manifest was reused.
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted.")

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
예제 #11
0
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests per split: one with recordings, and the other one
    with text supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name (``'evaltest'``, ``'train'``,
        ``'devtest'``); each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    relative_path_depth = None if absolute_paths else 4

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p, relative_path_depth=relative_path_depth)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            utterance_lines = _merge_callhome_continuation_lines(
                p.read_text().splitlines())
            for idx, line in enumerate(utterance_lines):
                # line format: "<start> <end> <speaker>: <text>"
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=float(start),
                        duration=float(Decimal(end) - Decimal(start)),
                        # Speaker "A" maps to channel 0, "B" to channel 1, ...
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests


def _merge_callhome_continuation_lines(raw_lines):
    """
    Return the utterance lines of a transcript, with wrapped lines merged.

    Blank lines and lines starting with ``#`` are dropped. A line that does not
    parse as ``<start> <end> <speaker> <text>`` is treated as the continuation
    of the previous kept utterance and appended to it (separated by a space);
    such a line is ignored when there is no previous utterance to attach it to.
    Utterances with a non-positive duration are dropped.
    """
    merged = []
    for line in raw_lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            start, end, _spk, _text = line.split(maxsplit=3)
            duration = float(Decimal(end) - Decimal(start))
        except (InvalidOperation, ValueError):
            # Fewer than four fields, or non-numeric timestamps: a wrapped line.
            if merged:
                merged[-1] = merged[-1] + " " + line
            continue
        if duration <= 0:
            continue
        merged.append(line)
    return merged
예제 #12
0
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora. Must have the same
        number of entries as ``audio_dirs``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When ``False``, recordings are created with
        ``relative_path_depth=3``; when ``True`` (the default), with ``relative_path_depth=None``.
    :return: A dict keyed by split name (``'train'`` and ``'test'``). Each value is a dict
        with the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora, so we key the paths by file stem: a stem
    # seen again simply overwrites the earlier entry and is not added twice.
    audio_paths = {
        p.stem: p
        for audio_dir in audio_dirs
        for ext in ['*.wav', '*.flac']
        for p in check_and_rglob(audio_dir, ext, strict=False)
    }
    transcript_paths = list(
        chain.from_iterable(
            check_and_rglob(transcript_dir, '*.tdf')
            for transcript_dir in transcript_dirs
        )
    )

    logging.info("Preparing recordings manifest")
    relative_path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=relative_path_depth)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Recordings whose IDs are listed in the module-level ``TEST`` collection
    # form the test split; everything else goes to train.
    manifests = defaultdict(dict)
    manifests['test'] = {
        'recordings': recordings.filter(lambda r: r.id in TEST),
        'supervisions': supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests['train'] = {
        'recordings': recordings.filter(lambda r: r.id not in TEST),
        'supervisions':
        supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            manifests[part]["recordings"].to_json(
                output_dir / f'recordings_{part}.json')
            manifests[part]["supervisions"].to_json(
                output_dir / f'supervisions_{part}.json')

    return manifests
예제 #13
0
def prepare_fisher_spanish(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher Spanish.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises AssertionError: When the numbers of transcripts, sessions and audio files differ.
    """
    audio_dir_path = Path(audio_dir_path)
    transcript_dir_path = Path(transcript_dir_path)

    audio_paths = check_and_rglob(audio_dir_path, "*.sph")
    transcript_paths = check_and_rglob(transcript_dir_path, "*.tdf")

    # The first ``*_call.tbl`` file found is a comma-separated table with a header row.
    sessions_data_path = check_and_rglob(transcript_dir_path, "*_call.tbl")[0]
    with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
        session_rows = [
            line.rstrip("\n").split(",") for line in sessions_data_f.readlines()
        ][1:]
    # Column 0 keys the session; columns 2 and 8 are stored under keys 0 and 1
    # (presumably per-channel speaker info -- verify against ``create_supervision``).
    sessions = {row[0]: {0: row[2], 1: row[8]} for row in session_rows}

    assert len(transcript_paths) == len(sessions) == len(audio_paths), (
        f"Mismatched counts: {len(transcript_paths)} transcripts, "
        f"{len(sessions)} sessions, {len(audio_paths)} audio files"
    )

    # ``os.cpu_count()`` may return None when the count cannot be determined.
    num_workers = (os.cpu_count() or 1) * 4

    create_recordings_input = [(p, None if absolute_paths else 4) for p in audio_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        # ``executor.map`` yields results in input order.
        recordings = list(
            tqdm(
                executor.map(create_recording, create_recordings_input),
                total=len(create_recordings_input),
                desc="Collect recordings",
            )
        )
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        supervisions_per_file = list(
            tqdm(
                executor.map(create_supervision, create_supervisions_input),
                total=len(create_supervisions_input),
                desc="Create supervisions",
            )
        )
    # Flatten the per-transcript results and drop segments with non-positive duration.
    supervisions = SupervisionSet.from_segments(
        list(it.chain.from_iterable(supervisions_per_file))
    ).filter(lambda s: s.duration > 0.0)

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
예제 #14
0
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    Parts whose manifests are already available in the ``output_dir`` are read from there
    instead of being prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
        It is applied to the ``train`` part only.
    :param buck_walter: Bool, use BuckWalter transliteration. When False, the ``dev`` and ``test``
        transcripts are converted with ``from_buck_walter``.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    # Supervision counts asserted for the Kaldi-style evaluation parts.
    expected_num_supervisions = {"test": 5365, "dev": 5002}

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe the manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="mgb2",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        if part in expected_num_supervisions:
            # dev/test: assemble a Kaldi-style data dir under ``output_dir / part``
            # from the non-overlapping-speech files and load it.
            part_dir = output_dir / part
            part_dir.mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                part_dir / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                part_dir / "segments",
            )
            # Rewrite the ``wav/`` paths so that they point into the corpus directory.
            wav_prefix = f"{corpus_dir}/{part}/wav/"
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                part_dir / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    entry = line.rstrip("\n")
                    if not entry.strip():
                        continue
                    # Lines read from the file already carry their newline; emit
                    # exactly one per entry so no blank lines end up in wav.scp.
                    f_out.write(entry.replace("wav/", wav_prefix) + "\n")

            recordings, supervisions, _ = load_kaldi_data_dir(part_dir, 16000)
            if not buck_walter:
                supervisions = supervisions.transform_text(from_buck_walter)
            expected = expected_num_supervisions[part]
            assert (
                len(supervisions) == expected
            ), f"Expected {expected} supervisions for {part}, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                corpus_dir / part / "wav", pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(corpus_dir / part / "xml" / "utf8", "*.xml")
            # Build the supervisions from the XML transcripts; the recursion limit
            # is raised while ``make_supervisions`` runs.
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        make_supervisions(p, mer_thresh) for p in xml_paths
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            if text_cleaning:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file(output_dir / f"mgb2_recordings_{part}.jsonl.gz")
        supervisions.to_file(output_dir / f"mgb2_supervisions_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
예제 #15
0
def prepare_fisher_english(
    corpus_path: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: When one of ``audio_dirs`` / ``transcript_dirs`` is not a directory
        inside ``corpus_path``.
    :raises AssertionError: When the numbers of transcripts, sessions and audio files differ.
    """

    corpus_path = Path(corpus_path)

    for workdir in audio_dirs + transcript_dirs:
        if not (corpus_path / workdir).is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_path}'."
            )

    # ``Path.iterdir()`` already yields paths that include the parent directory,
    # so they must not be joined onto the parent again: for a relative
    # ``corpus_path`` that would duplicate the prefix.
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_path / audio_dir).iterdir():
            audio_subdir_paths.extend((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_path / transcript_dir / "data" / "trans"
        transcript_subdir_paths.extend(transcript_dir_path.iterdir())

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    # Each transcript package has a comma-separated ``*_calldata.tbl`` table with
    # a header row under ``doc``; the first matching file is used.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_path / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            session_rows = [
                line.rstrip("\n").split(",")
                for line in sessions_data_f.readlines()
            ][1:]
        # Column 0 keys the session; columns 5 and 10 are stored under "A" and "B"
        # (presumably per-side speaker info -- verify against ``create_supervision``).
        sessions.update(
            {row[0]: {"A": row[5], "B": row[10]} for row in session_rows}
        )

    assert len(transcript_paths) == len(sessions) == len(audio_paths), (
        f"Mismatched counts: {len(transcript_paths)} transcripts, "
        f"{len(sessions)} sessions, {len(audio_paths)} audio files"
    )

    # ``os.cpu_count()`` may return None when the count cannot be determined.
    num_workers = (os.cpu_count() or 1) * 4

    create_recordings_input = [(p, None if absolute_paths else 5)
                               for p in audio_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        # ``executor.map`` yields results in input order.
        recordings = list(
            tqdm(
                executor.map(create_recording, create_recordings_input),
                total=len(create_recordings_input),
                desc="Collect recordings",
            )
        )
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        supervisions_per_file = list(
            tqdm(
                executor.map(create_supervision, create_supervisions_input),
                total=len(create_supervisions_input),
                desc="Create supervisions",
            )
        )
    # Flatten the per-transcript results into a single supervision set.
    supervisions = SupervisionSet.from_segments(
        list(it.chain.from_iterable(supervisions_per_file))
    )

    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "recordings.jsonl.gz")
        supervisions.to_file(output_dir / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
예제 #16
0
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.
    For each of the ``train``, ``devtest`` and ``evaltest`` splits we create two manifests:
    one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to the audio package; ``*.sph`` files are searched for under
        ``callhome/arabic/<split>`` inside it.
    :param transcript_dir: Path to the transcripts package; ``*.txt`` files are searched for under
        ``callhome_arabic_trans_970711/transcrp/<split>/roman`` inside it.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict keyed by split name. Each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' /
            split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f'callhome_arabic_trans_970711/transcrp/{split}/roman', '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The transcript file stem doubles as the recording ID.
            recording_id = p.stem
            # Running per-file counter used to build unique segment IDs.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimal so the duration is computed on the exact
                # decimal values before converting to float.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    # Skip empty or inverted segments.
                    continue
                supervisions.append(
                    SupervisionSegment(id=f'{recording_id}_{idx}',
                                       recording_id=recording_id,
                                       start=float(start),
                                       duration=duration,
                                       speaker=f'{recording_id}_{spk}',
                                       text=text))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings,
                                                       supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests