Python CutSet.open_writer 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: lhotse

클래스/타입: CutSet

메소드/함수: open_writer

hotexamples.com에서의 예제들: 2

Python CutSet.open_writer - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 lhotse.CutSet.open_writer에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

from_json(30)

from_cuts(30)

from_file(19)

from_manifests(15)

from_jsonl_lazy(5)

from_webdataset(4)

CutSet(3)

from_jsonl(2)

from_yaml(2)

mix(2)

open_writer(2)

pad(2)

sort_by_duration(2)

compute_global_feature_stats(1)

from_dicts(1)

from_arrow(1)

mux(1)

예제 #1

파일 보기

파일: test_serialization.py 프로젝트: jimbozhang/lhotse

def test_sequential_jsonl_writer_overwrite(overwrite):
    cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    half = cuts.split(num_splits=2)[0]
    with NamedTemporaryFile(suffix='.jsonl') as jsonl_f:
        # Store the first half
        half.to_file(jsonl_f.name)

        # Open sequential writer
        with CutSet.open_writer(jsonl_f.name, overwrite=overwrite) as writer:
            if overwrite:
                assert all(not writer.contains(id_) for id_ in half.ids)
            else:
                assert all(writer.contains(id_) for id_ in half.ids)

예제 #2

파일 보기

def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]
    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)