def test_sequential_jsonl_writer_overwrite(overwrite):
    """Verify how ``overwrite`` affects what a sequential JSONL writer already "contains".

    A manifest holding the first half of a dummy ``CutSet`` is stored on disk,
    then a writer is opened on the same path:

    * with ``overwrite`` truthy, none of the stored IDs may be reported by
      ``writer.contains``;
    * otherwise, every stored ID must be reported by ``writer.contains``.
    """
    all_cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    parts = all_cuts.split(num_splits=2)
    first_half = parts[0]

    with NamedTemporaryFile(suffix=".jsonl") as tmp_file:
        manifest_path = tmp_file.name

        # Pre-populate the file so the writer has something to discover (or discard).
        first_half.to_file(manifest_path)

        with CutSet.open_writer(manifest_path, overwrite=overwrite) as writer:
            if not overwrite:
                # Existing entries are kept: each stored ID is known to the writer.
                assert all(writer.contains(cut_id) for cut_id in first_half.ids)
            else:
                # Existing entries are dropped: no stored ID is known to the writer.
                assert not any(writer.contains(cut_id) for cut_id in first_half.ids)
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet, CutSet]]]:
    """Prepare recording, supervision and cut manifests for the GigaSpeech corpus.

    For every requested subset, the utterances are parsed (optionally in
    parallel), fixed and validated, and written incrementally to
    ``gigaspeech_{recordings,supervisions,cuts}_<part>.jsonl.gz`` files inside
    ``output_dir``. Subsets for which manifests already exist in ``output_dir``
    are not processed again.

    :param corpus_dir: Path to the GigaSpeech corpus; passed to
        ``speechcolab``'s ``GigaSpeech`` class.
    :param output_dir: Directory where the manifests are written; it is created
        (including parents) when missing. Although annotated as optional, a
        real path is required because the manifests are always written to disk.
    :param dataset_parts: ``"auto"`` to process the ``XL``, ``DEV`` and ``TEST``
        subsets, a single subset name, or a sequence of subset names.
    :param num_jobs: Number of jobs forwarded to ``parallel_map`` for parsing
        the utterances.
    :return: A dict keyed by subset name; each value is a dict with the keys
        ``"recordings"``, ``"supervisions"`` and ``"cuts"`` holding lazily
        opened manifests.
    :raises ImportError: When the optional ``speechcolab`` dependency is not
        installed.
    :raises TypeError: When ``output_dir`` is ``None``.
    """
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    # Fail early with a clear message: ``Path(None)`` would raise a much less
    # helpful TypeError, and only after the corpus object had been constructed.
    if output_dir is None:
        raise TypeError(
            "prepare_gigaspeech() requires 'output_dir' to be a path, got None."
        )

    # Normalize the requested parts into a sequence of subset names.
    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]

    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    # The normalized ``subsets`` is passed here (not the raw ``dataset_parts``, which may be
    # the string "auto" or a single subset name) so that the cache lookup uses the same
    # subset names as the processing loop below.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        # All three manifests are written sequentially, one recording at a time,
        # instead of being accumulated and stored at the end.
        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    # The subset name is wrapped in curly braces before being
                    # handed to ``GigaSpeech.audios``.
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )

                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )

                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        # Re-open the freshly written files lazily for the returned mapping.
        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)