def main():
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print("Please create a place on your system to put the downloaded "
              "Aishell data and add it to `corpus_dirs`")
        sys.exit(1)

    num_jobs = min(15, os.cpu_count())  # assumed default: the snippet used `num_jobs` without defining it

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                # Augment the training set with 0.9x and 1.1x speed perturbation.
                cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                           cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings']
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
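# All of the preparation scripts in this section enter feature extraction through the
# shared `get_executor` context manager, whose definition is not shown. A minimal
# sketch of what such a helper might look like follows; the cluster backend
# (Dask `distributed`) and the worker count are assumptions, not the actual helper.
# Yielding None is the important contract: `compute_and_store_features` then spins
# up its own local process pool sized by `num_jobs`.
from contextlib import contextmanager


@contextmanager
def get_executor():
    # Hypothetical sketch of the shared helper used by the scripts above and below.
    try:
        from distributed import Client, LocalCluster
    except ImportError:
        # No cluster library available: yield None so that
        # compute_and_store_features falls back to a local process pool.
        yield None
        return
    # Assumed worker count; a real deployment would detect the cluster it runs on.
    with LocalCluster(n_workers=8) as cluster, Client(cluster) as client:
        yield client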
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))

    output_dir = Path('exp/data')
    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                           cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings']
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
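# `get_parser` is defined elsewhere in these recipes. A minimal sketch consistent
# with the flags this script reads (only `--num-jobs`; the default value here is an
# assumption). The LibriSpeech variant further below additionally reads
# `args.full_libri`, which would need a boolean flag such as `--full-libri`.
import argparse


def get_parser():
    # Hypothetical sketch of the argument parser assumed by the script above.
    parser = argparse.ArgumentParser(description="Data preparation.")
    parser.add_argument(
        "--num-jobs", type=int, default=15,  # assumed default
        help="Number of parallel jobs for manifest preparation and "
             "feature extraction when no executor is available.")
    return parser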
def test_lazy_cuts_combine_split_issue():
    # Regression test: this sequence of lazy operations should complete without raising.
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    with TemporaryDirectory() as d, NamedTemporaryFile(suffix=".jsonl.gz") as f:
        cuts.to_file(f.name)
        f.flush()
        cuts_lazy = load_manifest_lazy(f.name)
        cuts_lazy = combine(cuts_lazy, cuts_lazy.perturb_speed(0.9))
        cuts_lazy.split_lazy(d, chunk_size=100)
def test_combine_lazy(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with as_lazy(DummyManifest(manifest_type, begin_id=0, end_id=68)) as part1, \
            as_lazy(DummyManifest(manifest_type, begin_id=68, end_id=136)) as part2, \
            as_lazy(DummyManifest(manifest_type, begin_id=136, end_id=200)) as part3:
        combined = combine(part1, part2, part3)
        # Equivalent under iteration
        assert list(combined) == list(expected)
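# `as_lazy` is a test helper whose definition is not shown. A plausible sketch,
# assuming the same `load_manifest_lazy` entry point the previous test uses: it
# round-trips an eager manifest through a temporary compressed JSONL file and
# yields a lazily-opened view, so the test exercises the lazy code path.
from contextlib import contextmanager
from tempfile import NamedTemporaryFile

from lhotse import load_manifest_lazy


@contextmanager
def as_lazy(manifest):
    # Hypothetical sketch: write the manifest out, then reopen it lazily.
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        manifest.to_file(f.name)
        f.flush()
        # load_manifest_lazy opens the file without reading it all into memory.
        yield load_manifest_lazy(f.name)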
def main():
    corpus_dir = locate_corpus(
        (Path('/mnt/cfs2/asr/database/AM/aishell'),
         Path('/root/fangjun/data/aishell'),
         Path('/home/storage04/zhuangweiji/data/open-source-data/SLR33-aishell/data')),
        msg='Please specify the directory to the AIShell dataset')
    musan_dir = locate_corpus(
        (Path('/export/corpora5/JHU/musan'),
         Path('/export/common/data/corpora/MUSAN/musan'),
         Path('/root/fangjun/data/musan')),
        msg='Please specify the directory to the MUSAN dataset')

    num_jobs = min(15, os.cpu_count())  # assumed default: the snippet used `num_jobs` without defining it

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                           cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings']
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
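# `locate_corpus` factors out the directory search that the first AIShell script
# inlined by hand. Note the call sites differ slightly: this script passes a tuple
# of paths plus a `msg` keyword, while the LibriSpeech and GigaSpeech scripts below
# pass the paths as positional varargs. A sketch covering both call styles; the
# exact signature is an assumption:
import sys
from pathlib import Path
from typing import Optional


def locate_corpus(*corpus_dirs, msg: Optional[str] = None) -> Path:
    # Hypothetical sketch: accept either a single tuple/list of candidate paths
    # or the paths as varargs, and return the first one that exists.
    if len(corpus_dirs) == 1 and isinstance(corpus_dirs[0], (tuple, list)):
        corpus_dirs = corpus_dirs[0]
    for d in corpus_dirs:
        if Path(d).is_dir():
            return Path(d)
    print(msg or f"Please specify the corpus directory; none of {corpus_dirs} exist.")
    sys.exit(1)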
def main():
    args = get_parser().parse_args()
    if args.full_libri:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100', 'train-clean-360',
                         'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'),
        Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'))
    musan_dir = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                           cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings']
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
def main():
    full_libri = False  # assumed flag: set True to also prepare the 360h and 500h training subsets
    num_jobs = min(15, os.cpu_count())  # assumed default: the snippet used `num_jobs` without defining it

    if full_libri:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100',
                         'train-clean-360', 'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech')
    ]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print("Please create a place on your system to put the downloaded "
              "Librispeech data and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                           cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings']
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(f"{partition} already exists - skipping feature extraction.")
            else:
                # Note: this step makes the recipe different from LibriSpeech:
                # We must filter out some utterances and remove punctuation
                # to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(f"{partition} already exists - skipping cutting into sub-segments.")
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk,
                    # we'll load it. This helps us avoid re-computing the features for
                    # different variants of context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)

                # Note: this step makes the recipe different from LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts
                # with plenty of supervisions. We cut these into smaller chunks
                # centered around each supervision, possibly adding acoustic context.
                print(f"About to split {partition} raw cuts into smaller chunks.")
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None
                    if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )

                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    # `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    # data augmentation and feature computation for long recordings yet.
                    # Therefore, we sacrifice some storage for the ability to precompute
                    # features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether
                # it needs to load the raw cuts from disk or not.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part["recordings"]
                                   for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_musan",
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_file(musan_cuts_path)
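# The GigaSpeech script additionally relies on `has_no_oov`, `normalize_text`, and
# `get_context_suffix`, none of which are shown. GigaSpeech transcripts carry
# punctuation and non-speech markers as inline tags; a sketch of what these helpers
# might look like follows. The exact tag sets and the suffix format are assumptions.
import re

# Assumed tag sets for non-speech/garbage and punctuation markers.
OOV_PATTERN = re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>")
PUNCT_PATTERN = re.compile(r"<(COMMA|PERIOD|QUESTIONMARK|EXCLAMATIONPOINT)>")
WHITESPACE_PATTERN = re.compile(r"\s\s+")


def has_no_oov(sup) -> bool:
    # Keep only supervisions whose text contains no garbage/OOV tags.
    return OOV_PATTERN.search(sup.text) is None


def normalize_text(text: str) -> str:
    # Strip punctuation tags and collapse the whitespace left behind.
    return WHITESPACE_PATTERN.sub(" ", PUNCT_PATTERN.sub("", text)).strip()


def get_context_suffix(args) -> str:
    # Encode the context-window settings in the cut manifest filename so that
    # different context variants can coexist on disk (format is an assumption).
    if args.context_window is None or args.context_window <= 0.0:
        return ""
    return f"_{args.context_direction}{args.context_window}"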