def main():
    """Prepare AIShell and MUSAN manifests, then extract and store Fbank features.

    For every AIShell partition a ``CutSet`` is built from its recordings and
    supervisions; partitions whose name contains ``'train'`` additionally get
    0.9x and 1.1x speed-perturbed copies. Features are computed and the cuts
    are written to ``exp/data/cuts_<partition>.json.gz``. Partitions whose cuts
    file already exists are skipped.

    MUSAN recordings are cut into 10 s windows, only windows longer than 5 s
    are kept, and the resulting cuts are written to
    ``exp/data/cuts_musan.json.gz`` unless that file already exists.

    Exits with status 1 when none of the candidate corpus directories exists.
    ``num_jobs`` is not defined in this function; it is read from the
    enclosing module.
    """
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    # No early exit: when several candidates exist, the last one wins.
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # When an executor is specified, make more partitions;
                # otherwise use the locally configured job count. This is the
                # same rule the MUSAN step below applies.
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=Fbank(),
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
def main():
    """Build MLS and MUSAN manifests and store 80-bin Fbank features for both.

    Each MLS partition is turned into a ``CutSet`` (with 0.9x / 1.1x speed
    perturbation for partitions whose name contains ``'train'``), features are
    extracted, and the cuts are saved as ``exp/data/cuts_<partition>.json.gz``.
    Existing cuts files are left untouched. MUSAN is then cut into 10 s
    windows, windows longer than 5 s are kept, and features are stored for
    them unless ``exp/data/cuts_musan.json.gz`` is already present.
    """
    args = get_parser().parse_args()

    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    mls_root = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_root = locate_corpus(Path('/mnt/corpora/musan'))
    output_dir = Path('exp/data')

    print('mls manifest preparation:')
    mls_manifests = prepare_mls(
        corpus_dir=mls_root,
        output_dir=output_dir,
        opus=False,
        num_jobs=args.num_jobs,
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_root,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise'),
    )

    print('Feature extraction:')
    fbank = Fbank(FbankConfig(num_mel_bins=80))
    # A single executor is shared by every feature-extraction call below.
    with get_executor() as executor:
        for partition, part_manifests in mls_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            cuts = cuts.compute_and_store_features(
                extractor=fbank,
                storage_path=f'{output_dir}/feats_{partition}',
                # with an executor the work is split into more partitions
                num_jobs=80 if executor is not None else args.num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            mls_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')

        # MUSAN: 10 s windows, keeping those longer than 5 s.
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            musan_recordings = combine(
                part['recordings'] for part in musan_manifests.values()
            )
            musan_windows = CutSet.from_manifests(
                recordings=musan_recordings
            ).cut_into_windows(10.0)
            musan_windows = musan_windows.filter(lambda c: c.duration > 5)
            musan_cuts = musan_windows.compute_and_store_features(
                extractor=fbank,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=80 if executor is not None else args.num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_json(musan_cuts_path)
def main():
    """Prepare three LibriSpeech parts and store Fbank features for each.

    The corpus root is the last existing entry of a hard-coded candidate list;
    the process exits with status 1 when none exists. For every partition
    without an existing ``exp/data/cuts_<partition>.json.gz`` a ``CutSet`` is
    built (speed-perturbed at 0.9x / 1.1x for ``train`` partitions), features
    are written through a ``LilcomFilesWriter`` and the cuts are saved.
    ``num_jobs`` is read from the enclosing module.
    """
    dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'),
    ]
    existing_dirs = [candidate for candidate in corpus_dirs if os.path.exists(candidate)]
    if not existing_dirs:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)
    # Later candidates take precedence over earlier ones.
    corpus_dir = existing_dirs[-1]

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    librispeech_manifests = prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )

    print('Feature extraction:')
    # One executor serves all partitions.
    with get_executor() as executor:
        for partition, part_manifests in librispeech_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            cuts = cuts.compute_and_store_features(
                extractor=Fbank(),
                executor=executor,
                storage=LilcomFilesWriter(f'{output_dir}/feats_{partition}'),
            )
            librispeech_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')
def main():
    """Prepare Heroico manifests and store Fbank features for every partition.

    Transcripts are read from the ``transcripts`` sub-directory of the located
    corpus. Features use 80 mel bins and a 0.02 frame shift. Partitions whose
    ``exp/data/cuts_<partition>.json.gz`` already exists are skipped;
    ``train`` partitions get 0.9x / 1.1x speed-perturbed copies.
    """
    args = get_parser().parse_args()

    dataset_parts = ('devtest', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/LDC2006S37/data'))
    output_dir = Path('exp/data')

    print('Heroico manifest preparation:')
    transcripts_dir = corpus_dir.joinpath('transcripts')
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    fbank = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    # The executor is created once and reused for every partition.
    with get_executor() as executor:
        for partition, part_manifests in heroico_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            cuts = cuts.compute_and_store_features(
                extractor=fbank,
                storage_path=f'{output_dir}/feats_{partition}',
                # with an executor the work is split into more partitions
                num_jobs=80 if executor is not None else args.num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            heroico_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')
def main():
    """Download and prepare AMI (SDM) and store Fbank features on 5 s windows.

    For every partition without an existing
    ``exp/data/cuts_<partition>.json.gz`` the recordings are cut into 5 s
    windows, 80-bin Fbank features are computed, the cuts are padded to 5.0 s
    and then saved.
    """
    args = get_parser().parse_args()

    corpus_dir = locate_corpus(Path("/export/corpora5/AMI/amicorpus"))
    annotations_dir = Path("/export/c07/draj")
    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")
    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    fbank = Fbank(FbankConfig(num_mel_bins=80))
    # One executor is shared across all partitions.
    with get_executor() as executor:
        for partition, part_manifests in ami_manifests.items():
            partition_cuts_path = output_dir / f"cuts_{partition}.json.gz"
            if partition_cuts_path.is_file():
                print(f"{partition} already exists - skipping.")
                continue

            print("Processing", partition)
            all_cuts = CutSet.from_manifests(
                recordings=part_manifests["recordings"],
                supervisions=part_manifests["supervisions"],
            )
            windows = all_cuts.cut_into_windows(duration=5)

            # With an executor, use more partitions (never more than there are cuts).
            if executor is None:
                jobs = args.num_jobs
            else:
                jobs = min(80, len(windows))

            with_features = windows.compute_and_store_features(
                extractor=fbank,
                storage_path=f"{output_dir}/feats_{partition}",
                num_jobs=jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            padded = with_features.pad(duration=5.0)
            padded.to_json(output_dir / f"cuts_{partition}.json.gz")
def main():
    """Prepare AIShell manifests and store Fbank features for each partition.

    The corpus root is the last existing entry of ``corpus_dirs``; the process
    exits with status 1 when none exists. Partitions that already have
    ``exp/data/cuts_<partition>.json.gz`` are skipped; ``train`` partitions are
    extended with 0.9x / 1.1x speed-perturbed copies before features are
    written through a ``LilcomFilesWriter``.
    """
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    # Scan from the end so the last existing candidate is selected.
    corpus_dir = next(
        (candidate for candidate in reversed(corpus_dirs) if os.path.exists(candidate)),
        None,
    )
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir, output_dir=output_dir)

    print('Feature extraction:')
    # The executor is set up once for the whole run.
    with get_executor() as executor:
        for partition, part_manifests in aishell_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            feature_storage = LilcomFilesWriter(f'{output_dir}/feats_{partition}')
            cuts = cuts.compute_and_store_features(
                extractor=Fbank(),
                executor=executor,
                storage=feature_storage,
            )
            aishell_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')
def export_to_kaldi(recordings: RecordingSet, supervisions: SupervisionSet,
                    output_dir: Pathlike):
    """
    Write a Kaldi data directory from a ``RecordingSet`` / ``SupervisionSet`` pair.

    Only single-channel recordings with exactly one ``AudioSource`` are
    accepted. The two manifests must be compatible, i.e. a ``CutSet`` can be
    created from them.

    Files written, in order: ``wav.scp``, ``segments``, ``text``, ``utt2spk``,
    ``utt2dur``, ``reco2dur``, plus ``utt2lang`` / ``utt2gender`` when every
    supervision has a language / gender.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    assert all(len(r.sources) == 1 for r in recordings), (
        "Kaldi export of Recordings with multiple audio sources "
        "is currently not supported."
    )
    assert all(r.num_channels == 1 for r in recordings), (
        "Kaldi export of multi-channel Recordings is currently "
        "not supported."
    )

    # One cut per supervision links each utterance to its recording.
    cuts = CutSet.from_manifests(
        recordings=recordings, supervisions=supervisions
    ).trim_to_supervisions()

    def write_per_utterance(filename, value_of):
        # Map every utterance (supervision) id to ``value_of(cut)`` and save it.
        save_kaldi_text_mapping(
            data={cut.supervisions[0].id: value_of(cut) for cut in cuts},
            path=output_dir / filename,
        )

    # wav.scp -- command sources are written as a pipe ("<command> |").
    wav_scp = {}
    for recording in recordings:
        for source in recording.sources:
            if source.type == 'command':
                wav_scp[recording.id] = f'{source.source} |'
            else:
                wav_scp[recording.id] = source.source
    save_kaldi_text_mapping(data=wav_scp, path=output_dir / 'wav.scp')

    write_per_utterance(
        'segments', lambda cut: f'{cut.recording_id} {cut.start} {cut.end}')
    write_per_utterance('text', lambda cut: cut.supervisions[0].text)
    write_per_utterance('utt2spk', lambda cut: cut.supervisions[0].speaker)
    write_per_utterance('utt2dur', lambda cut: cut.duration)

    # reco2dur
    reco2dur = {}
    for recording in recordings:
        reco2dur[recording.id] = recording.duration
    save_kaldi_text_mapping(data=reco2dur, path=output_dir / 'reco2dur')

    # Optional files are written only when the attribute is set everywhere.
    if all(s.language is not None for s in supervisions):
        write_per_utterance('utt2lang', lambda cut: cut.supervisions[0].language)
    if all(s.gender is not None for s in supervisions):
        write_per_utterance('utt2gender', lambda cut: cut.supervisions[0].gender)
def memmap_raw_audio(wav_scp, f_memmapped, utt_list, dtype=np.float32, sampling_rate=16000, do_normalize=True):
    '''
        Maps the audio referenced by a Kaldi wav.scp file to a memory mapped
        numpy object. This allows for fast i/o when creating windowed
        minibatches from slices of training data.

        The Kaldi data directory is taken to be the directory containing
        ``wav_scp``. If it has no ``reco2dur`` file,
        ``./utils/data/get_reco2dur.sh`` is run on it first. lhotse is imported
        lazily and pip-installed if the import fails.

        input args:
            wav_scp: path to the wav.scp file; only its parent directory is used.
            f_memmapped: path of the memory mapped file to create (mode 'w+').
            utt_list: collection of utterance (supervision) ids to include;
                only used for membership tests.
            dtype: dtype of the memory mapped array.
            sampling_rate: passed to ``kaldi.load_kaldi_data_dir``.
            do_normalize: if True, each recording is mean/variance normalized
                before its utterances are copied.

        output:
            utt_lens = {b'utt_n': number of samples stored for utt_n, ...}
            offsets = {b'utt_n': offset (in samples) of utt_n in the memory
                       mapped numpy file}
            data_shape = (total #samples computed from durations, 1)

        Keys of ``utt_lens`` and ``offsets`` are the utterance ids encoded to
        bytes with ``str.encode()``.
    '''
    import os
    dataset = os.path.dirname(wav_scp)
    print(dataset)
    if not os.path.exists(os.path.join(dataset, 'reco2dur')):
        # NOTE(review): the script's output and exit status are captured but
        # never inspected (`out` is unused).
        p = subprocess.Popen(['./utils/data/get_reco2dur.sh', dataset], stdout=subprocess.PIPE)
        out = p.communicate()

    # Import lhotse and install if not available
    try:
        from lhotse import kaldi, CutSet
    except ImportError:
        from pip._internal import main as pip
        pip(['install', 'lhotse'])
        from lhotse import kaldi, CutSet

    from lhotse.utils import compute_num_samples

    data = kaldi.load_kaldi_data_dir(dataset, sampling_rate)
    # presumably data[0] is the RecordingSet and data[1] the SupervisionSet
    # -- TODO confirm against lhotse's return order.
    cuts = CutSet.from_manifests(data[0], data[1])
    dim = 1

    # First pass: nominal length (in samples) of every requested utterance,
    # needed to size the memory mapped file up front.
    utt_lens = {}
    for cut in cuts:
        sr = cut.recording.sampling_rate
        for sup in cut.supervisions:
            if sup.id not in utt_list:
                continue
            utt_lens[sup.id.encode()] = compute_num_samples(sup.duration, sr)
    data_shape = (sum(utt_lens.values()), dim)

    f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape)
    offsets = {}
    offset = 0
    # Second pass: load each recording once and copy its requested utterances
    # back to back into the memory mapped array.
    for cut in cuts:
        # assumes load_audio() returns (channels, samples), so .T is
        # (samples, channels) with one channel -- TODO confirm
        x_ = cut.recording.load_audio().T
        # Mean and variance normalize
        # NOTE(review): there is no guard against x_.std() == 0.
        if do_normalize:
            x = (x_ - x_.mean()) / x_.std()
        else:
            x = x_
        sr = cut.recording.sampling_rate
        for i, supervision in enumerate(cut.supervisions):
            k = supervision.id
            # Printed for every supervision, including those skipped below.
            print('Utterance ', i, ' : ', k, ' : ', sr)
            start, dur = supervision.start, supervision.duration  # `dur` is unused
            if k not in utt_list:
                continue
            start_sample = compute_num_samples(start, sr)
            end_sample = start_sample + utt_lens[k.encode()]
            m = x[start_sample:end_sample]
            offsets[k.encode()] = offset
            # The slice can be shorter than the nominal length if the audio
            # ends early, so the stored length is replaced by the real one.
            # data_shape is NOT recomputed, so the file may then have unused
            # rows at the end.
            utt_lens[k.encode()] = m.shape[0]
            new_offset = offset + utt_lens[k.encode()]
            f[offset:new_offset, :] = m
            offset = new_offset
    print()
    # Dropping the last reference to the memmap (presumably to flush it to
    # disk -- TODO confirm).
    del f
    return utt_lens, offsets, data_shape
def main():
    """Prepare AIShell and MUSAN manifests, then extract and store Fbank features.

    Corpus locations are resolved with ``locate_corpus`` from lists of
    candidate directories. For every AIShell partition a ``CutSet`` is built;
    partitions whose name contains ``'train'`` additionally get 0.9x and 1.1x
    speed-perturbed copies. Features are computed and the cuts are written to
    ``exp/data/cuts_<partition>.json.gz``; partitions whose cuts file already
    exists are skipped.

    MUSAN recordings are cut into 10 s windows, only windows longer than 5 s
    are kept, and the resulting cuts are written to
    ``exp/data/cuts_musan.json.gz`` unless that file already exists.

    ``num_jobs`` is not defined in this function; it is read from the
    enclosing module.
    """
    corpus_dir = locate_corpus(
        (Path('/mnt/cfs2/asr/database/AM/aishell'),
         Path('/root/fangjun/data/aishell'),
         Path(
             '/home/storage04/zhuangweiji/data/open-source-data/SLR33-aishell/data'
         )),
        msg='Please specify the directory to the AIShell dataset')

    musan_dir = locate_corpus(
        (Path('/export/corpora5/JHU/musan'),
         Path('/export/common/data/corpora/MUSAN/musan'),
         Path('/root/fangjun/data/musan')),
        msg='Please specify the directory to the MUSAN dataset')

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(
        corpus_dir=corpus_dir,
        output_dir=output_dir
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_dir,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise')
    )

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # When an executor is specified, make more partitions;
                # otherwise use the locally configured job count. This is the
                # same rule the MUSAN step below applies.
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(lambda c: c.duration > 5).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            musan_cuts.to_json(musan_cuts_path)
def main():
    """Prepare LibriSpeech and MUSAN manifests and store 80-bin Fbank features.

    With ``--full-libri`` (``args.full_libri``) the 360 h and 500 h training
    sets are prepared in addition to the default parts. Each partition lacking
    ``exp/data/cuts_<partition>.json.gz`` is converted to a ``CutSet``
    (speed-perturbed at 0.9x / 1.1x for ``train`` partitions), features are
    extracted and the cuts saved. MUSAN is cut into 10 s windows, windows
    longer than 5 s are kept, and features are stored unless
    ``exp/data/cuts_musan.json.gz`` exists.
    """
    args = get_parser().parse_args()

    dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                     'train-clean-100')
    if args.full_libri:
        dataset_parts = dataset_parts + ('train-clean-360', 'train-other-500')
    print("Parts we will prepare: ", dataset_parts)

    librispeech_root = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'),
        Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'),
    )
    musan_root = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(
        corpus_dir=librispeech_root,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_root,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise'),
    )

    print('Feature extraction:')
    fbank = Fbank(FbankConfig(num_mel_bins=80))
    # A single executor is shared by every feature-extraction call below.
    with get_executor() as executor:
        for partition, part_manifests in librispeech_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            cuts = cuts.compute_and_store_features(
                extractor=fbank,
                storage_path=f'{output_dir}/feats_{partition}',
                # with an executor the work is split into more partitions
                num_jobs=80 if executor is not None else args.num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            librispeech_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')

        # MUSAN: 10 s windows, keeping those longer than 5 s.
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            musan_recordings = combine(
                part['recordings'] for part in musan_manifests.values()
            )
            musan_windows = CutSet.from_manifests(
                recordings=musan_recordings
            ).cut_into_windows(10.0)
            musan_windows = musan_windows.filter(lambda c: c.duration > 5)
            musan_cuts = musan_windows.compute_and_store_features(
                extractor=fbank,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=80 if executor is not None else args.num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_json(musan_cuts_path)
# Script fragment: extract Fbank features for every LibriSpeech partition,
# save the resulting cuts, then build datasets and print one training sample.
# Relies on names defined earlier in the file (`use_data_augmentation`,
# `librispeech_manifests`, `output_dir`, `augmenter`).
# NOTE(review): the nesting below was reconstructed from collapsed text --
# confirm which statements belong to the `else:` and `with` bodies.
if use_data_augmentation:
    num_jobs = 1
else:
    num_jobs = os.cpu_count()
    # Restrict torch to one intra-op and one inter-op thread.
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)

# NOTE(review): this unconditionally overrides the value chosen above, so the
# process pool below always has a single worker.
num_jobs = 1
for partition, manifests in librispeech_manifests.items():
    print(partition)
    # Feature storage and the process pool are opened per partition and
    # closed when the block exits.
    with LilcomFilesWriter(f'{output_dir}/feats_{partition}'
                           ) as storage, ProcessPoolExecutor(num_jobs) as ex:
        cut_set = CutSet.from_manifests(
            recordings=manifests['recordings'],
            supervisions=manifests['supervisions']).compute_and_store_features(
                extractor=Fbank(),
                storage=storage,
                # Augmentation is applied only to partitions whose name contains 'train'.
                augmenter=augmenter if 'train' in partition else None,
                executor=ex)
    librispeech_manifests[partition]['cuts'] = cut_set
    # `+` requires `output_dir` to be a str here -- presumably it is; a
    # pathlib.Path would not support this concatenation. TODO confirm.
    cut_set.to_json(output_dir + f'/cuts_{partition}.json.gz')

cuts_train = SpeechRecognitionDataset(
    librispeech_manifests['train-clean-100']['cuts'])
# `cuts_test` is not used in the visible part of the file.
cuts_test = SpeechRecognitionDataset(
    librispeech_manifests['test-clean']['cuts'])

# Inspect the first training example.
sample = cuts_train[0]
print('Transcript:', sample['text'])
print('Supervisions mask:', sample['supervisions_mask'])
# NOTE(review): `sample` is indexed like a mapping above but called like an
# object here -- verify that it really provides `load_features()`.
print('Feature matrix:', sample.load_features())
def main():
    """Prepare LibriSpeech and MUSAN manifests and store Fbank features.

    ``full_libri`` and ``num_jobs`` are read from the enclosing module. The
    corpus root is the last existing entry of a hard-coded candidate list; the
    process exits with status 1 when none exists. Partitions that already have
    ``exp/data/cuts_<partition>.json.gz`` are skipped; ``train`` partitions get
    0.9x / 1.1x speed-perturbed copies. MUSAN is cut into 10 s windows, windows
    longer than 5 s are kept, and features are stored unless
    ``exp/data/cuts_musan.json.gz`` exists.
    """
    dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    if full_libri:
        dataset_parts = dataset_parts + ('train-clean-360', 'train-other-500')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'),
    ]
    existing_dirs = [candidate for candidate in corpus_dirs if os.path.exists(candidate)]
    if not existing_dirs:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)
    # Later candidates take precedence over earlier ones.
    corpus_dir = existing_dirs[-1]

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir='/export/corpora5/JHU/musan',
        output_dir=output_dir,
        parts=('music', 'speech', 'noise'),
    )

    print('Feature extraction:')
    # A single executor is shared by every feature-extraction call below.
    with get_executor() as executor:
        for partition, part_manifests in librispeech_manifests.items():
            partition_cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if partition_cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue

            print('Processing', partition)
            cuts = CutSet.from_manifests(
                recordings=part_manifests['recordings'],
                supervisions=part_manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cuts.perturb_speed(0.9)
                faster = cuts.perturb_speed(1.1)
                cuts = cuts + slower + faster

            cuts = cuts.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # with an executor the work is split into more partitions
                num_jobs=80 if executor is not None else num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            librispeech_manifests[partition]['cuts'] = cuts
            cuts.to_json(output_dir / f'cuts_{partition}.json.gz')

        # MUSAN: 10 s windows, keeping those longer than 5 s.
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            musan_recordings = combine(
                part['recordings'] for part in musan_manifests.values()
            )
            musan_windows = CutSet.from_manifests(
                recordings=musan_recordings
            ).cut_into_windows(10.0)
            musan_windows = musan_windows.filter(lambda c: c.duration > 5)
            musan_cuts = musan_windows.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=80 if executor is not None else num_jobs,
                executor=executor,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_json(musan_cuts_path)
def export_to_kaldi(
    recordings: RecordingSet,
    supervisions: SupervisionSet,
    output_dir: Pathlike,
    map_underscores_to: Optional[str] = None,
):
    """
    Write a Kaldi data directory from a ``RecordingSet`` / ``SupervisionSet`` pair.

    Multi-channel recordings are supported, but every recording must have
    exactly one ``AudioSource``. The two manifests must be compatible, i.e. a
    ``CutSet`` can be created from them.

    When all recordings are single-channel, recording ids are written as-is;
    otherwise each ``wav.scp`` / ``segments`` / ``reco2dur`` recording key is
    suffixed with ``_<channel>``.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    :param map_underscores_to: optional replacement for every underscore in
        supervision ids and speaker names. This helps avoid issues with Kaldi
        data dir sorting.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    assert all(len(r.sources) == 1 for r in recordings), (
        "Kaldi export of Recordings with multiple audio sources "
        "is currently not supported."
    )

    if map_underscores_to is not None:

        def _without_underscores(sup):
            return fastcopy(
                sup,
                id=sup.id.replace("_", map_underscores_to),
                speaker=sup.speaker.replace("_", map_underscores_to),
            )

        supervisions = supervisions.map(_without_underscores)

    # One cut per supervision links each utterance to its recording.
    cuts = CutSet.from_manifests(
        recordings=recordings, supervisions=supervisions
    ).trim_to_supervisions()

    def _save(filename, mapping):
        save_kaldi_text_mapping(data=mapping, path=output_dir / filename)

    def _save_per_utterance(filename, value_of):
        # Keyed by the id of the (single) supervision of every cut.
        _save(filename, {cut.supervisions[0].id: value_of(cut) for cut in cuts})

    if all(r.num_channels == 1 for r in recordings):
        # Purely single-channel data: no channel affix, which keeps backward
        # compatibility and lets an export/import round trip return the same
        # utterances.
        wav_scp = {}
        for recording in recordings:
            for source in recording.sources:
                wav_scp[recording.id] = make_wavscp_channel_string_map(
                    source, sampling_rate=recording.sampling_rate
                )[0]
        _save("wav.scp", wav_scp)

        _save_per_utterance(
            "segments", lambda cut: f"{cut.recording_id} {cut.start} {cut.end}"
        )

        reco2dur = {}
        for recording in recordings:
            reco2dur[recording.id] = recording.duration
        _save("reco2dur", reco2dur)
    else:
        # At least one multi-channel recording: one entry per channel.
        wav_scp = {}
        for recording in recordings:
            for source in recording.sources:
                for channel in source.channels:
                    wav_scp[f"{recording.id}_{channel}"] = make_wavscp_channel_string_map(
                        source, sampling_rate=recording.sampling_rate
                    )[channel]
        _save("wav.scp", wav_scp)

        _save_per_utterance(
            "segments",
            lambda cut: f"{cut.recording_id}_{cut.channel} {cut.start} {cut.end}",
        )

        reco2dur = {}
        for recording in recordings:
            for channel in recording.sources[0].channels:
                reco2dur[f"{recording.id}_{channel}"] = recording.duration
        _save("reco2dur", reco2dur)

    _save_per_utterance("text", lambda cut: cut.supervisions[0].text)
    _save_per_utterance("utt2spk", lambda cut: cut.supervisions[0].speaker)
    _save_per_utterance("utt2dur", lambda cut: cut.duration)

    # Optional files are written only when the attribute is set everywhere.
    if all(s.language is not None for s in supervisions):
        _save_per_utterance("utt2lang", lambda cut: cut.supervisions[0].language)
    if all(s.gender is not None for s in supervisions):
        _save_per_utterance("utt2gender", lambda cut: cut.supervisions[0].gender)
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare GigaSpeech recording, supervision and cut manifests.

    For every requested subset the manifests are streamed to
    ``gigaspeech_{recordings,supervisions,cuts}_<subset>.jsonl.gz`` inside
    ``output_dir`` and returned as lazily-opened manifests. Subsets whose
    manifests already exist on disk are not processed again; they are taken
    from the cache instead.

    :param corpus_dir: path to the GigaSpeech corpus.
    :param output_dir: directory where the manifests are written; it is
        created if missing. Despite the ``Optional`` annotation it is passed
        straight to ``Path(...)``, so ``None`` is not accepted.
    :param dataset_parts: ``"auto"`` for ``("XL", "DEV", "TEST")``, a single
        subset name, or a sequence of subset names.
    :param num_jobs: number of parallel jobs used to parse utterances.
    :return: a dict ``{subset: {"recordings": ..., "supervisions": ..., "cuts": ...}}``.
    :raises ImportError: if the optional ``speechcolab`` package is missing.
    """
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    # Resolve the user-facing argument into the concrete list of subset names.
    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]

    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    # The cache must be queried with the resolved subset names. The raw
    # `dataset_parts` may be the placeholder "auto" or a bare string, in which
    # case subsets skipped below would be missing from the returned dict.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        # Manifests are written incrementally, one recording at a time.
        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        # Re-open what was just written as lazy manifests.
        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
def main():
    """Prepare GigaSpeech and MUSAN data: manifests, cuts and (optionally) features.

    For each GigaSpeech partition two cut manifests are produced in
    ``exp/data``:

    * ``gigaspeech_cuts_<partition>_raw.jsonl.gz`` -- long-recording cuts after
      OOV filtering, text normalization and (for partitions other than
      ``DEV`` / ``TEST``) 0.9x / 1.1x speed perturbation;
    * ``gigaspeech_cuts_<partition><ctx_suffix>.jsonl.gz`` -- the raw cuts
      trimmed to supervisions, shuffled for ``L`` / ``XL``, with features
      attached when ``args.precomputed_features`` is set.

    Either step is skipped when its output file already exists. MUSAN is cut
    into 10 s windows, windows longer than 5 s are kept, and features are
    stored unless ``exp/data/cuts_musan.json.gz`` exists.
    """
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    # Suffix that distinguishes cut manifests built with different context
    # settings (derived from `args` by get_context_suffix).
    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping feature extraction."
                )
            else:
                # Note this step makes the recipe different than LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                # NOTE(review): this mutates the supervision objects yielded by
                # iteration; whether the change persists depends on how the
                # supervision set materializes its items -- verify.
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping cutting into sub-segments."
                )
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note this step makes the recipe different than LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with a plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(
                    f"About to split {partition} raw cuts into smaller chunks."
                )
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    # A non-positive context window means "no minimum duration".
                    min_duration=None
                    if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing manifests in, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    # `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    # data augmentation and feature computation for long recordings yet.
                    # Therefore, we sacrifice some storage for the ability to precompute
                    # features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=
                        f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether it needs to
                # load the raw cuts from disk or not.
                # NOTE(review): assumed to sit inside this `else` branch. If
                # the raw cuts were just created but `cuts_path` already
                # existed, `cut_set` stays bound and the next partition's
                # NameError probe would reuse it -- confirm nesting and intent.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (CutSet.from_manifests(recordings=combine(
                part["recordings"]
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=extractor,
                            storage_path=f"{output_dir}/feats_musan",
                            num_jobs=args.num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer,
                        ))
            musan_cuts.to_file(musan_cuts_path)