def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
    cuts = self.test_cuts()
    is_list = isinstance(cuts, list)
    test_loaders = []
    if not is_list:
        cuts = [cuts]

    for cuts_test in cuts:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=(
                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
                else PrecomputedFeatures()
            ),
            return_cuts=True,
        )
        sampler = SingleCutSampler(cuts_test, max_duration=self.args.max_duration)
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(test, batch_size=None, sampler=sampler, num_workers=1)
        test_loaders.append(test_dl)

    if is_list:
        return test_loaders
    else:
        return test_loaders[0]
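# A usage sketch for the method above; `dm` (the datamodule instance that
# defines test_dataloaders) and the batch handling are hypothetical
# placeholders, not part of the original code.
dls = dm.test_dataloaders()
for test_dl in (dls if isinstance(dls, list) else [dls]):
    for batch in test_dl:
        feats = batch["inputs"]               # (N, T, F) feature tensor
        supervisions = batch["supervisions"]  # texts, start_frame, num_frames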
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        ))
    sampler = SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]
        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        # The precomputed and on-the-fly features are different due to
        # mixing in time/fbank domains and lilcom compression.
        assert norm_diff < 0.01 * norm_pc
        # Check that the supervision boundaries are the same.
        assert (batch_pc["supervisions"]["start_frame"]
                == batch_otf["supervisions"]["start_frame"]).all()
        assert (batch_pc["supervisions"]["num_frames"]
                == batch_otf["supervisions"]["num_frames"]).all()
def valid_dataloaders(self) -> DataLoader:
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()
    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        # Drop the precomputed features so they are re-extracted on the fly.
        cuts_valid = cuts_valid.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_valid,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_valid)
    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = DataLoader(
        validate,
        sampler=valid_sampler,
        batch_size=None,
        num_workers=2,
        persistent_workers=True,
    )
    return valid_dl
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))
    output_dir = Path('exp/data')

    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(
                    part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
def test_extract_and_store_features(cut):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = cut.compute_and_store_features(extractor=extractor,
                                                        storage=storage)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
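# Why 100 frames: with Lhotse's default 25 ms window and 10 ms frame shift,
# a 1.0 s cut yields duration / frame_shift frames. A quick sanity check of
# that arithmetic (the duration here is assumed, not read from the fixture):
duration, frame_shift = 1.0, 0.01
assert round(duration / frame_shift) == 100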
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    mixed_cut = cut.append(cut)
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = mixed_cut.compute_and_store_features(
            extractor=extractor,
            storage=storage,
            mix_eagerly=mix_eagerly)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 200
        assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
def _with_features(
    self, cut: MonoCut, frame_shift: Seconds, sampling_rate: int
) -> MonoCut:
    d = TemporaryDirectory()
    self.dirs.append(d)
    extractor = Fbank(
        config=FbankConfig(sampling_rate=sampling_rate, frame_shift=frame_shift)
    )
    with LilcomHdf5Writer(d.name) as storage:
        return cut.compute_and_store_features(extractor, storage=storage)
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json("test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(recordings=recordings)

        assert len(feature_set) == 6

        feature_infos = list(feature_set)

        # Assert the properties shared by all features
        for features in feature_infos:
            # assert that fbank is the default feature type
            assert features.type == "kaldi-fbank"
            # assert that duration is always a multiple of frame_shift
            assert features.num_frames == round(features.duration / features.frame_shift)
            # assert that num_features is preserved
            assert features.num_features == builder.feature_extractor.config.num_filters
            # assert that the storage type metadata matches
            assert features.storage_type == storage.name
            # assert that the metadata is consistent with the data shapes
            arr = features.load()
            assert arr.shape[0] == features.num_frames
            assert arr.shape[1] == features.num_features
            # assert that the stored features are the same as the "freshly extracted" features
            recording = recordings[features.recording_id]
            expected = extractor.extract(
                samples=recording.load_audio(channels=features.channels),
                sampling_rate=recording.sampling_rate,
            )
            np.testing.assert_almost_equal(arr, expected, decimal=2)

        # Assert the properties for recordings of duration 0.5 seconds
        for features in feature_infos[:2]:
            assert features.num_frames == 50
            assert features.duration == 0.5

        # Assert the properties for recordings of duration 1.0 seconds
        for features in feature_infos[2:]:
            assert features.num_frames == 100
            assert features.duration == 1.0
def valid_dataloaders(self) -> DataLoader:
    self.validate_args()
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()

    transforms = []
    if self.args.concatenate_cuts:
        transforms = [
            CutConcatenate(duration_factor=self.args.duration_factor,
                           gap=self.args.gap)
        ] + transforms

    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80)),
                num_workers=8),
            return_cuts=self.args.return_cuts,
        )
    else:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            return_cuts=self.args.return_cuts,
        )
    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
        shuffle=False,
    )
    logging.info("About to create dev dataloader")
    # valid_dl = DataLoader(
    #     validate,
    #     sampler=valid_sampler,
    #     batch_size=None,
    #     num_workers=8,
    #     persistent_workers=True,
    # )
    valid_dl = LhotseDataLoader(
        validate,
        sampler=valid_sampler,
        num_workers=2,
    )
    return valid_dl
def __init__(
    self,
    lang_dir: Pathlike,
    scripted_model_path: Optional[Pathlike] = None,
    model_dir: Optional[Pathlike] = None,
    average_epochs: Sequence[int] = (7, 8, 9),
    device: torch.device = 'cpu',
    sampling_rate: int = 16000,
):
    # Normalize the device argument; self.device must be set even when a
    # torch.device (rather than a string) is passed in.
    if isinstance(device, str):
        device = torch.device(device)
    self.device = device
    self.sampling_rate = sampling_rate
    self.extractor = Fbank(FbankConfig(num_mel_bins=80))
    self.lexicon = Lexicon(lang_dir)
    phone_ids = self.lexicon.phone_symbols()
    self.P = create_bigram_phone_lm(phone_ids)

    if model_dir is not None:
        # Read the model from regular checkpoints; assume it's a Conformer.
        self.model = Conformer(num_features=80,
                               num_classes=len(phone_ids) + 1,
                               num_decoder_layers=0)
        self.P.scores = torch.zeros_like(self.P.scores)
        self.model.P_scores = torch.nn.Parameter(self.P.scores.clone(),
                                                 requires_grad=False)
        average_checkpoint(
            filenames=[model_dir / f'epoch-{n}.pt' for n in average_epochs],
            model=self.model)
    elif scripted_model_path is not None:
        # Read the model from a serialized TorchScript module; no assumptions needed.
        self.model = torch.jit.load(scripted_model_path)
    else:
        raise ValueError(
            "One of scripted_model_path or model_dir needs to be provided.")

    # Freeze the params by default.
    for p in self.model.parameters():
        p.requires_grad_(False)

    self.compiler = MmiTrainingGraphCompiler(lexicon=self.lexicon,
                                             device=self.device)
    self.HLG = k2.Fsa.from_dict(torch.load(lang_dir / 'HLG.pt')).to(self.device)
def main():
    args = get_parser().parse_args()
    dataset_parts = ('devtest', 'test', 'train')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/mnt/corpora/LDC2006S37/data'),
    )
    output_dir = Path('exp/data')

    print('Heroico manifest preparation:')
    transcripts_dir = Path.joinpath(corpus_dir, 'transcripts')
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in heroico_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            heroico_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
def main():
    args = get_parser().parse_args()
    corpus_dir = locate_corpus(
        Path("/export/corpora5/AMI/amicorpus"),
    )
    annotations_dir = Path("/export/c07/draj")
    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")

    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in ami_manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests["recordings"],
                supervisions=manifests["supervisions"],
            ).cut_into_windows(duration=5)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else min(80, len(cut_set)),
                executor=ex,
                storage_type=LilcomHdf5Writer,
            ).pad(duration=5.0)
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
def test_on_the_fly_feature_extraction_unsupervised_dataset(libri_cut_set):
    ref_dataset = UnsupervisedDataset()
    tested_dataset = DynamicUnsupervisedDataset(
        feature_extractor=Fbank(FbankConfig(num_mel_bins=40)),
    )
    out = ref_dataset[libri_cut_set]
    ref_feats = out["features"]
    tested_feats = tested_dataset[libri_cut_set]
    # Note: comparison to 1 decimal fails.
    # I'm assuming this is due to lilcom's compression.
    # Pytest output looks like the following:
    # E   Mismatched elements: 4 / 23000 (0.0174%)
    # E   Max absolute difference: 0.46469784
    # E   Max relative difference: 0.6171043
    # E    x: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E             [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E             [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    # E    y: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E             [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E             [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    np.testing.assert_array_almost_equal(ref_feats, tested_feats, decimal=0)
def test_extract_and_store_features_from_cut_set(cut_set, executor, num_jobs,
                                                 storage_type, mix_eagerly):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir:
        cut_set_with_feats = cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=tmpdir,
            num_jobs=num_jobs,
            mix_eagerly=mix_eagerly,
            executor=executor() if executor else None,
            storage_type=storage_type,
        ).sort_by_duration()  # sort by duration to ensure the same order of cuts

        # The same number of cuts
        assert len(cut_set_with_feats) == 2

        for orig_cut, feat_cut in zip(cut_set, cut_set_with_feats):
            # The ID is retained
            assert orig_cut.id == feat_cut.id
            # Features were attached
            assert feat_cut.has_features
            # Recording is retained unless mixing a MixedCut eagerly
            should_have_recording = not (mix_eagerly and isinstance(orig_cut, MixedCut))
            assert feat_cut.has_recording == should_have_recording

        cuts = list(cut_set_with_feats)

        arr = cuts[0].load_features()
        assert arr.shape[0] == 300
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)

        arr = cuts[1].load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)
                     frame_length=frame_length,
                     frame_shift=frame_shift) == expected_num_frames)


def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_filters=40, sampling_rate=8000)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(MfccConfig(sampling_rate=8000)), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)
    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        mixer = FeatureMixer(
def train_dataloaders(self) -> DataLoader:
    logging.info("About to get train cuts")
    cuts_train = self.train_cuts()

    logging.info("About to get Musan cuts")
    cuts_musan = load_manifest(self.args.feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if self.args.concatenate_cuts:
        logging.info(f'Using cut concatenation with duration factor '
                     f'{self.args.duration_factor} and gap {self.args.gap}.')
        # Cut concatenation should be the first transform in the list,
        # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
        transforms = [
            CutConcatenate(duration_factor=self.args.duration_factor,
                           gap=self.args.gap)
        ] + transforms

    input_transforms = [
        SpecAugment(num_frame_masks=2,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100)
    ]

    train = K2SpeechRecognitionDataset(
        cut_transforms=transforms,
        input_transforms=input_transforms,
        return_cuts=True,
    )

    if self.args.on_the_fly_feats:
        # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
        # # Add on-the-fly speed perturbation; since originally it would have increased epoch
        # # size by 3, we will apply prob 2/3 and use 3x more epochs.
        # # Speed perturbation probably should come first before concatenation,
        # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
        # Drop feats to be on the safe side.
        cuts_train = cuts_train.drop_features()
        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
            input_transforms=input_transforms,
            return_cuts=True,
        )

    if self.args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
            num_buckets=self.args.num_buckets)
    else:
        logging.info('Using SingleCutSampler.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
        )

    logging.info("About to create train dataloader")
    train_dl = DataLoader(
        train,
        sampler=train_sampler,
        batch_size=None,
        num_workers=4,
        persistent_workers=True,
    )
    return train_dl
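# A minimal sketch of how these dataloaders might be consumed in a training
# loop; `dm`, `model`, `criterion`, and `optimizer` are hypothetical
# placeholders, not defined by the snippets in this section.
train_dl = dm.train_dataloaders()
valid_dl = dm.valid_dataloaders()
for batch in train_dl:
    feats = batch["inputs"]               # (N, T, F) fbank features
    supervisions = batch["supervisions"]  # texts and frame boundaries
    loss = criterion(model(feats), supervisions)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()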
def main():
    args = get_parser().parse_args()

    model_type = args.model_type
    epoch = args.epoch
    max_duration = args.max_duration
    avg = args.avg
    att_rate = args.att_rate

    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')

    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids

    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    if avg == 1:
        checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt')
        load_checkpoint(checkpoint, model)
    else:
        checkpoints = [
            os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt')
            for avg_epoch in range(epoch - avg, epoch)
        ]
        average_checkpoint(checkpoints, model)

    model.to(device)
    model.eval()

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(L=L,
                          G=G,
                          H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    logging.debug("Convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)

    # load dataset
    feature_dir = Path('exp/data')
    test_sets = ['test-clean', 'test-other']
    for test_set in test_sets:
        logging.info(f'* DECODING: {test_set}')

        logging.debug("About to get test cuts")
        cuts_test = load_manifest(feature_dir / f'cuts_{test_set}.json.gz')

        logging.debug("About to create test dataset")
        from lhotse.dataset.input_strategies import OnTheFlyFeatures
        from lhotse import Fbank, FbankConfig
        test = K2SpeechRecognitionDataset(
            cuts_test,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))))
        sampler = SingleCutSampler(cuts_test, max_duration=max_duration)

        logging.debug("About to create test dataloader")
        test_dl = torch.utils.data.DataLoader(test,
                                              batch_size=None,
                                              sampler=sampler,
                                              num_workers=1)

        logging.debug("About to decode")
        results = decode(dataloader=test_dl,
                         model=model,
                         device=device,
                         HLG=HLG,
                         symbols=symbol_table)

        recog_path = exp_dir / f'recogs-{test_set}.txt'
        store_transcripts(path=recog_path, texts=results)
        logging.info(f'The transcripts are stored in {recog_path}')

        # compute WER
        dists = [edit_distance(r, h) for r, h in results]
        errors = {
            key: sum(dist[key] for dist in dists)
            for key in ['sub', 'ins', 'del', 'total']
        }
        total_words = sum(len(ref) for ref, _ in results)
        # Print Kaldi-like message:
        # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
        logging.info(
            f'[{test_set}] %WER {errors["total"] / total_words:.2%} '
            f'[{errors["total"]} / {total_words}, '
            f'{errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]')
def _with_features(self, cut: Cut, frame_shift: Seconds) -> Cut:
    d = TemporaryDirectory()
    self.dirs.append(d)
    extractor = Fbank(config=FbankConfig(frame_shift=frame_shift))
    with LilcomFilesWriter(d.name) as storage:
        return cut.compute_and_store_features(extractor, storage=storage)
def main():
    args = get_parser().parse_args()

    if args.full_libri:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100', 'train-clean-360', 'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'),
        Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'))
    musan_dir = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )
    output_dir = Path('exp/data')

    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(
                    part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
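# A sketch of reading back the manifests written by the script above; the
# partition name is an example and assumes the same exp/data layout.
from lhotse import load_manifest

cuts_train = load_manifest('exp/data/cuts_train-clean-100.json.gz')
print(f'Loaded {len(cuts_train)} cuts')  # a CutSet supports len(), iteration, filtering, etc.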
                     frame_length=frame_length,
                     frame_shift=frame_shift) == expected_num_frames)


def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_mel_bins=40)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)
    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        mixer = FeatureMixer(
def test_extract_features(cut):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    arr = cut.compute_features(extractor=extractor)
    assert arr.shape[0] == 100
    assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
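# A standalone sketch of Fbank extraction from raw samples, without a Cut;
# the sine input is synthetic and the config mirrors the test above.
import numpy as np
from lhotse import Fbank, FbankConfig

sr = 8000
t = np.linspace(0, 1, sr, dtype=np.float32)
samples = np.sin(2 * np.pi * 440.0 * t).reshape(1, -1)  # 1 s of a 440 Hz tone
extractor = Fbank(FbankConfig(sampling_rate=sr))
feats = extractor.extract(samples, sampling_rate=sr)
# With the default 10 ms frame shift, 1 s of audio yields about 100 frames.
assert feats.shape[1] == extractor.feature_dim(sr)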
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )
    output_dir = Path("exp/data")

    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = output_dir / f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz"

            if raw_cuts_path.is_file():
                print(f"{partition} already exists - skipping feature extraction.")
            else:
                # Note this step makes the recipe different than LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(has_no_oov)

                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set
                               + cut_set.perturb_speed(0.9)
                               + cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(f"{partition} already exists - skipping cutting into sub-segments.")
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)

                # Note this step makes the recipe different than LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with plenty
                # of supervisions. We cut these into smaller chunks centered around each
                # supervision, possibly adding acoustic context.
                print(f"About to split {partition} raw cuts into smaller chunks.")
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )

                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    # `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    # data augmentation and feature computation for long recordings yet.
                    # Therefore, we sacrifice some storage for the ability to precompute
                    # features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

            # Remove cut_set so the next iteration can correctly infer whether it needs to
            # load the raw cuts from disk or not.
            del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(
                    part["recordings"] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_musan",
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_file(musan_cuts_path)