def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
    cuts = self.test_cuts()
    is_list = isinstance(cuts, list)
    test_loaders = []
    if not is_list:
        cuts = [cuts]

    for cuts_test in cuts:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=(
                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
                else PrecomputedFeatures()),
            return_cuts=True,
        )
        sampler = SingleCutSampler(cuts_test, max_duration=self.args.max_duration)
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(test, batch_size=None, sampler=sampler, num_workers=1)
        test_loaders.append(test_dl)

    if is_list:
        return test_loaders
    else:
        return test_loaders[0]
def valid_dataloaders(self) -> DataLoader:
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()

    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        # Features are extracted on the fly, so drop any precomputed ones first
        # (dropping them once is enough; no need to call drop_features() again below).
        cuts_valid = cuts_valid.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_valid,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_valid)

    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = DataLoader(
        validate,
        sampler=valid_sampler,
        batch_size=None,
        num_workers=2,
        persistent_workers=True,
    )
    return valid_dl
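# The following is a minimal, hedged sketch (not taken from the snippets above) of the
# pattern they share: a map-style dataset plus a SingleCutSampler that packs cuts up to
# max_duration, with batch_size=None because the sampler already yields whole batches.
# The helper name and the max_duration default are illustrative assumptions; note that
# older lhotse versions pass the CutSet to K2SpeechRecognitionDataset directly, while
# newer ones (as in the two methods above) construct the dataset without cuts.
def make_dev_dataloader_sketch(cuts, max_duration=200.0):
    dataset = K2SpeechRecognitionDataset(return_cuts=True)
    sampler = SingleCutSampler(cuts, max_duration=max_duration, shuffle=False)
    return DataLoader(dataset, sampler=sampler, batch_size=None, num_workers=1)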
def test_e2e_restore_with_dataloader(num_workers):
    dset = DummyDataset()
    # Expecting 10 batches in total.
    sampler = SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True)
    sampler.set_epoch(1)
    # Note: not testing with num_workers > 1 as it will randomize the order of batches.
    dloader = DataLoader(dset, batch_size=None, sampler=sampler, num_workers=num_workers)

    expected_batches = []
    for idx, b in enumerate(dloader):
        if idx == 4:
            # Save the training loop state at step 4 to resume from later.
            state = dloader.sampler.state_dict()
        if idx > 4:
            # Continue iterating to see what batches should be sampled after restoring.
            expected_batches.append(b)

    # Restore the sampler to its state from the dloader.
    restored_sampler = SingleCutSampler(CUTS)
    restored_sampler.load_state_dict(state)

    # Initialize a new dloader with the restored sampler.
    restored_dloader = DataLoader(
        dset, batch_size=None, sampler=restored_sampler, num_workers=num_workers)
    batches = []
    for b in restored_dloader:
        batches.append(b)

    # Check that the results are the same.
    assert len(expected_batches) == 5
    if num_workers == 0:
        assert len(batches) == 5
        assert batches == expected_batches
    else:
        # We "lost" 2 batches due to prefetching (i.e., the sampler's state was ahead
        # by 2 batches and we cannot recover from it for now)
        assert len(batches) == 3
        assert batches == expected_batches[2:]
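# Hedged sketch of how the sampler state-saving exercised by the test above could be
# used in a real training loop; the checkpoint path and helper names are hypothetical.
def save_sampler_state_sketch(dloader, path="sampler_state.pt"):
    # state_dict() records the sampler's settings and how many batches it has already
    # yielded in the current epoch.
    torch.save(dloader.sampler.state_dict(), path)

def restore_sampler_sketch(cuts, path="sampler_state.pt"):
    # A freshly constructed sampler picks up max_duration, shuffling, epoch, and progress
    # from the saved state, so iteration resumes where it stopped (modulo prefetched
    # batches when num_workers > 0, as the test above demonstrates).
    sampler = SingleCutSampler(cuts)
    sampler.load_state_dict(torch.load(path))
    return sampler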
def valid_dataloaders(self) -> DataLoader:
    self.validate_args()
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()

    transforms = []
    if self.args.concatenate_cuts:
        transforms = [
            CutConcatenate(duration_factor=self.args.duration_factor, gap=self.args.gap)
        ] + transforms

    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)),
                                            num_workers=8),
            return_cuts=self.args.return_cuts,
        )
    else:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            return_cuts=self.args.return_cuts,
        )
    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
        shuffle=False,
    )
    logging.info("About to create dev dataloader")
    # valid_dl = DataLoader(
    #     validate,
    #     sampler=valid_sampler,
    #     batch_size=None,
    #     num_workers=8,
    #     persistent_workers=True,
    # )
    valid_dl = LhotseDataLoader(
        validate,
        sampler=valid_sampler,
        num_workers=2,
    )
    return valid_dl
    CutPairsSampler,
    DynamicBucketingSampler,
    SingleCutSampler,
    ZipSampler,
)
from lhotse.dataset.sampling.dynamic import DynamicCutSampler
from lhotse.testing.dummies import DummyManifest

CUTS = DummyManifest(CutSet, begin_id=0, end_id=100)
CUTS_MOD = CUTS.modify_ids(lambda cid: cid + "_alt")

# fmt: off
SAMPLERS_TO_TEST = [
    # Identically initialized SingleCutSampler
    (
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
    ),
    # Differently initialized SingleCutSampler with the same CUTS
    (
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
        SingleCutSampler(CUTS),
    ),
    # Differently initialized CutPairsSampler with the same CUTS
    (
        CutPairsSampler(CUTS, CUTS, max_source_duration=10.0, shuffle=True,
def main():
    args = get_parser().parse_args()

    exp_dir = Path('exp-lstm-adam-mmi-bigram-musan-dist')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    lexicon = Lexicon(lang_dir)
    phone_ids = lexicon.phone_symbols()
    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    checkpoint = os.path.join(exp_dir, f'epoch-{args.epoch}.pt')
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(lexicon.phones)
        first_word_disambig_id = find_first_disambig_symbol(lexicon.words)
        HLG = compile_HLG(L=L, G=G, H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path('exp/data')
    logging.debug("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz')

    logging.info("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=40000)
    logging.info("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler,
                                          num_workers=1)

    # if not torch.cuda.is_available():
    #     logging.error('No GPU detected!')
    #     sys.exit(-1)

    logging.debug("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)
    logging.debug("About to decode")
    results = decode(dataloader=test_dl, model=model, device=device, HLG=HLG,
                     symbols=lexicon.words)

    test_set = 'test-clean'
    recog_path = exp_dir / f'recogs-{test_set}.txt'
    store_transcripts(path=recog_path, texts=results)
    logging.info(f'The transcripts are stored in {recog_path}')

    # The following prints out WERs, per-word error statistics and aligned
    # ref/hyp pairs.
    errs_filename = exp_dir / f'errs-{test_set}.txt'
    with open(errs_filename, 'w') as f:
        wer = write_error_stats(f, f'{test_set}', results)
    logging.info(f'The error stats are stored in {errs_filename}')
def main():
    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 8

    exp_dir = 'exp-lstm-adam-ctc-musan'
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = CtcTrainingGraphCompiler(
        L_inv=L_inv,
        phones=phone_symbol_table,
        words=word_symbol_table
    )
    phone_ids = get_phone_symbols(phone_symbol_table)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train-clean-100.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev-clean.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(
        cuts_train,
        cut_transforms=[
            CutConcatenate(),
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))
        ]
    )
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=90000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train, sampler=train_sampler,
                                           batch_size=None, num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, sampler=valid_sampler,
                                           batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.to(device)
    describe(model)

    learning_rate = 1e-3
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=5e-4)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model, optimizer=optimizer)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(f"epoch = {ckpt['epoch']}, objf = {best_objf}, "
                     f"valid_objf = {best_valid_objf}")

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = 1e-3
        # curr_learning_rate = learning_rate * pow(0.4, epoch)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = curr_learning_rate

        tb_writer.add_scalar('learning_rate', curr_learning_rate, epoch)

        logging.info('epoch {}, learning rate {}'.format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train)

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path, model=model, epoch=epoch,
                            optimizer=None, scheduler=None,
                            learning_rate=curr_learning_rate, objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path, current_epoch=epoch,
                               learning_rate=curr_learning_rate, objf=best_objf,
                               best_objf=best_objf, valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf, best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path, model=model, optimizer=optimizer,
                        scheduler=None, epoch=epoch,
                        learning_rate=curr_learning_rate, objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename, model_path=model_path,
                           current_epoch=epoch, learning_rate=curr_learning_rate,
                           objf=objf, best_objf=best_objf, valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf, best_epoch=best_epoch)

    logging.warning('Done')
def train_dataloaders(self) -> DataLoader:
    logging.info("About to get train cuts")
    cuts_train = self.train_cuts()

    logging.info("About to get Musan cuts")
    cuts_musan = load_manifest(self.args.feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if self.args.concatenate_cuts:
        logging.info(f'Using cut concatenation with duration factor '
                     f'{self.args.duration_factor} and gap {self.args.gap}.')
        # Cut concatenation should be the first transform in the list,
        # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
        transforms = [
            CutConcatenate(duration_factor=self.args.duration_factor, gap=self.args.gap)
        ] + transforms

    input_transforms = [
        SpecAugment(num_frame_masks=2, features_mask_size=27,
                    num_feature_masks=2, frames_mask_size=100)
    ]

    train = K2SpeechRecognitionDataset(
        cut_transforms=transforms,
        input_transforms=input_transforms,
        return_cuts=True,
    )

    if self.args.on_the_fly_feats:
        # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
        # # Add on-the-fly speed perturbation; since originally it would have increased epoch
        # # size by 3, we will apply prob 2/3 and use 3x more epochs.
        # # Speed perturbation probably should come first before concatenation,
        # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
        # Drop feats to be on the safe side.
        cuts_train = cuts_train.drop_features()
        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
            input_transforms=input_transforms,
            return_cuts=True,
        )

    if self.args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
            num_buckets=self.args.num_buckets)
    else:
        logging.info('Using SingleCutSampler.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
        )

    logging.info("About to create train dataloader")
    train_dl = DataLoader(
        train,
        sampler=train_sampler,
        batch_size=None,
        num_workers=4,
        persistent_workers=True,
    )
    return train_dl
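# Hedged sketch isolating the sampler choice made in train_dataloaders() above:
# BucketingSampler groups cuts of similar duration to reduce padding, while
# SingleCutSampler simply fills each batch up to max_duration in (optionally shuffled)
# manifest order. The helper name and defaults are illustrative assumptions.
def build_train_sampler_sketch(cuts, use_bucketing, max_duration, shuffle=True, num_buckets=30):
    if use_bucketing:
        return BucketingSampler(cuts, max_duration=max_duration, shuffle=shuffle,
                                num_buckets=num_buckets)
    return SingleCutSampler(cuts, max_duration=max_duration, shuffle=shuffle)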
def main():
    exp_dir = Path("exp-lstm-adam-ctc-musan")
    setup_logger("{}/log/log-decode".format(exp_dir), log_level="debug")

    # load L, G, symbol_table
    lang_dir = Path("data/lang_nosp")
    symbol_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / "phones.txt")
    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    if not os.path.exists(lang_dir / "HLG.pt"):
        print("Loading L_disambig.fst.txt")
        with open(lang_dir / "L_disambig.fst.txt") as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        print("Loading G.fst.txt")
        with open(lang_dir / "G.fst.txt") as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(
            L=L,
            G=G,
            H=ctc_topo,
            labels_disambig_id_start=first_phone_disambig_id,
            aux_labels_disambig_id_start=first_word_disambig_id,
        )
        torch.save(HLG.as_dict(), lang_dir / "HLG.pt")
    else:
        print("Loading pre-compiled HLG")
        d = torch.load(lang_dir / "HLG.pt")
        HLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path("exp/data")
    print("About to get test cuts")
    cuts_test = CutSet.from_file(feature_dir / "gigaspeech_cuts_TEST.jsonl.gz")

    print("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=100000)
    print("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler,
                                          num_workers=1)

    # if not torch.cuda.is_available():
    #     logging.error('No GPU detected!')
    #     sys.exit(-1)

    print("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device("cuda")
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
    )
    checkpoint = os.path.join(exp_dir, "epoch-7.pt")
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    print("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)
    print("About to decode")
    results = decode(dataloader=test_dl, model=model, device=device, HLG=HLG,
                     symbols=symbol_table)
    s = ""
    for ref, hyp in results:
        s += f"ref={ref}\n"
        s += f"hyp={hyp}\n"
    logging.info(s)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ["sub", "ins", "del", "total"]
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
def main():
    parser = get_parser()
    args = parser.parse_args()

    model_type = args.model_type
    epoch = args.epoch
    avg = args.avg
    att_rate = args.att_rate
    num_paths = args.num_paths
    use_lm_rescoring = args.use_lm_rescoring
    use_whole_lattice = False
    if use_lm_rescoring and num_paths < 1:
        # It doesn't make sense to use n-best list for rescoring
        # when n is less than 1
        use_whole_lattice = True

    output_beam_size = args.output_beam_size

    exp_dir = Path('exp-' + model_type + '-mmi-att-sa-vgg-normlayer')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    logging.info(f'output_beam_size: {output_beam_size}')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')

    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids

    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=40,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers,
            vgg_frontend=args.vgg_frontend)
    elif model_type == "conformer":
        model = Conformer(
            num_features=40,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers,
            vgg_frontend=args.vgg_frontend,
            is_espnet_structure=args.is_espnet_structure)
    elif model_type == "contextnet":
        model = ContextNet(
            num_features=40,
            num_classes=len(phone_ids) + 1)  # +1 for the blank symbol
    else:
        raise NotImplementedError("Model of type " + str(model_type) +
                                  " is not implemented")

    if avg == 1:
        checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt')
        load_checkpoint(checkpoint, model)
    else:
        checkpoints = [
            os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt')
            for avg_epoch in range(epoch - avg, epoch)
        ]
        average_checkpoint(checkpoints, model)

    model.to(device)
    model.eval()

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(L=L, G=G, H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    logging.debug('Decoding without LM rescoring')
    G = None
    if num_paths > 1:
        logging.debug(f'Use n-best list decoding, n is {num_paths}')
    else:
        logging.debug('Use 1-best decoding')

    logging.debug("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)

    if not hasattr(HLG, 'lm_scores'):
        HLG.lm_scores = HLG.scores.clone()

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test.json.gz')

    logging.info("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    test_sampler = SingleCutSampler(cuts_test, max_frames=12000)
    logging.info("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test, sampler=test_sampler,
                                          batch_size=None, num_workers=1)

    logging.info("About to decode")
    results = decode(dataloader=test_dl, model=model, HLG=HLG, symbols=symbol_table,
                     num_paths=num_paths, G=G, use_whole_lattice=use_whole_lattice,
                     output_beam_size=output_beam_size)

    s = ''
    results2 = []
    for ref, hyp in results:
        s += f'ref={ref}\n'
        s += f'hyp={hyp}\n'
        results2.append((list(''.join(ref)), list(''.join(hyp))))
    logging.info(s)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    dists2 = [edit_distance(r, h) for r, h in results2]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    errors2 = {
        key: sum(dist[key] for dist in dists2)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_words = sum(len(ref) for ref, _ in results)
    total_chars = sum(len(ref) for ref, _ in results2)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
    logging.info(
        f'%CER {errors2["total"] / total_chars:.2%} '
        f'[{errors2["total"]} / {total_chars}, {errors2["ins"]} ins, {errors2["del"]} del, {errors2["sub"]} sub ]'
    )
def main():
    if not torch.cuda.is_available():
        logging.error("No GPU detected!")
        sys.exit(-1)

    device_id = 0
    device = torch.device("cuda", device_id)
    # Reserve the GPU with a dummy variable
    reserve_variable = torch.ones(1).to(device)

    exp_dir = Path("exp-tl1a-adam-xent")
    setup_logger("{}/log/log-decode".format(exp_dir), log_level="debug")

    if not os.path.exists(exp_dir / "HCLG.pt"):
        logging.info("Preparing decoding graph")
        # sym_str = """
        #     <eps> 0
        #     silence 1
        #     speech 2
        # """
        # symbol_table = k2.SymbolTable.from_str(sym_str)
        HCLG = prepare_decoding_graph(
            min_silence_duration=0.03,
            min_speech_duration=0.3,
            max_speech_duration=10.0,
        )
        # Arc sort the HCLG since it is needed for intersect
        logging.info("Sorting decoding graph by outgoing arcs")
        HCLG = k2.arc_sort(HCLG)
        # HCLG.symbols = symbol_table
        torch.save(HCLG.as_dict(), exp_dir / "HCLG.pt")
    else:
        logging.info("Loading pre-compiled decoding graph")
        d = torch.load(exp_dir / "HCLG.pt")
        HCLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path("exp/data")
    logging.info("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / "cuts_test.json.gz")

    logging.info("About to create test dataset")
    test = K2VadDataset(cuts_test, return_cuts=True)
    sampler = SingleCutSampler(cuts_test, max_frames=100000)
    logging.info("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler,
                                          num_workers=1)

    logging.info("About to load model")
    model = TdnnLstm1a(
        num_features=80,
        num_classes=2,  # speech/silence
        subsampling_factor=1,
    )
    checkpoint = os.path.join(exp_dir, "best_model.pt")
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    logging.info("convert decoding graph to device")
    HCLG = HCLG.to(device)
    HCLG.requires_grad_(False)
    logging.info("About to decode")
    results = decode(dataloader=test_dl, model=model, device=device, HCLG=HCLG)

    # Compute frame-level accuracy and precision-recall metrics
    y_true = []
    y_pred = []
    for result in results:
        cut, ref, hyp = result
        y_true.append(ref)
        y_pred.append(hyp)
    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()
    logging.info("Results: \n{}".format(
        classification_report(y_true, y_pred, target_names=["silence", "speech"])))

    # Create output segments per recording
    create_and_write_segments(
        [result[0] for result in results],  # cuts
        [result[2] for result in results],  # outputs
        exp_dir / "segments",  # segments file
    )
def main():
    args = get_parser().parse_args()

    print('World size:', args.world_size, 'Rank:', args.local_rank)
    setup_dist(rank=args.local_rank, world_size=args.world_size,
               master_port=args.master_port)

    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 10
    use_adam = True

    exp_dir = f'exp-lstm-adam-mmi-bigram-musan'
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    lexicon = Lexicon(lang_dir)

    device_id = args.local_rank
    device = torch.device('cuda', device_id)

    phone_ids = lexicon.phone_symbols()

    if not Path(lang_dir / 'P.pt').is_file():
        logging.debug(f'Loading P from {lang_dir}/P.fst.txt')
        with open(lang_dir / 'P.fst.txt') as f:
            # P is not an acceptor because there is
            # a back-off state, whose incoming arcs
            # have label #0 and aux_label eps.
            P = k2.Fsa.from_openfst(f.read(), acceptor=False)

        phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)

        # P.aux_labels is not needed in later computations, so
        # remove it here.
        del P.aux_labels

        # CAUTION(fangjun): The following line is crucial.
        # Arcs entering the back-off state have label equal to #0.
        # We have to change it to 0 here.
        P.labels[P.labels >= first_phone_disambig_id] = 0

        P = k2.remove_epsilon(P)
        P = k2.arc_sort(P)
        torch.save(P.as_dict(), lang_dir / 'P.pt')
    else:
        logging.debug('Loading pre-compiled P')
        d = torch.load(lang_dir / 'P.pt')
        P = k2.Fsa.from_dict(d)

    graph_compiler = MmiTrainingGraphCompiler(
        lexicon=lexicon,
        P=P,
        device=device,
    )

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(
        cuts_train,
        cut_transforms=[
            CutConcatenate(),
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))
        ])
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=40000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train, sampler=train_sampler,
                                           batch_size=None, num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=12000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, sampler=valid_sampler,
                                           batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.P_scores = nn.Parameter(P.scores.clone(), requires_grad=True)

    model.to(device)
    describe(model)

    if use_adam:
        learning_rate = 1e-3
        weight_decay = 5e-4
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate,
                                weight_decay=weight_decay)
        # Equivalent to the following in the epoch loop:
        #  if epoch > 6:
        #      curr_learning_rate *= 0.8
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lambda ep: 1.0 if ep < 7 else 0.8**(ep - 6))
    else:
        learning_rate = 5e-5
        weight_decay = 1e-5
        momentum = 0.9
        lr_schedule_gamma = 0.7
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              momentum=momentum, weight_decay=weight_decay)
        lr_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer, gamma=lr_schedule_gamma)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model,
                               optimizer=optimizer, scheduler=lr_scheduler)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)

        # LR scheduler can hold multiple learning rates for multiple parameter groups;
        # For now we report just the first LR which we assume concerns most of the parameters.
        curr_learning_rate = lr_scheduler.get_last_lr()[0]
        tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                             global_batch_idx_train)
        tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        lr_scheduler.step()

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path, model=model, optimizer=None,
                            scheduler=None, epoch=epoch,
                            learning_rate=curr_learning_rate, objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path, current_epoch=epoch,
                               learning_rate=curr_learning_rate, objf=objf,
                               best_objf=best_objf, valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf, best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path, model=model, optimizer=optimizer,
                        scheduler=lr_scheduler, epoch=epoch,
                        learning_rate=curr_learning_rate, objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename, model_path=model_path,
                           current_epoch=epoch, learning_rate=curr_learning_rate,
                           objf=objf, best_objf=best_objf, valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf, best_epoch=best_epoch)

    logging.warning('Done')
def main():
    args = get_parser().parse_args()

    start_epoch = args.start_epoch
    num_epochs = args.num_epochs
    max_frames = args.max_frames
    accum_grad = args.accum_grad
    den_scale = args.den_scale
    att_rate = args.att_rate

    fix_random_seed(42)

    exp_dir = Path('exp-transformer-noam-mmi-att-musan')
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = MmiTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.zeros_like(P.scores)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train-clean-100.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev-clean.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if not args.bucketing_sampler:
        # We don't mix concatenating the cuts and bucketing
        # Here we insert concatenation before mixing so that the
        # noises from Musan are mixed onto almost-zero-energy
        # padding frames.
        transforms = [CutConcatenate()] + transforms
    train = K2SpeechRecognitionDataset(cuts_train, cut_transforms=transforms)
    if args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(cuts_train, max_frames=max_frames,
                                         shuffle=True, num_buckets=args.num_buckets)
    else:
        logging.info('Using regular sampler with cut concatenation.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_frames=max_frames,
            shuffle=True,
        )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train, sampler=train_sampler,
                                           batch_size=None, num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=max_frames)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, sampler=valid_sampler,
                                           batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    model = Transformer(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
        num_decoder_layers=num_decoder_layers)
    model.P_scores = nn.Parameter(P.scores.clone(), requires_grad=True)

    model.to(device)
    describe(model)

    optimizer = Noam(model.parameters(), model_size=256, factor=1.0,
                     warm_step=args.warm_step)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model, optimizer=optimizer)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = optimizer._rate
        tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                             global_batch_idx_train)
        tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            P=P,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            accum_grad=accum_grad,
            den_scale=den_scale,
            att_rate=att_rate,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path, optimizer=None, scheduler=None,
                            model=model, epoch=epoch,
                            learning_rate=curr_learning_rate, objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path, current_epoch=epoch,
                               learning_rate=curr_learning_rate, objf=objf,
                               best_objf=best_objf, valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf, best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path, optimizer=optimizer, scheduler=None,
                        model=model, epoch=epoch,
                        learning_rate=curr_learning_rate, objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename, model_path=model_path,
                           current_epoch=epoch, learning_rate=curr_learning_rate,
                           objf=objf, best_objf=best_objf, valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf, best_epoch=best_epoch)

    logging.warning('Done')
def main():
    fix_random_seed(42)

    if not torch.cuda.is_available():
        logging.error("No GPU detected!")
        sys.exit(-1)
    device_id = 0
    device = torch.device("cuda", device_id)
    # Reserve the GPU with a dummy variable
    reserve_variable = torch.ones(1).to(device)

    start_epoch = 0
    num_epochs = 100

    exp_dir = "exp-tl1a-adam-xent"
    setup_logger("{}/log/log-train".format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f"{exp_dir}/tensorboard")

    # load dataset
    feature_dir = Path("exp/data")
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / "cuts_train.json.gz")
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / "cuts_dev.json.gz")

    logging.info("About to create train dataset")
    train = K2VadDataset(cuts_train)
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=90000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train, sampler=train_sampler,
                                           batch_size=None, num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2VadDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, sampler=valid_sampler,
                                           batch_size=None, num_workers=1)

    logging.info("About to create model")
    model = TdnnLstm1a(
        num_features=80,
        num_classes=2,  # speech/silence
        subsampling_factor=1,
    )
    model.to(device)
    describe(model)

    learning_rate = 1e-4
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=5e-4)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, "best_model.pt")
    best_epoch_info_filename = os.path.join(exp_dir, "best-epoch-info")
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, "epoch-{}.pt".format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model, optimizer=optimizer)
        best_objf = ckpt["objf"]
        best_valid_objf = ckpt["valid_objf"]
        global_batch_idx_train = ckpt["global_batch_idx_train"]
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = learning_rate
        tb_writer.add_scalar("learning_rate", curr_learning_rate, epoch)

        logging.info("epoch {}, learning rate {}".format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(
                filename=best_model_path, model=model, epoch=epoch,
                optimizer=None, scheduler=None,
                learning_rate=curr_learning_rate, objf=objf, valid_objf=valid_objf,
                global_batch_idx_train=global_batch_idx_train,
            )
            save_training_info(
                filename=best_epoch_info_filename, model_path=best_model_path,
                current_epoch=epoch, learning_rate=curr_learning_rate,
                objf=best_objf, best_objf=best_objf, valid_objf=valid_objf,
                best_valid_objf=best_valid_objf, best_epoch=best_epoch,
            )

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, "epoch-{}.pt".format(epoch))
        save_checkpoint(
            filename=model_path, model=model, optimizer=optimizer, scheduler=None,
            epoch=epoch, learning_rate=curr_learning_rate, objf=objf,
            valid_objf=valid_objf, global_batch_idx_train=global_batch_idx_train,
        )
        epoch_info_filename = os.path.join(exp_dir, "epoch-{}-info".format(epoch))
        save_training_info(
            filename=epoch_info_filename, model_path=model_path,
            current_epoch=epoch, learning_rate=curr_learning_rate,
            objf=objf, best_objf=best_objf, valid_objf=valid_objf,
            best_valid_objf=best_valid_objf, best_epoch=best_epoch,
        )

    logging.warning("Done")
def main():
    args = get_parser().parse_args()

    print('World size:', args.world_size, 'Rank:', args.local_rank)
    setup_dist(rank=args.local_rank, world_size=args.world_size)

    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 10
    use_adam = True

    exp_dir = f'exp-lstm-adam-mmi-bigram-musan-dist'
    setup_logger('{}/log/log-train'.format(exp_dir), use_console=args.local_rank == 0)
    tb_writer = SummaryWriter(
        log_dir=f'{exp_dir}/tensorboard') if args.local_rank == 0 else None

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = MmiTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.zeros_like(P.scores)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train-clean-100.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev-clean.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if not args.bucketing_sampler:
        # We don't mix concatenating the cuts and bucketing
        # Here we insert concatenation before mixing so that the
        # noises from Musan are mixed onto almost-zero-energy
        # padding frames.
        transforms = [CutConcatenate()] + transforms
    train = K2SpeechRecognitionDataset(cuts_train, cut_transforms=transforms)
    if args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(cuts_train, max_frames=40000,
                                         shuffle=True, num_buckets=30)
    else:
        logging.info('Using regular sampler with cut concatenation.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_frames=30000,
            shuffle=True,
        )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train, sampler=train_sampler,
                                           batch_size=None, num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    # Note: we explicitly set world_size to 1 to disable the auto-detection of
    # distributed training inside the sampler. This way, every GPU will
    # perform the computation on the full dev set. It is a bit wasteful,
    # but unfortunately loss aggregation between multiple processes with
    # torch.distributed.all_reduce() tends to hang indefinitely inside
    # NCCL after ~3000 steps. With the current approach, we can still report
    # the loss on the full validation set.
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000, world_size=1, rank=0)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, sampler=valid_sampler,
                                           batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = args.local_rank
    device = torch.device('cuda', device_id)
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.P_scores = nn.Parameter(P.scores.clone(), requires_grad=True)

    model.to(device)
    describe(model)

    if use_adam:
        learning_rate = 1e-3
        weight_decay = 5e-4
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate,
                                weight_decay=weight_decay)
        # Equivalent to the following in the epoch loop:
        #  if epoch > 6:
        #      curr_learning_rate *= 0.8
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lambda ep: 1.0 if ep < 7 else 0.8**(ep - 6))
    else:
        learning_rate = 5e-5
        weight_decay = 1e-5
        momentum = 0.9
        lr_schedule_gamma = 0.7
        optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                              momentum=momentum, weight_decay=weight_decay)
        lr_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer, gamma=lr_schedule_gamma)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model,
                               optimizer=optimizer, scheduler=lr_scheduler)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    if args.world_size > 1:
        logging.info(
            'Using DistributedDataParallel in training. '
            'The reported loss, num_frames, etc. for training steps include '
            'only the batches seen in the master process (the actual loss '
            'includes batches from all GPUs, and the actual num_frames is '
            f'approx. {args.world_size}x larger.')
        # For now do not sync BatchNorm across GPUs due to NCCL hanging in all_gather...
        # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)

        # LR scheduler can hold multiple learning rates for multiple parameter groups;
        # For now we report just the first LR which we assume concerns most of the parameters.
        curr_learning_rate = lr_scheduler.get_last_lr()[0]
        if tb_writer is not None:
            tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                                 global_batch_idx_train)
            tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            P=P,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        lr_scheduler.step()

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path, model=model, optimizer=None,
                            scheduler=None, epoch=epoch,
                            learning_rate=curr_learning_rate, objf=objf,
                            local_rank=args.local_rank, valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path, current_epoch=epoch,
                               learning_rate=curr_learning_rate, objf=objf,
                               best_objf=best_objf, valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf, best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path, model=model, optimizer=optimizer,
                        scheduler=lr_scheduler, epoch=epoch,
                        learning_rate=curr_learning_rate, objf=objf,
                        local_rank=args.local_rank, valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename, model_path=model_path,
                           current_epoch=epoch, learning_rate=curr_learning_rate,
                           objf=objf, best_objf=best_objf, valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf, best_epoch=best_epoch)

    logging.warning('Done')
    cleanup_dist()
    DynamicBucketingSampler,
    RoundRobinSampler,
    SingleCutSampler,
    ZipSampler,
)
from lhotse.dataset.sampling.dynamic import DynamicCutSampler
from lhotse.testing.dummies import DummyManifest

CUTS = DummyManifest(CutSet, begin_id=0, end_id=100)
CUTS_MOD = CUTS.modify_ids(lambda cid: cid + "_alt")

# fmt: off
SAMPLERS_TO_TEST = [
    # Identically initialized SingleCutSampler
    lambda: (
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
    ),
    # Differently initialized SingleCutSampler with the same CUTS
    lambda: (
        SingleCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True),
        SingleCutSampler(CUTS),
    ),
    # Differently initialized CutPairsSampler with the same CUTS
    lambda: (
        CutPairsSampler(CUTS, CUTS, max_source_duration=10.0, shuffle=True, drop_last=True),
        CutPairsSampler(CUTS, CUTS),
    ),
    # Differently initialized ZipSampler with the same CUTS
    lambda: (
        ZipSampler(
            # CUTS_SHUF just to randomize the order of the zipped-in cutset
def main():
    args = get_parser().parse_args()

    model_type = args.model_type
    epoch = args.epoch
    max_duration = args.max_duration
    avg = args.avg
    att_rate = args.att_rate

    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')

    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids

    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    if avg == 1:
        checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt')
        load_checkpoint(checkpoint, model)
    else:
        checkpoints = [
            os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt')
            for avg_epoch in range(epoch - avg, epoch)
        ]
        average_checkpoint(checkpoints, model)

    model.to(device)
    model.eval()

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(L=L, G=G, H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    logging.debug("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)

    # load dataset
    feature_dir = Path('exp/data')
    test_sets = ['test-clean', 'test-other']
    for test_set in test_sets:
        logging.info(f'* DECODING: {test_set}')

        logging.debug("About to get test cuts")
        cuts_test = load_manifest(feature_dir / f'cuts_{test_set}.json.gz')

        logging.debug("About to create test dataset")
        from lhotse.dataset.input_strategies import OnTheFlyFeatures
        from lhotse import Fbank, FbankConfig
        test = K2SpeechRecognitionDataset(
            cuts_test,
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))))
        sampler = SingleCutSampler(cuts_test, max_duration=max_duration)
        logging.debug("About to create test dataloader")
        test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler,
                                              num_workers=1)

        logging.debug("About to decode")
        results = decode(dataloader=test_dl, model=model, device=device, HLG=HLG,
                         symbols=symbol_table)

        recog_path = exp_dir / f'recogs-{test_set}.txt'
        store_transcripts(path=recog_path, texts=results)
        logging.info(f'The transcripts are stored in {recog_path}')

        # compute WER
        dists = [edit_distance(r, h) for r, h in results]
        errors = {
            key: sum(dist[key] for dist in dists)
            for key in ['sub', 'ins', 'del', 'total']
        }
        total_words = sum(len(ref) for ref, _ in results)
        # Print Kaldi-like message:
        # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
        logging.info(
            f'[{test_set}] %WER {errors["total"] / total_words:.2%} '
            f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
        )
def test_lhotse_dataloader_runs(cuts):
    sampler = SingleCutSampler(cuts, max_cuts=2)
    dloader = LhotseDataLoader(UnsupervisedDataset(), sampler, num_workers=2)
    batches = list(dloader)
    assert len(batches) == 50
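# Hedged companion to the test above: SingleCutSampler is itself iterable, so with
# max_cuts=2 and 100 cuts it yields 50 two-cut batches even without a dataloader.
# The helper name is an illustrative assumption.
def count_sampler_batches_sketch(cuts):
    sampler = SingleCutSampler(cuts, max_cuts=2)
    return sum(1 for _ in sampler)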
def main():
    args = get_parser().parse_args()

    epoch = args.epoch
    max_frames = args.max_frames
    avg = args.avg
    att_rate = args.att_rate

    # exp_dir = Path('/export/gpudisk2/data/hegc/audio_workspace/snowfall_aishell1/exp-transformer-noam-mmi-att-musan')
    exp_dir = Path('exp-transformer-noam-mmi-att-musan')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    phone_ids = get_phone_symbols(phone_symbol_table)
    P = create_bigram_phone_lm(phone_ids)

    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    model = Transformer(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
        num_decoder_layers=num_decoder_layers)
    model.P_scores = torch.nn.Parameter(P.scores.clone(), requires_grad=False)

    if avg == 1:
        checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt')
        load_checkpoint(checkpoint, model)
    else:
        checkpoints = [
            os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt')
            for avg_epoch in range(epoch - avg, epoch)
        ]
        average_checkpoint(checkpoints, model)

    model.to(device)
    model.eval()

    assert P.requires_grad is False
    P.scores = model.P_scores.cpu()
    print_transition_probabilities(P,
                                   phone_symbol_table,
                                   phone_ids,
                                   filename='model_P_scores.txt')

    P.set_scores_stochastic_(model.P_scores)
    print_transition_probabilities(P,
                                   phone_symbol_table,
                                   phone_ids,
                                   filename='P_scores.txt')

    if not os.path.exists(lang_dir / 'LG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        LG = compile_LG(L=L,
                        G=G,
                        ctc_topo=ctc_topo,
                        labels_disambig_id_start=first_phone_disambig_id,
                        aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(LG.as_dict(), lang_dir / 'LG.pt')
    else:
        logging.debug("Loading pre-compiled LG")
        d = torch.load(lang_dir / 'LG.pt')
        LG = k2.Fsa.from_dict(d)

    # load dataset
    # feature_dir = Path('/export/gpudisk2/data/hegc/audio_workspace/snowfall_aishell1/exp/data')
    feature_dir = Path('exp/data')
    logging.debug("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test.json.gz')

    logging.debug("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=max_frames)
    logging.debug("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test,
                                          batch_size=None,
                                          sampler=sampler,
                                          num_workers=1)

    # if not torch.cuda.is_available():
    #     logging.error('No GPU detected!')
    #     sys.exit(-1)

    logging.debug("convert LG to device")
    LG = LG.to(device)
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)
    LG.requires_grad_(False)
    logging.debug("About to decode")
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     LG=LG,
                     symbols=symbol_table)

    s = ''
    results2 = []
    for ref, hyp in results:
        s += f'ref={ref}\n'
        s += f'hyp={hyp}\n'
        # Also keep a character-level version of each (ref, hyp) pair for CER.
        results2.append((list(''.join(ref)), list(''.join(hyp))))
    logging.info(s)

    # compute WER (word level) and CER (character level)
    dists = [edit_distance(r, h) for r, h in results]
    dists2 = [edit_distance(r, h) for r, h in results2]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    errors2 = {
        key: sum(dist[key] for dist in dists2)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_words = sum(len(ref) for ref, _ in results)
    total_chars = sum(len(ref) for ref, _ in results2)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
    logging.info(
        f'%CER {errors2["total"] / total_chars:.2%} '
        f'[{errors2["total"]} / {total_chars}, {errors2["ins"]} ins, {errors2["del"]} del, {errors2["sub"]} sub ]'
    )
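# Hedged usage sketch (assumption, not from the original file): `edit_distance(ref, hyp)`
# above is assumed to return per-utterance counts keyed by 'ins', 'del', 'sub' and
# 'total' (as, e.g., kaldialign.edit_distance does), which is why the totals can simply
# be summed over utterances before dividing by the reference length to get %WER / %CER.
def error_rate_from_dists(dists, total_ref_tokens):
    # dists: list of dicts with 'ins'/'del'/'sub'/'total'; total_ref_tokens: int.
    total_errs = sum(d['total'] for d in dists)
    return total_errs / total_ref_tokens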
def main():
    args = get_parser().parse_args()

    model_type = args.model_type
    start_epoch = args.start_epoch
    num_epochs = args.num_epochs
    max_duration = args.max_duration
    accum_grad = args.accum_grad
    att_rate = args.att_rate

    fix_random_seed(42)

    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(
        log_dir=f'{exp_dir}/tensorboard') if args.tensorboard else None

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = CtcTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = load_manifest(feature_dir / 'cuts_train-clean-100.json.gz')
    if args.full_libri:
        cuts_train = (
            cuts_train +
            load_manifest(feature_dir / 'cuts_train-clean-360.json.gz') +
            load_manifest(feature_dir / 'cuts_train-other-500.json.gz'))

    logging.info("About to get dev cuts")
    cuts_dev = (load_manifest(feature_dir / 'cuts_dev-clean.json.gz') +
                load_manifest(feature_dir / 'cuts_dev-other.json.gz'))
    logging.info("About to get Musan cuts")
    cuts_musan = load_manifest(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if args.concatenate_cuts:
        logging.info(
            f'Using cut concatenation with duration factor {args.duration_factor} and gap {args.gap}.'
        )
        # Cut concatenation should be the first transform in the list,
        # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
        transforms = [
            CutConcatenate(duration_factor=args.duration_factor, gap=args.gap)
        ] + transforms

    train = K2SpeechRecognitionDataset(cuts_train,
                                       cut_transforms=transforms,
                                       input_transforms=[
                                           SpecAugment(num_frame_masks=2,
                                                       features_mask_size=27,
                                                       num_feature_masks=2,
                                                       frames_mask_size=100)
                                       ])

    if args.on_the_fly_feats:
        # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
        # # Add on-the-fly speed perturbation; since originally it would have increased epoch
        # # size by 3, we will apply prob 2/3 and use 3x more epochs.
        # # Speed perturbation probably should come first before concatenation,
        # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
        # Drop feats to be on the safe side.
        cuts_train = cuts_train.drop_features()
        from lhotse.features.fbank import FbankConfig
        train = K2SpeechRecognitionDataset(
            cuts=cuts_train,
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(
                FbankConfig(num_mel_bins=80))),
            input_transforms=[
                SpecAugment(num_frame_masks=2,
                            features_mask_size=27,
                            num_feature_masks=2,
                            frames_mask_size=100)
            ])

    if args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(cuts_train,
                                         max_duration=max_duration,
                                         shuffle=True,
                                         num_buckets=args.num_buckets)
    else:
        logging.info('Using SingleCutSampler.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_duration=max_duration,
            shuffle=True,
        )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(
        train,
        sampler=train_sampler,
        batch_size=None,
        num_workers=4,
    )

    logging.info("About to create dev dataset")
    if args.on_the_fly_feats:
        cuts_dev = cuts_dev.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_dev.drop_features(),
            input_strategy=OnTheFlyFeatures(Fbank(
                FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(
        cuts_dev,
        max_duration=max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    model.to(device)
    describe(model)

    optimizer = Noam(model.parameters(),
                     model_size=args.attention_dim,
                     factor=1.0,
                     warm_step=args.warm_step)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                                 global_batch_idx_train)
            tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            accum_grad=accum_grad,
            att_rate=att_rate,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            optimizer=None,
                            scheduler=None,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        optimizer=optimizer,
                        scheduler=None,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir,
                                           'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')
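# Hedged sketch (assumption, not from the original file): the Noam optimizer used above
# exposes its current learning rate via `optimizer._rate`, and that rate is assumed to
# follow the standard Transformer warm-up schedule with the factor / model_size /
# warm_step parameters passed at construction time.
def noam_rate(step: int, model_size: int, factor: float, warm_step: int) -> float:
    # Steps are counted from 1: linear warm-up for `warm_step` steps,
    # then inverse-square-root decay.
    return factor * model_size ** (-0.5) * min(step ** (-0.5),
                                               step * warm_step ** (-1.5))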
def main():
    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 10
    use_adam = True

    exp_dir = f'exp-lstm-adam-mmi-bigram-musan'
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = MmiTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.zeros_like(P.scores)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(cuts_train,
                                       cut_transforms=[
                                           CutConcatenate(),
                                           CutMix(cuts=cuts_musan,
                                                  prob=0.5,
                                                  snr=(10, 20))
                                       ])
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=12000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train,
                                           sampler=train_sampler,
                                           batch_size=None,
                                           num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=12000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.P_scores = nn.Parameter(P.scores.clone(), requires_grad=True)

    model.to(device)
    describe(model)

    if use_adam:
        learning_rate = 1e-3
        weight_decay = 5e-4
        optimizer = optim.AdamW(model.parameters(),
                                lr=learning_rate,
                                weight_decay=weight_decay)
        # Equivalent to the following in the epoch loop:
        #  if epoch > 6:
        #      curr_learning_rate *= 0.8
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lambda ep: 1.0 if ep < 7 else 0.8**(ep - 6))
    else:
        learning_rate = 5e-5
        weight_decay = 1e-5
        momentum = 0.9
        lr_schedule_gamma = 0.7
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        lr_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer, gamma=lr_schedule_gamma)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer,
                               scheduler=lr_scheduler)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)

        # LR scheduler can hold multiple learning rates for multiple parameter groups;
        # For now we report just the first LR which we assume concerns most of the parameters.
        curr_learning_rate = lr_scheduler.get_last_lr()[0]
        tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                             global_batch_idx_train)
        tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            P=P,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )

        lr_scheduler.step()

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            optimizer=None,
                            scheduler=None,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        optimizer=optimizer,
                        scheduler=lr_scheduler,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir,
                                           'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')
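# Hedged illustration of the LambdaLR schedule configured above for the AdamW branch:
# the learning rate is assumed to stay at its initial value for the first 7 epochs and
# then decay by a factor of 0.8 per epoch, matching the commented-out "if epoch > 6"
# logic. The helper name is ours, for illustration only.
def adamw_lr_for_epoch(epoch: int, base_lr: float = 1e-3) -> float:
    # LambdaLR multiplies base_lr by the lambda's return value for the given epoch.
    return base_lr * (1.0 if epoch < 7 else 0.8 ** (epoch - 6))

# e.g. adamw_lr_for_epoch(6) == 1e-3, adamw_lr_for_epoch(7) == 0.8e-3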
def main():
    exp_dir = Path('exp-lstm-adam-mmi-mbr-musan')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    phone_ids = get_phone_symbols(phone_symbol_table)
    P = create_bigram_phone_lm(phone_ids)

    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.P_scores = torch.nn.Parameter(P.scores.clone(), requires_grad=False)

    checkpoint = os.path.join(exp_dir, 'epoch-9.pt')
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    assert P.requires_grad is False
    P.scores = model.P_scores.cpu()
    print_transition_probabilities(P,
                                   phone_symbol_table,
                                   phone_ids,
                                   filename='model_P_scores.txt')

    P.set_scores_stochastic_(model.P_scores)
    print_transition_probabilities(P,
                                   phone_symbol_table,
                                   phone_ids,
                                   filename='P_scores.txt')

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(
            phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(L=L,
                          G=G,
                          H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path('exp/data')
    logging.debug("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz')

    logging.info("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=100000)
    logging.info("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test,
                                          batch_size=None,
                                          sampler=sampler,
                                          num_workers=1)

    # if not torch.cuda.is_available():
    #     logging.error('No GPU detected!')
    #     sys.exit(-1)

    logging.debug("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)
    logging.debug("About to decode")
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     HLG=HLG,
                     symbols=symbol_table)

    s = ''
    for ref, hyp in results:
        s += f'ref={ref}\n'
        s += f'hyp={hyp}\n'
    logging.info(s)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
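# Hedged illustration (assumption, not from the original file): P.set_scores_stochastic_
# above is assumed to renormalise the bigram phone LM so that, for every state, the
# probabilities of its outgoing arcs sum to one; conceptually a log-softmax applied per
# source state, roughly as sketched below with a hypothetical helper.
import torch


def normalize_scores_per_state(scores: torch.Tensor,
                               arc_src_state: torch.Tensor) -> torch.Tensor:
    # scores: raw arc scores (log domain); arc_src_state: source-state id of every arc.
    out = scores.clone()
    for s in arc_src_state.unique():
        mask = arc_src_state == s
        out[mask] = torch.log_softmax(scores[mask], dim=0)
    return out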
def run(rank, world_size, args):
    '''
    Args:
      rank:
        It is a value between 0 and `world_size-1`, which is
        passed automatically by `mp.spawn()` in :func:`main`.
        The node with rank 0 is responsible for saving checkpoint.
      world_size:
        Number of GPUs for DDP training.
      args:
        The return value of get_parser().parse_args()
    '''
    model_type = args.model_type
    start_epoch = args.start_epoch
    num_epochs = args.num_epochs
    accum_grad = args.accum_grad
    den_scale = args.den_scale
    att_rate = args.att_rate
    use_pruned_intersect = args.use_pruned_intersect

    fix_random_seed(42)
    if world_size > 1:
        setup_dist(rank, world_size, args.master_port)

    exp_dir = Path('exp-' + model_type + '-mmi-att-sa-vgg-normlayer')
    setup_logger(f'{exp_dir}/log/log-train-{rank}')
    if args.tensorboard and rank == 0:
        tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')
    else:
        tb_writer = None
    #  tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard') if args.tensorboard and rank == 0 else None

    logging.info("Loading lexicon and symbol tables")
    lang_dir = Path('data/lang_nosp')
    lexicon = Lexicon(lang_dir)

    device_id = rank
    device = torch.device('cuda', device_id)

    if not Path(lang_dir / 'P.pt').is_file():
        logging.debug(f'Loading P from {lang_dir}/P.fst.txt')
        with open(lang_dir / 'P.fst.txt') as f:
            # P is not an acceptor because there is
            # a back-off state, whose incoming arcs
            # have label #0 and aux_label eps.
            P = k2.Fsa.from_openfst(f.read(), acceptor=False)

        phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
        first_phone_disambig_id = find_first_disambig_symbol(
            phone_symbol_table)

        # P.aux_labels is not needed in later computations, so
        # remove it here.
        del P.aux_labels

        # CAUTION(fangjun): The following line is crucial.
        # Arcs entering the back-off state have label equal to #0.
        # We have to change it to 0 here.
        P.labels[P.labels >= first_phone_disambig_id] = 0

        P = k2.remove_epsilon(P)
        P = k2.arc_sort(P)
        torch.save(P.as_dict(), lang_dir / 'P.pt')
    else:
        logging.debug('Loading pre-compiled P')
        d = torch.load(lang_dir / 'P.pt')
        P = k2.Fsa.from_dict(d)

    graph_compiler = MmiTrainingGraphCompiler(
        lexicon=lexicon,
        P=P,
        device=device,
    )
    phone_ids = lexicon.phone_symbols()

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(cuts_train,
                                       cut_transforms=[
                                           CutConcatenate(),
                                           CutMix(cuts=cuts_musan,
                                                  prob=0.5,
                                                  snr=(10, 20))
                                       ])
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=90000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train,
                                           sampler=train_sampler,
                                           batch_size=None,
                                           num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    if use_pruned_intersect:
        logging.info('Use pruned intersect for den_lats')
    else:
        logging.info("Don't use pruned intersect for den_lats")

    logging.info("About to create model")

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=40,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers,
            vgg_frontend=True)
    elif model_type == "conformer":
        model = Conformer(
            num_features=40,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers,
            vgg_frontend=True,
            is_espnet_structure=True)
    elif model_type == "contextnet":
        model = ContextNet(
            num_features=40,
            num_classes=len(phone_ids) + 1)  # +1 for the blank symbol
    else:
        raise NotImplementedError("Model of type " + str(model_type) +
                                  " is not implemented")

    if args.torchscript:
        logging.info('Applying TorchScript to model...')
        model = torch.jit.script(model)

    model.to(device)
    describe(model)

    if world_size > 1:
        model = DDP(model, device_ids=[rank])

    # Now for the alignment model, if any
    if args.use_ali_model:
        ali_model = TdnnLstm1b(
            num_features=40,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4)

        ali_model_fname = Path(
            f'exp-lstm-adam-ctc-musan/epoch-{args.ali_model_epoch}.pt')
        assert ali_model_fname.is_file(), \
            f'ali model filename {ali_model_fname} does not exist!'
        ali_model.load_state_dict(
            torch.load(ali_model_fname, map_location='cpu')['state_dict'])
        ali_model.to(device)

        ali_model.eval()
        ali_model.requires_grad_(False)
        logging.info(f'Use ali_model: {ali_model_fname}')
    else:
        ali_model = None
        logging.info('No ali_model')

    optimizer = Noam(model.parameters(),
                     model_size=args.attention_dim,
                     factor=args.lr_factor,
                     warm_step=args.warm_step,
                     weight_decay=args.weight_decay)

    scaler = GradScaler(enabled=args.amp)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer,
                               scaler=scaler)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        # train_dl.sampler.set_epoch(epoch)
        curr_learning_rate = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                                 global_batch_idx_train)
            tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            ali_model=ali_model,
            device=device,
            graph_compiler=graph_compiler,
            use_pruned_intersect=use_pruned_intersect,
            optimizer=optimizer,
            accum_grad=accum_grad,
            den_scale=den_scale,
            att_rate=att_rate,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
            world_size=world_size,
            scaler=scaler)

        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            optimizer=None,
                            scheduler=None,
                            scaler=None,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train,
                            local_rank=rank,
                            torchscript=args.torchscript_epoch != -1
                            and epoch >= args.torchscript_epoch)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch,
                               local_rank=rank)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        optimizer=optimizer,
                        scheduler=None,
                        scaler=scaler,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train,
                        local_rank=rank,
                        torchscript=args.torchscript_epoch != -1
                        and epoch >= args.torchscript_epoch)
        epoch_info_filename = os.path.join(exp_dir,
                                           'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch,
                           local_rank=rank)

    logging.warning('Done')

    if world_size > 1:
        torch.distributed.barrier()
        cleanup_dist()
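# A minimal sketch (assumption, not the original entry point) of the `main()` implied by
# the docstring of run(): one training process is spawned per GPU with
# torch.multiprocessing, passing the rank as the first argument. The `world_size`
# argument name is hypothetical here.
import torch.multiprocessing as mp


def main():
    args = get_parser().parse_args()
    world_size = args.world_size
    if world_size > 1:
        # mp.spawn supplies the rank (0..nprocs-1) as the first positional argument.
        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
    else:
        run(rank=0, world_size=1, args=args)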