def eval(self, dev_loader):
    """Run one evaluation pass over ``dev_loader`` and return the mean loss.

    Args:
        dev_loader: iterable yielding ``(utt_ids, batch)`` pairs, where
            ``batch`` is a dict of keyword arguments for ``self.model``.

    Returns:
        float: the average per-batch loss, or ``0.0`` when the loader
        yields no batches.
    """
    self.model.eval()
    eval_loss = 0.0
    num_batches = 0
    for _, batch in dev_loader:
        if self.ngpu > 0:
            batch = map_to_cuda(batch)  # move tensors to GPU
        loss = self.model(**batch)
        eval_loss += loss.item()
        num_batches += 1
    # Bug fix: the original divided by `step + 1`, which raised
    # UnboundLocalError on an empty loader; count batches explicitly.
    return eval_loss / num_batches if num_batches else 0.0
def train_one_epoch(self, epoch, train_loader):
    """Train the model for one epoch (OneFlow backend).

    Args:
        epoch: epoch index, used only for log messages.
        train_loader: iterable yielding ``(utt_ids, inputs, targets)``,
            where ``inputs``/``targets`` are dicts of tensors.

    Returns:
        The running mean loss tracked by ``self.mean_loss``.
    """
    self.model.train()
    batch_steps = len(train_loader)
    step_loss = AverageMeter()  # loss averaged over the current log interval
    auxiliary_loss = AuxiliaryLossAverageMeter()
    span = 0  # accumulated forward/backward wall time since the last log line
    for step, (_, inputs, targets) in enumerate(train_loader):
        if self.ngpu > 0:
            inputs = map_to_cuda(inputs)
            targets = map_to_cuda(targets)
        start = time.time()
        loss, aux_loss = self.model(inputs, targets)
        # Divide by accum_steps so gradients summed over the accumulated
        # micro-batches match one full-batch gradient.
        loss = flow.mean(loss) / self.accum_steps
        loss.backward()
        end = time.time()
        span += end - start
        if self.get_rank() == 0:
            # Multiply back by accum_steps so the reported loss is the
            # un-scaled per-batch value.
            step_loss.update(loss.item() * self.accum_steps, inputs["inputs"].size(0))
            auxiliary_loss.update(aux_loss, self.accum_steps, inputs["inputs"].size(0))
        # Optimizer/scheduler step only every `accum_steps` micro-batches.
        if self.global_training_step % self.accum_steps == 0:
            if self.local_rank == 0:
                self.mean_loss.update(step_loss.avg)
            grad_norm = flow.nn.utils.clip_grad_norm_(
                self.model.parameters(), self.grad_clip, error_if_nonfinite=False)
            if self.grad_noise > 0.0:
                # Inject Gaussian noise (std = self.grad_noise) into every
                # trainable gradient, scaled down by accum_steps.
                for p in self.model.parameters():
                    if p.requires_grad:
                        noise = flow.tensor(
                            np.random.normal(
                                0,
                                self.grad_noise,
                                p.grad.shape,
                            ),
                            device=loss.device,
                        )
                        p.grad += noise / self.accum_steps
            if math.isnan(grad_norm.numpy()):
                logging.warning("Grad norm is NAN. DO NOT UPDATE MODEL!")
            else:
                self.scheduler.step()
                self.optimizer.step()
            # Gradients are cleared even when the update was skipped, so a
            # NaN batch is discarded rather than carried forward.
            self.optimizer.zero_grad()
            if (self.scheduler.global_step % self.log_interval == 0
                    and self.local_rank == 0):
                process = (step + 1) / batch_steps * 100  # % of epoch done
                print_info = (
                    "-Training-Epoch-%d(%.5f%%), Global Step:%d, lr:%.8f, Loss:%.5f, AvgLoss: %.5f, Run Time:%.3f"
                    % (
                        epoch,
                        process,
                        self.scheduler.global_step,
                        self.scheduler.lr,
                        step_loss.avg,
                        self.mean_loss.mean(),
                        span,
                    ))
                print_info += auxiliary_loss.avg_infos
                logger.info(print_info)
                # Reset interval-local accumulators after logging.
                span = 0
                step_loss.reset()
                auxiliary_loss.reset()
        self.global_training_step += 1
        # Debug mode: cut the epoch short after ~30 steps.
        if self.is_debug and step > 30:
            break
    return self.mean_loss.mean()
def main(args):
    """Decode a dataset with a pre-trained end-to-end ASR model (PyTorch).

    Loads the model (and optionally a language model) from checkpoints,
    runs the recognizer over ``args.decode_set``, writes hypotheses and a
    detailed log into a directory named after the decoding configuration,
    and reports WER (plus RTF when ``batch_size == 1``).
    """
    checkpoint = torch.load(args.load_model)
    # Config precedence: an explicit --config YAML wins over the params
    # dict stored inside the checkpoint.
    if args.config is not None:
        with open(args.config, 'r') as f:
            params = yaml.load(f, Loader=yaml.FullLoader)
    else:
        params = checkpoint['params']
    params['data']['batch_size'] = args.batch_size
    model_type = params['model']['type']
    model = End2EndModel[model_type](params['model'])
    # Sub-module weights live under separate checkpoint keys; only the
    # encoder is mandatory, everything else is optional.
    if 'frontend' in checkpoint:
        model.frontend.load_state_dict(checkpoint['frontend'])
        logger.info('[FrontEnd] Load the frontend checkpoint!')
    model.encoder.load_state_dict(checkpoint['encoder'])
    logger.info('[Encoder] Load the encoder checkpoint!')
    if 'decoder' in checkpoint:
        model.decoder.load_state_dict(checkpoint['decoder'])
        logger.info('[Decoder] Load the decoder checkpoint!')
    if 'joint' in checkpoint:
        model.joint.load_state_dict(checkpoint['joint'])
        logger.info('[JointNet] Load the joint net of transducer checkpoint!')
    if 'look_ahead_conv' in checkpoint:
        model.lookahead_conv.load_state_dict(checkpoint['look_ahead_conv'])
        logger.info('[LookAheadConvLayer] Load the external lookaheadconvlayer checkpoint!')
    if 'ctc' in checkpoint:
        model.assistor.load_state_dict(checkpoint['ctc'])
        logger.info('[CTC Assistor] Load the ctc assistor checkpoint!')
    logger.info('Finished! Loaded pre-trained model from %s' % args.load_model)
    model.eval()
    if args.ngpu > 0:
        model.cuda()
    # Optional external language model for shallow fusion / rescoring.
    if args.load_language_model is not None:
        lm_chkpt = torch.load(args.load_language_model)
        lm_parms = lm_chkpt['params']
        lm_type = lm_parms['model']['type']
        lm = LanguageModel[lm_type](lm_parms['model'])
        lm.load_state_dict(lm_chkpt['model'])
        logger.info('Load pre-trained language model from %s' % args.load_language_model)
        lm.eval()
        if args.ngpu > 0:
            lm.cuda()
    else:
        lm = None
        lm_type = None
    data_loader = FeatureLoader(params, args.decode_set, is_eval=True)
    idx2unit = data_loader.dataset.idx2unit  # token-id -> unit string
    recognizer = build_recognizer(model_type, model, lm, args, idx2unit)
    totals = len(data_loader.dataset)
    expdir = os.path.join('egs', params['data']['name'], 'exp', params['train']['save_name'])
    # Build the output directory name out of every decoding option that
    # affects the result, so different configs never collide.
    decoder_folder_name = ['decode']
    decoder_folder_name.append(args.decode_set)
    decoder_folder_name.append(args.mode)
    if args.mode != 'greedy':
        decoder_folder_name.append('%d' % args.beam_width)
    if args.load_language_model is not None:
        decoder_folder_name.append('%s_%.2f' % (lm_type, args.lm_weight))
    if args.ctc_weight > 0.0:
        decoder_folder_name.append('ctc_weight_%.3f' % args.ctc_weight)
    if args.ngram_lm is not None:
        decoder_folder_name.append('ngram_alpha%.2f_beta%.2f' % (args.alpha, args.beta))
    if args.apply_rescoring:
        decoder_folder_name.append('rescore')
        decoder_folder_name.append('rw_%.2f' % args.rescore_weight)
    if args.apply_lm_rescoring:
        decoder_folder_name.append('lm_rescore')
        decoder_folder_name.append('rw_%.2f' % args.rescore_weight)
    # Derive an epoch tag from the checkpoint file name: either an averaged
    # "fromXtoY" range or a single "epoch.N.pt" index.
    # NOTE(review): the bare except also swallows unrelated errors (e.g. a
    # filename matching neither pattern raises AttributeError from the
    # fallback itself) — consider `except AttributeError`.
    try:
        ep = re.search(r'from(\d{1,3})to(\d{1,3})', args.load_model).groups()
        decoder_folder_name.append('_'.join(list(ep)))
    except:
        ep = re.search(r'epoch.(\d{1,3}).pt', args.load_model).groups()[0]
        decoder_folder_name.append('epoch_%s' % ep)
    if args.debug:
        decoder_folder_name.append('debug_%d_samples' % args.num_sample)
    if args.suffix is not None:
        decoder_folder_name.append(args.suffix)
    decode_dir = os.path.join(expdir, '_'.join(decoder_folder_name))
    if not os.path.exists(decode_dir):
        os.makedirs(decode_dir)
    # NOTE(review): these handles are closed manually below, so they leak
    # if decoding raises; a `with` block would be safer.
    writer = open(os.path.join(decode_dir, 'predict.txt'), 'w')
    detail_writer = open(os.path.join(decode_dir, 'predict.log'), 'w')
    top_n_false_tokens = 0  # oracle errors over the full n-best list
    false_tokens = 0        # errors of the 1-best hypothesis
    total_tokens = 0
    accu_time = 0
    total_frames = 0
    for step, (utt_id, inputs, targets) in enumerate(data_loader.loader):
        if args.ngpu > 0:
            inputs = map_to_cuda(inputs)
        enc_inputs = inputs['inputs']
        enc_mask = inputs['mask']
        # Frame counting for RTF only makes sense without padding, i.e.
        # batch_size 1.
        if args.batch_size == 1:
            total_frames += enc_inputs.size(1)
        st = time.time()
        preds, scores = recognizer.recognize(enc_inputs, enc_mask)
        et = time.time()
        span = et - st
        accu_time += span
        truths = targets['targets']
        truths_length = targets['targets_length']
        for b in range(len(preds)):
            n = step * args.batch_size + b  # global utterance index
            # Skip position 0 (presumably a BOS token — confirm against the
            # dataset) and cut at the true length.
            truth = [idx2unit[i.item()] for i in truths[b][1:truths_length[b]]]
            if args.piece2word:
                # '▁' is the SentencePiece word-boundary marker.
                truth = ''.join(truth).replace('▁', ' ')
            else:
                truth = ' '.join(truth)
            print_info = '[%d / %d ] %s - truth : %s' % (n, totals, utt_id[b], truth)
            logger.info(print_info)
            detail_writer.write(print_info+'\n')
            total_tokens += len(truth.split())
            nbest_min_false_tokens = 1e10  # sentinel for the n-best minimum
            for i in range(len(preds[b])):
                pred = preds[b][i]
                if args.piece2word:
                    pred = ''.join(preds[b][i].split()).replace('▁', ' ')
                n_diff = editdistance.eval(truth.split(), pred.split())
                if i == 0:
                    # Standard WER uses only the top hypothesis.
                    false_tokens += n_diff
                nbest_min_false_tokens = min(nbest_min_false_tokens, n_diff)
                print_info = '[%d / %d ] %s - pred-%2d (%3.4f) : %s' % (n, totals, utt_id[b], i, float(scores.cpu()[b, i]), pred)
                logger.info(print_info)
                detail_writer.write(print_info+'\n')
            writer.write(utt_id[b] + ' ' + preds[b][0] + '\n')
            top_n_false_tokens += nbest_min_false_tokens
            detail_writer.write('\n')
        if args.debug and (step+1) * args.batch_size >= args.num_sample:
            break
    writer.close()
    detail_writer.close()
    # Summarize WER / oracle WER / RTF both to the log and to RESULT.
    with open(os.path.join(decode_dir, 'RESULT'), 'w') as w:
        wer = false_tokens / total_tokens * 100
        logger.info('The WER is %.3f.' % wer)
        topn_wer = top_n_false_tokens / total_tokens * 100
        logger.info('The top %d WER is %.3f' % (args.nbest, topn_wer))
        w.write('The Model Chkpt: %s \n' % args.load_model)
        if model_type == 'ctc':
            w.write('Decode Mode: %s \n' % args.mode)
        w.write('The WER is %.3f. \n' % wer)
        if args.batch_size == 1:
            rtf = accu_time / total_frames * 100
            logger.info('The RTF is %.6f' % rtf)
            w.write('The RTF is %.6f' % rtf)
def train_one_epoch(self, epoch, train_loader):
    """Train for one epoch, optionally applying MixSpeech augmentation.

    When ``self.is_mixspeech`` is set, consecutive utterance pairs are
    linearly mixed in the feature domain and the loss is the same convex
    combination of the losses against both transcripts.

    Args:
        epoch: epoch index, used only for log messages.
        train_loader: iterable yielding ``(utt_ids, inputs, targets)``
            dicts of tensors.

    Returns:
        The running mean loss tracked by ``self.mean_loss``.
    """
    self.model.train()
    batch_steps = len(train_loader)
    step_loss = AverageMeter()  # loss averaged over the current log interval
    auxiliary_loss = AuxiliaryLossAverageMeter()
    span = 0  # accumulated forward/backward wall time since the last log
    for step, (_, inputs, targets) in enumerate(train_loader):
        if self.ngpu > 0:
            inputs = map_to_cuda(inputs)
            targets = map_to_cuda(targets)
        start = time.time()
        if self.is_mixspeech:
            batch_size, t_, c_ = inputs["inputs"].shape
            # [MIXSPEECH] works on pairs, so if the batch has an odd number
            # of utterances the last one is dropped.
            num_coeffs = batch_size // 2
            batch_size = num_coeffs * 2
            # Beta(0.5, 0.5) is U-shaped: draws concentrate near 0 and 1,
            # so mixes are mostly dominated by one utterance.
            lambda_ = torch.from_numpy(np.random.beta(0.5, 0.5, size=(1))).float()
            # Broadcast the single scalar over (pairs, time, channels).
            mix_coeffs = lambda_.unsqueeze(1).unsqueeze(1).repeat(num_coeffs, t_, c_)
            if self.ngpu > 0:
                mix_coeffs = mix_coeffs.cuda()
                lambda_ = lambda_.cuda()
            # MIXSPEECH the inputs: X_MIX = lambda*X_even + (1-lambda)*X_odd
            new_inputs = dict()
            # easier on the eyes
            ii = inputs["inputs"]
            new_inputs["inputs"] = ii[0:batch_size:2, :, :] * mix_coeffs +\
                ii[1:batch_size:2, :, :] * (1 - mix_coeffs)
            # mask: pick the longer one: [11000] vs [11100] --> [11100]
            new_inputs["mask"] = torch.max(
                inputs["mask"][0:batch_size:2, :],
                inputs["mask"][1:batch_size:2, :]
            )
            # inputs_length: same — keep the longer of the pair.
            new_inputs["inputs_length"] = torch.max(
                inputs["inputs_length"][0:batch_size:2],
                inputs["inputs_length"][1:batch_size:2]
            )
            # therefore it's a good idea to train short_first
            # MIXSPEECH the outputs, Y_1 (even-indexed utterances)
            target_1 = dict()
            target_1["mask"] = targets["mask"][0:batch_size:2, :]
            target_1["targets_length"] = targets["targets_length"][0:batch_size:2]
            target_1["targets"] = targets["targets"][0:batch_size:2, :]
            # MIXSPEECH the outputs, Y_2 (odd-indexed utterances)
            target_2 = dict()
            target_2["mask"] = targets["mask"][1:batch_size:2, :]
            target_2["targets_length"] = targets["targets_length"][1:batch_size:2]
            target_2["targets"] = targets["targets"][1:batch_size:2, :]
            # Same mixed input scored against both transcripts.
            loss_1, aux_loss_1 = self.model(new_inputs, target_1)
            loss_2, aux_loss_2 = self.model(new_inputs, target_2)
            loss = lambda_ * loss_1 + (1 - lambda_) * loss_2
            # NOTE(review): the auxiliary losses of both branches are
            # discarded here; should be a dictionary when present.
            aux_loss = None
        else:
            # loss: tensor
            # aux_loss: dict {loss1: value1, loss2: value2}
            # self.model.forward_hook(self.scheduler.global_step, self.scheduler.global_epoch)
            loss, aux_loss = self.model(inputs, targets)
        # Divide by accum_steps so accumulated gradients sum to one
        # full-batch gradient.
        loss = torch.mean(loss) / self.accum_steps
        loss.backward()
        end = time.time()
        span += (end - start)
        if self.get_rank() == 0:
            # Report the un-scaled per-batch loss.
            step_loss.update(loss.item() * self.accum_steps, inputs['inputs'].size(0))
            auxiliary_loss.update(aux_loss, self.accum_steps, inputs['inputs'].size(0))
        # Optimizer/scheduler step only every `accum_steps` micro-batches.
        if self.global_training_step % self.accum_steps == 0:
            if self.local_rank == 0:
                self.mean_loss.update(step_loss.avg)
            grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            if self.grad_noise > 0.0:
                # Gaussian gradient noise (std = self.grad_noise).
                for p in self.model.parameters():
                    if p.requires_grad:
                        noise = torch.normal(0, self.grad_noise, p.grad.shape, device=loss.device)
                        p.grad += noise / self.accum_steps
            if math.isnan(grad_norm):
                logging.warning('Grad norm is NAN. DO NOT UPDATE MODEL!')
            else:
                self.scheduler.step()
                self.optimizer.step()
            # Clear gradients even when the update was skipped, so a NaN
            # batch is discarded rather than carried forward.
            self.optimizer.zero_grad()
            if self.is_visual and self.local_rank == 0:
                self.visulizer.add_scalar('train_loss', loss.item(), self.scheduler.global_step)
                self.visulizer.add_scalar('lr', self.scheduler.lr, self.scheduler.global_step)
            if self.scheduler.global_step % self.log_interval == 0 and self.local_rank == 0:
                process = (step + 1) / batch_steps * 100  # % of epoch done
                print_info = "-Training-Epoch-%d(%.5f%%), Global Step:%d, lr:%.8f, Loss:%.5f, AvgLoss: %.5f, Run Time:%.3f" \
                    % (epoch, process, self.scheduler.global_step, self.scheduler.lr, step_loss.avg, self.mean_loss.mean(), span)
                print_info += auxiliary_loss.avg_infos
                logger.info(print_info)
                # Reset interval-local accumulators after logging.
                span = 0
                step_loss.reset()
                auxiliary_loss.reset()
        self.global_training_step += 1
        # Debug mode: cut the epoch short after ~30 steps.
        if self.is_debug and step > 30:
            break
    return self.mean_loss.mean()
def train_one_epoch(self, epoch, train_loader):
    """Train the model for one epoch (PyTorch backend, no augmentation).

    Args:
        epoch: epoch index, used only for log messages.
        train_loader: iterable yielding ``(utt_ids, inputs, targets)``,
            where ``inputs``/``targets`` are dicts of tensors.

    Returns:
        The running mean loss tracked by ``self.mean_loss``.
    """
    self.model.train()
    batch_steps = len(train_loader)
    step_loss = AverageMeter()  # loss averaged over the current log interval
    auxiliary_loss = AuxiliaryLossAverageMeter()
    span = 0  # accumulated forward/backward wall time since the last log
    for step, (_, inputs, targets) in enumerate(train_loader):
        if self.ngpu > 0:
            inputs = map_to_cuda(inputs)
            targets = map_to_cuda(targets)
        start = time.time()
        # loss: tensor
        # aux_loss: dict {loss1: value1, loss2: value2}
        # self.model.forward_hook(self.scheduler.global_step, self.scheduler.global_epoch)
        loss, aux_loss = self.model(inputs, targets)
        # Divide by accum_steps so accumulated gradients sum to one
        # full-batch gradient.
        loss = torch.mean(loss) / self.accum_steps
        loss.backward()
        end = time.time()
        span += (end - start)
        if self.get_rank() == 0:
            # Report the un-scaled per-batch loss.
            step_loss.update(loss.item() * self.accum_steps, inputs['inputs'].size(0))
            auxiliary_loss.update(aux_loss, self.accum_steps, inputs['inputs'].size(0))
        # Optimizer/scheduler step only every `accum_steps` micro-batches.
        if self.global_training_step % self.accum_steps == 0:
            if self.local_rank == 0:
                self.mean_loss.update(step_loss.avg)
            grad_norm = torch.nn.utils.clip_grad_norm_(
                self.model.parameters(), self.grad_clip)
            if self.grad_noise > 0.0:
                # Gaussian gradient noise (std = self.grad_noise).
                for p in self.model.parameters():
                    if p.requires_grad:
                        noise = torch.normal(0, self.grad_noise, p.grad.shape, device=loss.device)
                        p.grad += noise / self.accum_steps
            if math.isnan(grad_norm):
                logging.warning('Grad norm is NAN. DO NOT UPDATE MODEL!')
            else:
                self.scheduler.step()
                self.optimizer.step()
            # Clear gradients even when the update was skipped, so a NaN
            # batch is discarded rather than carried forward.
            self.optimizer.zero_grad()
            if self.is_visual and self.local_rank == 0:
                self.visulizer.add_scalar('train_loss', loss.item(), self.scheduler.global_step)
                self.visulizer.add_scalar('lr', self.scheduler.lr, self.scheduler.global_step)
            if self.scheduler.global_step % self.log_interval == 0 and self.local_rank == 0:
                process = (step + 1) / batch_steps * 100  # % of epoch done
                print_info = "-Training-Epoch-%d(%.5f%%), Global Step:%d, lr:%.8f, Loss:%.5f, AvgLoss: %.5f, Run Time:%.3f" \
                    % (epoch, process, self.scheduler.global_step, self.scheduler.lr, step_loss.avg, self.mean_loss.mean(), span)
                print_info += auxiliary_loss.avg_infos
                logger.info(print_info)
                # Reset interval-local accumulators after logging.
                span = 0
                step_loss.reset()
                auxiliary_loss.reset()
        self.global_training_step += 1
        # Debug mode: cut the epoch short after ~30 steps.
        if self.is_debug and step > 30:
            break
    return self.mean_loss.mean()
def main(args):
    """Decode a dataset with a pre-trained end-to-end ASR model (OneFlow).

    Unlike the PyTorch variant, ``args.load_model`` is a *directory* of
    per-module checkpoints (``encoder.pt``, ``decoder.pt``, ...) loaded
    with ``flow.load``. Writes hypotheses and a detailed log into a
    directory named after the decoding configuration, then reports WER
    (plus RTF when ``batch_size == 1``).
    """
    # `checkpoint` is the list of file names inside the checkpoint dir.
    checkpoint = os.listdir(args.load_model)
    # Config precedence: explicit --config YAML, else the config.yaml
    # stored next to the checkpoint directory.
    if args.config is not None:
        with open(args.config, "r") as f:
            params = yaml.load(f, Loader=yaml.FullLoader)
    else:
        # "../" "config.yaml" is implicit string concatenation:
        # <load_model>/../config.yaml
        path = os.path.join(args.load_model, "../" "config.yaml")
        with open(path, "r") as f:
            params = yaml.load(f, Loader=yaml.FullLoader)
    params["data"]["batch_size"] = args.batch_size
    model_type = params["model"]["type"]
    model = End2EndModel[model_type](params["model"])
    # Each sub-module has its own checkpoint file; all are optional here.
    if "frontend.pt" in checkpoint:
        model.frontend.load_state_dict(
            flow.load(os.path.join(args.load_model, "frontend.pt")))
        logger.info("[FrontEnd] Load the frontend checkpoint!")
    if "encoder.pt" in checkpoint:
        model.encoder.load_state_dict(
            flow.load(os.path.join(args.load_model, "encoder.pt")))
        logger.info("[Encoder] Load the encoder checkpoint!")
    if "decoder.pt" in checkpoint:
        model.decoder.load_state_dict(
            flow.load(os.path.join(args.load_model, "decoder.pt")))
        logger.info("[Decoder] Load the decoder checkpoint!")
    if "joint.pt" in checkpoint:
        model.joint.load_state_dict(
            flow.load(os.path.join(args.load_model, "joint.pt")))
        logger.info("[JointNet] Load the joint net of transducer checkpoint!")
    if "look_ahead_conv.pt" in checkpoint:
        model.lookahead_conv.load_state_dict(
            flow.load(os.path.join(args.load_model, "look_ahead_conv.pt")))
        logger.info(
            "[LookAheadConvLayer] Load the external lookaheadconvlayer checkpoint!"
        )
    if "ctc.pt" in checkpoint:
        model.assistor.load_state_dict(
            flow.load(os.path.join(args.load_model, "ctc.pt")))
        logger.info("[CTC Assistor] Load the ctc assistor checkpoint!")
    logger.info("Finished! Loaded pre-trained model from %s" % args.load_model)
    model.eval()
    if args.ngpu > 0:
        model.cuda()
    # Optional external language model for shallow fusion / rescoring.
    if args.load_language_model is not None:
        lm_chkpt = flow.load(args.load_language_model)
        lm_parms = lm_chkpt["params"]
        lm_type = lm_parms["model"]["type"]
        lm = LanguageModel[lm_type](lm_parms["model"])
        lm.load_state_dict(lm_chkpt["model"])
        logger.info("Load pre-trained language model from %s" % args.load_language_model)
        lm.eval()
        if args.ngpu > 0:
            lm.cuda()
    else:
        lm = None
        lm_type = None
    data_loader = FeatureLoader(params, args.decode_set, is_eval=True)
    idx2unit = data_loader.dataset.idx2unit  # token-id -> unit string
    recognizer = build_recognizer(model_type, model, lm, args, idx2unit)
    totals = len(data_loader.dataset)
    expdir = os.path.join("egs", params["data"]["name"], "exp", params["train"]["save_name"])
    # Build the output directory name out of every decoding option that
    # affects the result, so different configs never collide.
    decoder_folder_name = ["decode"]
    decoder_folder_name.append(args.decode_set)
    decoder_folder_name.append(args.mode)
    if args.mode != "greedy":
        decoder_folder_name.append("%d" % args.beam_width)
    if args.load_language_model is not None:
        decoder_folder_name.append("%s_%.2f" % (lm_type, args.lm_weight))
    if args.ctc_weight > 0.0:
        decoder_folder_name.append("ctc_weight_%.3f" % args.ctc_weight)
    if args.ngram_lm is not None:
        decoder_folder_name.append("ngram_alpha%.2f_beta%.2f" % (args.alpha, args.beta))
    if args.apply_rescoring:
        decoder_folder_name.append("rescore")
        decoder_folder_name.append("rw_%.2f" % args.rescore_weight)
    if args.apply_lm_rescoring:
        decoder_folder_name.append("lm_rescore")
        decoder_folder_name.append("rw_%.2f" % args.rescore_weight)
    # Derive an epoch tag from the checkpoint path: either an averaged
    # "fromXtoY" range or a single "epoch.N.pt" index.
    # NOTE(review): the bare except also swallows unrelated errors —
    # consider `except AttributeError`.
    try:
        ep = re.search(r"from(\d{1,3})to(\d{1,3})", args.load_model).groups()
        decoder_folder_name.append("_".join(list(ep)))
    except:
        ep = re.search(r"epoch.(\d{1,3}).pt", args.load_model).groups()[0]
        decoder_folder_name.append("epoch_%s" % ep)
    if args.debug:
        decoder_folder_name.append("debug_%d_samples" % args.num_sample)
    if args.suffix is not None:
        decoder_folder_name.append(args.suffix)
    decode_dir = os.path.join(expdir, "_".join(decoder_folder_name))
    if not os.path.exists(decode_dir):
        os.makedirs(decode_dir)
    # NOTE(review): these handles are closed manually below, so they leak
    # if decoding raises; a `with` block would be safer.
    writer = open(os.path.join(decode_dir, "predict.txt"), "w")
    detail_writer = open(os.path.join(decode_dir, "predict.log"), "w")
    top_n_false_tokens = 0  # oracle errors over the full n-best list
    false_tokens = 0        # errors of the 1-best hypothesis
    total_tokens = 0
    accu_time = 0
    total_frames = 0
    for step, (utt_id, inputs, targets) in enumerate(data_loader.loader):
        if args.ngpu > 0:
            inputs = map_to_cuda(inputs)
        enc_inputs = inputs["inputs"]
        enc_mask = inputs["mask"]
        # Frame counting for RTF only makes sense without padding, i.e.
        # batch_size 1.
        if args.batch_size == 1:
            total_frames += enc_inputs.size(1)
        st = time.time()
        preds, scores = recognizer.recognize(enc_inputs, enc_mask)
        et = time.time()
        span = et - st
        accu_time += span
        truths = targets["targets"]
        truths_length = targets["targets_length"]
        for b in range(len(preds)):
            n = step * args.batch_size + b  # global utterance index
            # Skip position 0 (presumably a BOS token — confirm against the
            # dataset) and cut at the true length; OneFlow scalar lengths
            # need `.numpy()` to act as slice bounds here.
            truth = [
                idx2unit[i.item()] for i in truths[b][1:truths_length[b].numpy()]
            ]
            if args.piece2word:
                # '▁' is the SentencePiece word-boundary marker.
                truth = "".join(truth).replace("▁", " ")
            else:
                truth = " ".join(truth)
            print_info = "[%d / %d ] %s - truth : %s" % (
                n,
                totals,
                utt_id[b],
                truth,
            )
            logger.info(print_info)
            detail_writer.write(print_info + "\n")
            total_tokens += len(truth.split())
            nbest_min_false_tokens = 1e10  # sentinel for the n-best minimum
            for i in range(len(preds[b])):
                pred = preds[b][i]
                if args.piece2word:
                    pred = "".join(preds[b][i].split()).replace("▁", " ")
                n_diff = editdistance.eval(truth.split(), pred.split())
                if i == 0:
                    # Standard WER uses only the top hypothesis.
                    false_tokens += n_diff
                nbest_min_false_tokens = min(nbest_min_false_tokens, n_diff)
                print_info = "[%d / %d ] %s - pred-%2d (%3.4f) : %s" % (
                    n,
                    totals,
                    utt_id[b],
                    i,
                    float(scores.cpu()[b, i].numpy()),
                    pred,
                )
                logger.info(print_info)
                detail_writer.write(print_info + "\n")
            writer.write(utt_id[b] + " " + preds[b][0] + "\n")
            top_n_false_tokens += nbest_min_false_tokens
            detail_writer.write("\n")
        if args.debug and (step + 1) * args.batch_size >= args.num_sample:
            break
    writer.close()
    detail_writer.close()
    # Summarize WER / oracle WER / RTF both to the log and to RESULT.
    with open(os.path.join(decode_dir, "RESULT"), "w") as w:
        wer = false_tokens / total_tokens * 100
        logger.info("The WER is %.3f." % wer)
        topn_wer = top_n_false_tokens / total_tokens * 100
        logger.info("The top %d WER is %.3f" % (args.nbest, topn_wer))
        w.write("The Model Chkpt: %s \n" % args.load_model)
        if model_type == "ctc":
            w.write("Decode Mode: %s \n" % args.mode)
        w.write("The WER is %.3f. \n" % wer)
        if args.batch_size == 1:
            rtf = accu_time / total_frames * 100
            logger.info("The RTF is %.6f" % rtf)
            w.write("The RTF is %.6f" % rtf)