def eval(epoch):
    """Compute the average loss of `model` over the development set.

    Args:
        epoch (int): current epoch index. Not used by this function; kept so
            that callers can keep passing it.

    Returns:
        float: sum of the per-batch losses (each multiplied by its batch
            size) divided by `len(devset)`

    """
    pbar = tqdm(total=len(devset))
    losses = []

    # Switch to evaluation mode once instead of on every mini-batch
    model.eval()

    # No parameter is updated here, so skip building the autograd graph
    with torch.no_grad():
        while True:
            batch, is_new_epoch = devset.next()
            # NOTE(review): the batch returned together with the new-epoch
            # flag is discarded here -- confirm against `devset.next()` that
            # this does not drop the last mini-batch of the set.
            if is_new_epoch:
                break

            xs, ys = batch['xs'], batch['ys']

            # Acoustic features: frame stacking -> tensors -> padded batch
            xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
            xs = [np2tensor(x).float() for x in xs]
            xlen = torch.IntTensor([len(x) for x in xs])
            xs = pad_list(xs, 0.0).cuda()

            # Labels: padded with 0, lengths kept separately
            _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
            ys_out_pad = pad_list(_ys, 0).long().cuda()
            ylen = np2tensor(np.fromiter([y.size(0) for y in _ys], dtype=np.int32))

            loss = model(xs, ys_out_pad, xlen, ylen)
            # Weight the batch loss by the number of utterances in the batch
            losses.append(float(loss.data) * len(xlen))

            # TODO(vishay): un-hardcode the batch size
            pbar.update(len(batch['xs']))
    pbar.close()

    # Reset data counters
    devset.reset()

    return sum(losses) / len(devset)
def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
    """Turn a mini-batch of raw inputs into encoder outputs.

    Speech inputs are optionally frame-stacked and spliced, converted to
    tensors, padded with zeros, and then passed through SpecAugment,
    Gaussian noise injection and the sequence summary network when those are
    enabled. Text inputs are padded with `self.pad`, embedded, and passed
    through embedding dropout.

    Args:
        xs (list): A list of length `[B]`; per-utterance feature arrays for
            speech input, or token id sequences for text input
        task (str): all/ys*/ys_sub1*/ys_sub2* (only the part before the first
            `.` is handed to the encoder)
        flip (bool): if True, flip acoustic features in the time-dimension
        use_cache (bool): forwarded to the encoder (use the cached forward
            encoder state in the previous chunk as the initial state)
        streaming (bool): forwarded to the encoder (streaming encoding)
    Returns:
        eout_dict (dict): outputs of `self.enc`

    """
    input_kind = self.input_type

    if input_kind == 'speech':
        # Frame stacking
        if self.n_stacks > 1:
            xs = [stack_frame(feat, self.n_stacks, self.n_skips) for feat in xs]

        # Splicing
        if self.n_splices > 1:
            xs = [splice(feat, self.n_splices, self.n_stacks) for feat in xs]

        # Lengths are measured after stacking/splicing, before padding
        xlens = torch.IntTensor([len(feat) for feat in xs])

        if flip:
            # Flip acoustic features in the reverse order
            flipped = []
            for feat in xs:
                # copy() materialises the reversed view as a fresh array
                reversed_feat = np.flip(feat, axis=0).copy()
                flipped.append(torch.from_numpy(reversed_feat).float().cuda(self.device_id))
            xs = flipped
        else:
            xs = [np2tensor(feat, self.device_id).float() for feat in xs]
        xs = pad_list(xs, 0.)

        # SpecAugment (training only)
        if self.use_specaug and self.training:
            xs = self.specaug(xs)

        # Gaussian noise injection
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network: its output is added to the input in place
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif input_kind == 'text':
        xlens = torch.IntTensor([len(tokens) for tokens in xs])
        token_ids = [np2tensor(np.fromiter(tokens, dtype=np.int64), self.device_id)
                     for tokens in xs]
        padded_ids = pad_list(token_ids, self.pad)
        xs = self.dropout_emb(self.embed(padded_ids))
        # TODO(hirofumi): fix for Transformer

    # encoder
    main_task = task.split('.')[0]
    eout_dict = self.enc(xs, xlens, main_task, use_cache, streaming)

    # For these encoder types, the sub-task entries are filled with copies of
    # the main-task output whenever sub-tasks carry weight.
    shares_main_output = self.main_weight < 1 and self.enc_type in (
        'conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer')
    if shares_main_output:
        for sub in ('sub1', 'sub2'):
            sub_key = 'ys_' + sub
            eout_dict[sub_key]['xs'] = eout_dict['ys']['xs'].clone()
            eout_dict[sub_key]['xlens'] = eout_dict['ys']['xlens'][:]

    return eout_dict
def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
    """Turn a mini-batch of raw inputs into encoder outputs.

    Speech inputs are optionally frame-stacked and spliced, converted to
    tensors and padded with zeros. During training, SpecAugment, weight
    noise and input noise are applied when enabled; the sequence summary
    network, if present, then replaces the input with its output. Text
    inputs are padded with `self.pad`, embedded, and passed through
    embedding dropout.

    Args:
        xs (list): A list of length `[B]`; per-utterance feature arrays for
            speech input, or token id sequences for text input
        task (str): all/ys*/ys_sub1*/ys_sub2* (only the part before the first
            `.` is handed to the encoder)
        streaming (bool): forwarded to the encoder (streaming encoding)
        lookback (bool): forwarded to the encoder (truncate leftmost frames
            for lookback in CNN context)
        lookahead (bool): forwarded to the encoder (truncate rightmost frames
            for lookahead in CNN context)
    Returns:
        eout_dict (dict): outputs of `self.enc`

    """
    input_kind = self.input_type

    if input_kind == 'speech':
        # Frame stacking
        if self.n_stacks > 1:
            xs = [stack_frame(feat, self.n_stacks, self.n_skips) for feat in xs]

        # Splicing
        if self.n_splices > 1:
            xs = [splice(feat, self.n_splices, self.n_stacks) for feat in xs]

        # Lengths are measured after stacking/splicing, before padding
        xlens = torch.IntTensor([len(feat) for feat in xs])
        feat_tensors = [np2tensor(feat, self.device).float() for feat in xs]
        xs = pad_list(feat_tensors, 0.)

        # SpecAugment
        if self.specaug is not None and self.training:
            xs = self.specaug(xs)

        # Weight noise injection
        if self.weight_noise_std > 0 and self.training:
            self.add_weight_noise(std=self.weight_noise_std)

        # Input Gaussian noise injection
        if self.input_noise_std > 0 and self.training:
            xs = add_input_noise(xs, std=self.input_noise_std)

        # Sequence summary network
        if self.ssn is not None:
            xs = self.ssn(xs, xlens)

    elif input_kind == 'text':
        xlens = torch.IntTensor([len(tokens) for tokens in xs])
        token_ids = [np2tensor(np.fromiter(tokens, dtype=np.int64), self.device)
                     for tokens in xs]
        padded_ids = pad_list(token_ids, self.pad)
        xs = self.dropout_emb(self.embed(padded_ids))
        # TODO(hirofumi): fix for Transformer

    # encoder
    main_task = task.split('.')[0]
    eout_dict = self.enc(xs, xlens, main_task, streaming, lookback, lookahead)

    # For these encoder types, the sub-task entries are filled with copies of
    # the main-task output whenever sub-tasks carry weight.
    shares_main_output = self.main_weight < 1 and self.enc_type in (
        'conv', 'tds', 'gated_conv')
    if shares_main_output:
        for sub in ('sub1', 'sub2'):
            sub_key = 'ys_' + sub
            eout_dict[sub_key]['xs'] = eout_dict['ys']['xs'].clone()
            eout_dict[sub_key]['xlens'] = eout_dict['ys']['xlens'][:]

    return eout_dict
def collate_fn(self, batch):
    """Merge a list of per-sample dicts into one dict of lists.

    For most fields the first element of the stored value is taken
    (`item[key][0]`); `ys_sub1`, `ys_sub2` and `text` are taken as they are.
    Frame stacking and splicing are then applied to `xs` when
    `self.num_stacks` / `self.num_splices` are greater than 1.

    Args:
        batch (list): samples, each a dict with the keys listed in `fields`
    Returns:
        data (dict): one list per field, in the same order as `batch`

    """
    # (key, take_first): take_first selects item[key][0] instead of item[key]
    fields = (
        ('xs', True),
        ('xlens', True),
        ('ys', True),
        ('ys_hist', True),
        ('ys_sub1', False),
        ('ys_sub2', False),
        ('utt_ids', True),
        ('speakers', True),
        ('sessions', True),
        ('text', False),
    )

    data = {key: [] for key, _ in fields}
    for item in batch:
        for key, take_first in fields:
            value = item[key]
            data[key].append(value[0] if take_first else value)

    # Frame stacking
    if self.num_stacks > 1:
        data['xs'] = [stack_frame(feat, self.num_stacks, self.num_skips)
                      for feat in data['xs']]

    # Splicing
    if self.num_splices > 1:
        data['xs'] = [splice(feat, self.num_splices, self.num_stacks)
                      for feat in data['xs']]

    return data
def encode(self, xs, task='all', flip=False):
    """Turn a mini-batch of raw inputs into encoder outputs.

    When `task` contains `lmobj` the encoder is skipped and a placeholder
    dict with `None` entries is returned. Otherwise speech inputs are
    optionally frame-stacked and spliced, converted to tensors, padded with
    zeros and passed through SpecAugment, Gaussian noise injection and the
    sequence summary network when enabled; text inputs are padded with
    `self.pad` and embedded.

    Args:
        xs (list): A list of length `[B]`; per-utterance feature arrays for
            speech input, or token id sequences for text input
        task (str): all or ys* or ys_sub1* or ys_sub2* (only the part before
            the first `.` is handed to the encoder)
        flip (bool): if True, flip acoustic features in the time-dimension
    Returns:
        enc_outs (dict): outputs of `self.enc`, or the `None` placeholder

    """
    # Nothing to encode for the LM objective
    if 'lmobj' in task:
        return {name: {'xs': None, 'xlens': None}
                for name in ('ys', 'ys_sub1', 'ys_sub2')}

    input_kind = self.input_type

    if input_kind == 'speech':
        # Frame stacking
        if self.n_stacks > 1:
            xs = [stack_frame(feat, self.n_stacks, self.n_skips) for feat in xs]

        # Splicing
        if self.n_splices > 1:
            xs = [splice(feat, self.n_splices, self.n_stacks) for feat in xs]

        # Lengths are measured after stacking/splicing, before padding
        xlens = torch.IntTensor([len(feat) for feat in xs])

        if flip:
            # Flip acoustic features in the reverse order
            flipped = []
            for feat in xs:
                # copy() materialises the reversed view as a fresh array
                reversed_feat = np.flip(feat, axis=0).copy()
                flipped.append(torch.from_numpy(reversed_feat).float().cuda(self.device_id))
            xs = flipped
        else:
            xs = [np2tensor(feat, self.device_id).float() for feat in xs]
        xs = pad_list(xs, 0.0)

        # SpecAugment (training only)
        if self.is_specaug and self.training:
            xs = self.specaug(xs)

        # Gaussian noise injection
        if self.gaussian_noise:
            xs = add_gaussian_noise(xs)

        # Sequence summary network: its output is added to the input in place
        if self.ssn is not None:
            xs += self.ssn(xs, xlens)

    elif input_kind == 'text':
        xlens = torch.IntTensor([len(tokens) for tokens in xs])
        token_ids = [np2tensor(np.fromiter(tokens, dtype=np.int64), self.device_id)
                     for tokens in xs]
        xs = self.embed(pad_list(token_ids, self.pad))

    # encoder
    main_task = task.split('.')[0]
    enc_outs = self.enc(xs, xlens, main_task)

    # For these encoder types, the sub-task entries are filled with copies of
    # the main-task output whenever sub-tasks carry weight.
    shares_main_output = self.main_weight < 1 and self.enc_type in (
        'conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer')
    if shares_main_output:
        for sub in ('sub1', 'sub2'):
            sub_key = 'ys_' + sub
            enc_outs[sub_key]['xs'] = enc_outs['ys']['xs'].clone()
            enc_outs[sub_key]['xlens'] = enc_outs['ys']['xlens'][:]

    return enc_outs
def train():
    """Train `model` on `trainset`, validating with `eval` after every epoch.

    After each epoch the parameters are saved under `args.out`. When the
    validation loss improves on the best one seen so far the checkpoint
    becomes the new best model. Otherwise it is saved with a `_rejected`
    suffix, the best parameters (if any exist yet) are restored, and the
    learning rate is halved when `args.schedule` is set.
    """

    def adjust_learning_rate(optimizer, lr):
        """Set the learning rate of every parameter group to `lr`."""
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    def add_noise(x):
        """Add zero-mean Gaussian noise (std 0.075) to `x` in place.

        A single noise vector of size `x.shape[-1]` is drawn and broadcast
        over the leading dimensions.
        """
        dim = x.shape[-1]
        noise = torch.normal(torch.zeros(dim), 0.075)
        if x.is_cuda:
            noise = noise.cuda()
        x.data += noise

    prev_loss = 2000  # initial threshold a validation loss must beat
    best_model = None  # path of the best checkpoint saved so far
    lr = args.lr

    for epoch in range(1, args.epochs):
        totloss = 0
        losses = []
        start_time = time.time()
        step = 0
        tbar = tqdm(total=len(trainset))

        while True:
            # Compute loss in the training set
            batch, is_new_epoch = trainset.next()
            if is_new_epoch:
                break

            xs, ys = batch['xs'], batch['ys']

            # Acoustic features: frame stacking -> tensors -> padded batch
            xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
            xs = [np2tensor(x).float() for x in xs]
            xlen = torch.IntTensor([len(x) for x in xs])
            # Already on the GPU after this line, so no second `.cuda()` call
            xs = pad_list(xs, 0.0).cuda()

            # Labels: padded with 0, lengths kept separately
            _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
            ys_out_pad = pad_list(_ys, 0).long().cuda()
            ylen = np2tensor(
                np.fromiter([y.size(0) for y in _ys], dtype=np.int32))

            if args.noise:
                add_noise(xs)

            # `eval` switches the model to evaluation mode, so switch back
            model.train()
            optimizer.zero_grad()
            loss = model(xs, ys_out_pad, xlen, ylen)
            loss.backward()

            # Weight the batch loss by the number of utterances in the batch
            loss = float(loss.data) * len(xlen)
            totloss += loss
            losses.append(loss)

            if args.gradclip:
                # NOTE(review): newer PyTorch deprecates this name in favour
                # of `clip_grad_norm_` -- switch once the minimum supported
                # version is confirmed.
                nn.utils.clip_grad_norm(model.parameters(), 200)
            optimizer.step()

            step += 1
            # TODO(vishay): un-hardcode the batch size
            if step % args.log_interval == 0:
                avg_loss = totloss / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] train_loss %.2f',
                             epoch + args.resume_epoch, step, avg_loss)
                totloss = 0

            tbar.update(len(batch['xs']))
        tbar.close()
        trainset.reset()

        losses = sum(losses) / len(trainset)
        val_l = eval(epoch)

        logging.info(
            '[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.3e',
            epoch + args.resume_epoch, time.time() - start_time, losses,
            val_l, lr)

        if val_l < prev_loss:
            prev_loss = val_l
            best_model = '{}/params_epoch{:02d}_tr{:.2f}_cv{:.2f}'.format(
                args.out, epoch + args.resume_epoch, losses, val_l)
            torch.save(model.state_dict(), best_model)
        else:
            torch.save(
                model.state_dict(),
                '{}/params_epoch{:02d}_tr{:.2f}_cv{:.2f}_rejected'.format(
                    args.out, epoch + args.resume_epoch, losses, val_l))
            # No checkpoint has been accepted yet if the very first epochs
            # fail to beat the initial threshold; there is nothing to restore.
            if best_model is not None:
                model.load_state_dict(torch.load(best_model))
                if args.cuda:
                    model.cuda()
            if args.schedule:
                lr /= 2
                adjust_learning_rate(optimizer, lr)
def eval(epoch):
    """Evaluate `model` on `evalset`: loss, greedy-decoding WER and CER.

    References and hypotheses are written to
    `<args.out>/ref_epoch_<epoch>.trn` and `<args.out>/hyp_epoch_<epoch>.trn`,
    one `text (speaker-utt_id)` line per utterance.

    Args:
        epoch (int): epoch index, used only to name the `.trn` files

    Returns:
        tuple: `(loss, wer, cer)` where `loss` is the sum of the per-batch
            losses (each multiplied by its batch size) divided by
            `len(evalset)`, and `wer` / `cer` are the accumulated
            `compute_wer` scores divided by the number of reference words /
            characters

    """
    recog_dir = args.out
    ref_trn_save_path = recog_dir + '/ref_epoch_' + str(epoch) + '.trn'
    hyp_trn_save_path = recog_dir + '/hyp_epoch_' + str(epoch) + '.trn'

    wer, cer = 0, 0
    n_sub_w, n_ins_w, n_del_w = 0, 0, 0
    n_sub_c, n_ins_c, n_del_c = 0, 0, 0
    n_word, n_char = 0, 0
    losses = []

    pbar = tqdm(total=len(evalset))

    # Switch to evaluation mode once instead of on every mini-batch
    model.eval()

    # The `with` block closes both transcript files even if decoding raises;
    # no_grad() avoids building autograd graphs that are never used here.
    with open(hyp_trn_save_path, 'w') as f_hyp, \
            open(ref_trn_save_path, 'w') as f_ref, \
            torch.no_grad():
        while True:
            batch, is_new_epoch = evalset.next()

            xs, ys = batch['xs'], batch['ys']

            # Acoustic features: frame stacking -> tensors -> padded batch
            xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
            xs = [np2tensor(x).float() for x in xs]
            xlen = torch.IntTensor([len(x) for x in xs])
            xs = pad_list(xs, 0.0)

            # Labels: padded with 0, lengths kept separately
            _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
            ys_out_pad = pad_list(_ys, 0).long()
            ylen = np2tensor(np.fromiter([y.size(0) for y in _ys], dtype=np.int32))

            loss = model(xs, ys_out_pad, xlen, ylen)
            # Weight the batch loss by the number of utterances in the batch
            losses.append(float(loss.data) * len(xlen))

            # TODO(vishay): un-hardcode the batch size
            best_hyps_id, _ = model.greedy_decode(xs)

            for b in range(len(batch['xs'])):
                ref = batch['text'][b]
                hyp = evalset.idx2token[0](best_hyps_id[b])
                if hyp is None:
                    hyp = "none"

                # Write to trn
                utt_id = str(batch['utt_ids'][b])
                speaker = str(batch['speakers'][b]).replace('-', '_')
                f_ref.write(ref + ' (' + speaker + '-' + utt_id + ')\n')
                f_hyp.write(hyp + ' (' + speaker + '-' + utt_id + ')\n')

                # TODO: this is only for char unit
                # Compute WER
                wer_b, sub_b, ins_b, del_b = compute_wer(ref=ref.split(' '),
                                                         hyp=hyp.split(' '),
                                                         normalize=False)
                wer += wer_b
                n_sub_w += sub_b
                n_ins_w += ins_b
                n_del_w += del_b
                n_word += len(ref.split(' '))

                # Compute CER
                cer_b, sub_b, ins_b, del_b = compute_wer(ref=list(ref),
                                                         hyp=list(hyp),
                                                         normalize=False)
                cer += cer_b
                n_sub_c += sub_b
                n_ins_c += ins_b
                n_del_c += del_b
                n_char += len(ref)

            pbar.update(len(batch['xs']))

            # Checked after processing, so the batch that comes with the
            # new-epoch flag is still evaluated
            if is_new_epoch:
                break

    pbar.close()

    # Reset data counters
    evalset.reset()

    # Normalise the accumulated scores by the reference lengths
    wer /= n_word
    n_sub_w /= n_word
    n_ins_w /= n_word
    n_del_w /= n_word
    cer /= n_char
    n_sub_c /= n_char
    n_ins_c /= n_char
    n_del_c /= n_char

    logging.info('WER (%s): %.2f %%' % (evalset.set, wer))
    logging.info('SUB: %.2f / INS: %.2f / DEL: %.2f' % (n_sub_w, n_ins_w, n_del_w))
    logging.info('CER (%s): %.2f %%' % (evalset.set, cer))
    logging.info('SUB: %.2f / INS: %.2f / DEL: %.2f' % (n_sub_c, n_ins_c, n_del_c))

    return sum(losses) / len(evalset), wer, cer