def test(): args = parser.parse_args() try: conf = yaml.safe_load(open(args.conf, 'r')) except: print("Config file not exist!") sys.exit(1) opts = Config() for k, v in conf.items(): setattr(opts, k, v) print('{:50}:{}'.format(k, v)) use_cuda = opts.use_gpu device = torch.device('cuda:0') if use_cuda else torch.device('cpu') model_path = os.path.join(opts.checkpoint_dir, opts.exp_name, 'ctc_best_model.pkl') package = torch.load(model_path) rnn_param = package["rnn_param"] add_cnn = package["add_cnn"] cnn_param = package["cnn_param"] num_class = package["num_class"] feature_type = package['epoch']['feature_type'] n_feats = package['epoch']['n_feats'] drop_out = package['_drop_out'] mel = opts.mel beam_width = opts.beam_width lm_alpha = opts.lm_alpha decoder_type = opts.decode_type vocab_file = opts.vocab_file vocab = Vocab(vocab_file) test_dataset = SpeechDataset(vocab, opts.test_scp_path, opts.test_lab_path, opts) test_loader = SpeechDataLoader(test_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers, pin_memory=False) model = CTC_Model(rnn_param=rnn_param, add_cnn=add_cnn, cnn_param=cnn_param, num_class=num_class, drop_out=drop_out) model.to(device) model.load_state_dict(package['state_dict']) model.eval() if decoder_type == 'Greedy': decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0) else: decoder = BeamDecoder(vocab.index2word, beam_width=beam_width, blank_index=0, space_idx=-1, lm_path=opts.lm_path, lm_alpha=opts.lm_alpha) total_wer = 0 total_cer = 0 start = time.time() with torch.no_grad(): for data in test_loader: inputs, input_sizes, targets, target_sizes, utt_list = data inputs = inputs.to(device) #rnput_sizes = input_sizes.to(device) #target = target.to(device) #target_sizes = target_sizes.to(device) probs = model(inputs) max_length = probs.size(0) input_sizes = (input_sizes * max_length).long() probs = probs.cpu() decoded = decoder.decode(probs, input_sizes.numpy().tolist()) targets, target_sizes = targets.numpy(), target_sizes.numpy() labels = [] for i in range(len(targets)): label = [ vocab.index2word[num] for num in targets[i][:target_sizes[i]] ] labels.append(' '.join(label)) for x in range(len(targets)): print("origin : " + labels[x]) print("decoded: " + decoded[x]) cer = 0 wer = 0 for x in range(len(labels)): wer += decoder.wer(decoded[x], labels[x]) decoder.num_word += len(labels[x].split()) total_wer += wer print("total_error:", total_wer) print("total_phoneme:", decoder.num_word) PER = (float(total_wer) / decoder.num_word) * 100 print("Phoneme error rate on test set: %.4f" % PER) end = time.time() time_used = (end - start) / 60.0 print("time used for decode %d sentences: %.4f minutes." % (len(test_dataset), time_used))
predictions.append(beam_decode(tokenized_test[i:i + 1])) for idx, result in enumerate(predictions): if result == '': print(idx) return predictions def submit_proc(sentence): sentence = sentence.lstrip(' ,!。') sentence = sentence.replace(' ', '') if sentence == '': sentence = '随时联系' return sentence if __name__ == '__main__': params = get_params() vocab, reverse_vocab = Vocab() embedding_matrix = load_embedding_matrix() params['mode'] = 'test' predictions = test(params, embedding_matrix, vocab) test_df = pd.read_csv(test_data_file) test_df['Prediction'] = predictions test_df = test_df[['QID', 'Prediction']] test_df['Prediction'] = test_df['Prediction'].apply(submit_proc) test_df.to_csv(os.path.join(root, 'data', 'result_pgn_beam_search.csv'), index=None, sep=',')
def main(conf): opts = Config() for k, v in conf.items(): setattr(opts, k, v) print('{:50}:{}'.format(k, v)) device = torch.device('cuda:0') if opts.use_gpu else torch.device('cpu') torch.manual_seed(opts.seed) np.random.seed(opts.seed) if opts.use_gpu: torch.cuda.manual_seed(opts.seed) #Data Loader vocab = Vocab(opts.vocab_file) train_dataset = SpeechDataset(vocab, opts.train_scp_path, opts.train_lab_path, opts.train_trans_path, opts, True) dev_dataset = SpeechDataset(vocab, opts.valid_scp_path, opts.valid_lab_path, opts.valid_trans_path, opts) train_loader = SpeechDataLoader(train_dataset, batch_size=opts.batch_size, shuffle=opts.shuffle_train, num_workers=opts.num_workers) dev_loader = SpeechDataLoader(dev_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers) #Define Model rnn_type = supported_rnn[opts.rnn_type] rnn_param = { "rnn_input_size": opts.rnn_input_size, "rnn_hidden_size": opts.rnn_hidden_size, "rnn_layers": opts.rnn_layers, "rnn_type": rnn_type, "bidirectional": opts.bidirectional, "batch_norm": opts.batch_norm } num_class = vocab.n_words opts.output_class_dim = vocab.n_words drop_out = opts.drop_out add_cnn = opts.add_cnn cnn_param = {} channel = eval(opts.channel) kernel_size = eval(opts.kernel_size) stride = eval(opts.stride) padding = eval(opts.padding) pooling = eval(opts.pooling) activation_function = supported_activate[opts.activation_function] cnn_param['batch_norm'] = opts.batch_norm cnn_param['activate_function'] = activation_function cnn_param["layer"] = [] for layer in range(opts.layers): layer_param = [ channel[layer], kernel_size[layer], stride[layer], padding[layer] ] if pooling is not None: layer_param.append(pooling[layer]) else: layer_param.append(None) cnn_param["layer"].append(layer_param) model = CTC_Model(add_cnn=add_cnn, cnn_param=cnn_param, rnn_param=rnn_param, num_class=num_class, drop_out=drop_out) model = model.to(device) num_params = 0 for name, param in model.named_parameters(): num_params += param.numel() print("Number of parameters %d" % num_params) for idx, m in enumerate(model.children()): print(idx, m) #Training init_lr = opts.init_lr num_epoches = opts.num_epoches end_adjust_acc = opts.end_adjust_acc decay = opts.lr_decay weight_decay = opts.weight_decay batch_size = opts.batch_size params = { 'num_epoches': num_epoches, 'end_adjust_acc': end_adjust_acc, 'mel': opts.mel, 'seed': opts.seed, 'decay': decay, 'learning_rate': init_lr, 'weight_decay': weight_decay, 'batch_size': batch_size, 'feature_type': opts.feature_type, 'n_feats': opts.feature_dim } print(params) loss_fn = nn.CTCLoss(reduction='sum') optimizer = torch.optim.Adam(model.parameters(), lr=init_lr, weight_decay=weight_decay) #visualization for training from visdom import Visdom viz = Visdom() if add_cnn: title = opts.feature_type + str(opts.feature_dim) + ' CNN_LSTM_CTC' else: title = opts.feature_type + str(opts.feature_dim) + ' LSTM_CTC' viz_opts = [ dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'), dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'), dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch') ] viz_window = [None, None, None] count = 0 learning_rate = init_lr loss_best = 1000 loss_best_true = 1000 adjust_rate_flag = False stop_train = False adjust_time = 0 acc_best = 0 start_time = time.time() loss_results = [] dev_loss_results = [] dev_cer_results = [] while not stop_train: if count >= num_epoches: break count += 1 if adjust_rate_flag: learning_rate *= decay adjust_rate_flag = False for param in optimizer.param_groups: param['lr'] *= decay print("Start training epoch: %d, learning_rate: %.5f" % (count, learning_rate)) train_acc, loss = run_epoch(count, model, train_loader, loss_fn, device, optimizer=optimizer, print_every=opts.verbose_step, is_training=True) loss_results.append(loss) acc, dev_loss = run_epoch(count, model, dev_loader, loss_fn, device, optimizer=None, print_every=opts.verbose_step, is_training=False) print("loss on dev set is %.4f" % dev_loss) dev_loss_results.append(dev_loss) dev_cer_results.append(acc) #adjust learning rate by dev_loss if dev_loss < (loss_best - end_adjust_acc): loss_best = dev_loss loss_best_true = dev_loss adjust_rate_count = 0 model_state = copy.deepcopy(model.state_dict()) op_state = copy.deepcopy(optimizer.state_dict()) elif (dev_loss < loss_best + end_adjust_acc): adjust_rate_count += 1 if dev_loss < loss_best and dev_loss < loss_best_true: loss_best_true = dev_loss model_state = copy.deepcopy(model.state_dict()) op_state = copy.deepcopy(optimizer.state_dict()) else: adjust_rate_count = 10 if acc > acc_best: acc_best = acc best_model_state = copy.deepcopy(model.state_dict()) best_op_state = copy.deepcopy(optimizer.state_dict()) print("adjust_rate_count:" + str(adjust_rate_count)) print('adjust_time:' + str(adjust_time)) if adjust_rate_count == 10: adjust_rate_flag = True adjust_time += 1 adjust_rate_count = 0 if loss_best > loss_best_true: loss_best = loss_best_true model.load_state_dict(model_state) optimizer.load_state_dict(op_state) if adjust_time == 8: stop_train = True time_used = (time.time() - start_time) / 60 print("epoch %d done, cv acc is: %.4f, time_used: %.4f minutes" % (count, acc, time_used)) print('loss_best:', loss_best) #x_axis = range(count) #y_axis = [loss_results[0:count], dev_loss_results[0:count], dev_cer_results[0:count]] #for x in range(len(viz_window)): # if viz_window[x] is None: # viz_window[x] = viz.line(X = np.array(x_axis), Y = np.array(y_axis[x]), opts = viz_opts[x],) # else: # viz.line(X = np.array(x_axis), Y = np.array(y_axis[x]), win = viz_window[x], update = 'replace',) print("End training, best dev loss is: %.4f, acc is: %.4f" % (loss_best, acc_best)) model.load_state_dict(best_model_state) optimizer.load_state_dict(best_op_state) save_dir = os.path.join(opts.checkpoint_dir, opts.exp_name) if not os.path.exists(save_dir): os.makedirs(save_dir) best_path = os.path.join(save_dir, 'ctc_best_model.pkl') params['epoch'] = count torch.save( CTC_Model.save_package(model, optimizer=optimizer, epoch=params, loss_results=loss_results, dev_loss_results=dev_loss_results, dev_cer_results=dev_cer_results), best_path)
print('Saving checkpoint for epoch {} at {}'.format( epoch + 1, ckpt_save_path)) print('Time taken for 1 epoch {} sec\n'.format(time.time() - start)) def train(params, embedding_matrix, vocab): print('Building the model ...') model = PGN(params, embedding_matrix, vocab) print('Creating the checkpoint manager ...') checkpoint = tf.train.Checkpoint(model=model) checkpoint_manager = tf.train.CheckpointManager( checkpoint, 'drive/NLP1/data/checkpoints/training_pgn_checkpoints', max_to_keep=5) checkpoint.restore(checkpoint_manager.latest_checkpoint) if checkpoint_manager.latest_checkpoint: print('Resotred from {}'.format(checkpoint_manager.latest_checkpoint)) else: print('Initializing from scratch ...') print('Start the training process ...') train_model(model, params, vocab, checkpoint_manager) if __name__ == '__main__': params = get_params() vocab = Vocab() embedding_matrix = load_embedding_matrix() train(params, embedding_matrix, vocab)
def test(): args = parser.parse_args() try: conf = yaml.safe_load(open(args.conf, 'r')) except: print("Config file not exist!") sys.exit(1) opts = Config() for k, v in conf.items(): setattr(opts, k, v) print('{:50}:{}'.format(k, v)) use_cuda = opts.use_gpu # use_cuda = False separator = opts.separator if opts.separator else " " device = torch.device('cuda') if use_cuda else torch.device('cpu') model_path = os.path.join(opts.checkpoint_dir, opts.exp_name, 'ctc_best_model.pkl') package = torch.load(model_path, map_location=device) rnn_param = package["rnn_param"] add_cnn = package["add_cnn"] cnn_param = package["cnn_param"] feature_type = package['epoch']['feature_type'] n_feats = package['epoch']['n_feats'] drop_out = package['_drop_out'] mel = opts.mel beam_width = opts.beam_width lm_alpha = opts.lm_alpha decoder_type = opts.decode_type vocab_file = opts.data_file + "/units" if opts.universal: vocab_file = opts.data_file + "/all_units" keywords = [] with open(opts.keyword_path, 'r') as f: for kw in f.readlines(): kw = kw.rstrip("\n") keywords.append(kw) pos_probs = {} neg_probs = {} for kw in keywords: pos_probs[kw] = [] neg_probs[kw] = [] vocab = Vocab(vocab_file) num_class = vocab.n_words test_dataset = SpeechDataset(None, opts.test_scp_path, opts.test_kws_lab_path, opts) test_loader = SpeechDataLoader(test_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers, pin_memory=False) model = CTC_Model(rnn_param=rnn_param, add_cnn=add_cnn, cnn_param=cnn_param, num_class=num_class, drop_out=drop_out) model.to(device) language = opts.data_file.split("/")[1] language_dict = {} with open(opts.language_order) as f: for idx, line in enumerate(f.readlines()): line = line.strip() language_dict[line] = idx language_id = language_dict[language] if opts.from_multi: print("Load from multi") state_dict = package['state_dict'] pretrained_dict = { k: v for k, v in state_dict.items() if k in model.state_dict().keys() } prefix = "fc_list." + str(language_id) language_softmax_dict = { k: v for k, v in state_dict.items() if k.startswith(prefix) } for k, v in language_softmax_dict.items(): new_key = k.replace(prefix, "fc") pretrained_dict[new_key] = v model.load_state_dict(pretrained_dict) else: model.load_state_dict(package['state_dict']) model.eval() if opts.language_one_hot: # add size of one-hot label lid = torch.zeros(len(language_dict.items())) lid[language_id] = 1 ''' Decode Initialize: keywords, blank_index, beam_width inputs: probs, length outputs: probs for each keyword ''' decoder = BeamDecoder(vocab.index2word, beam_width=beam_width, blank_index=0, space_idx=-1, lm_path=opts.lm_path, lm_alpha=opts.lm_alpha) utt_idx = 0 start = time.time() with torch.no_grad(): for data in test_loader: inputs, input_sizes, targets, target_sizes, utt_list = data if opts.language_one_hot: B, T, _ = inputs.shape xx = lid.repeat(B, T, 1) inputs = torch.cat((inputs, xx), dim=-1) inputs = inputs.to(device) #rnput_sizes = input_sizes.to(device) #target = target.to(device) #target_sizes = target_sizes.to(device) probs = model(inputs) max_length = probs.size(0) input_sizes = (input_sizes * max_length).long() probs = probs.cpu() if decoder_type == "soft": decoded = decoder.decode(probs, input_sizes.numpy().tolist(), n_best=True) prob_mat = soft_kwd(decoded, keywords) else: decoded = decoder.decode(probs, input_sizes.numpy().tolist()) # output existence for each keyword prob_mat = exist_kwd(decoded, keywords) # target is a 0-1 matrix targets, target_sizes = targets.numpy(), target_sizes.numpy() for i in range(len(decoded)): for j, kw in enumerate(keywords): if targets[i, j] == 1: pos_probs[kw].append(prob_mat[i, j]) else: neg_probs[kw].append(prob_mat[i, j]) utt_idx += len(decoded) print("Processed {}/{} utterances.".format(utt_idx, len(test_dataset))) expdir = opts.checkpoint_dir + opts.exp_name print("Output to {}".format(expdir)) threshold = 0.5 FPs = {} TPs = {} for item in pos_probs.items(): kw = item[0] probs = item[1] probs = np.array(probs) TPs[kw] = len(probs[probs >= threshold]) with open(expdir + "/" + kw + ".pos", 'w') as f: for prob in probs: f.write(str(prob) + "\n") for item in neg_probs.items(): kw = item[0] probs = item[1] probs = np.array(probs) FPs[kw] = len(probs[probs >= threshold]) with open(expdir + "/" + kw + ".neg", 'w') as f: for prob in probs: f.write(str(prob) + "\n") for kw in keywords: recall = TPs[kw] / (len(pos_probs[kw]) + 1e-8) precision = TPs[kw] / (TPs[kw] + FPs[kw] + 1e-8) print("For keyword {} of threshold {}: Precision {}, Recall {}, F1 {}". format(kw, str(threshold), recall, precision, 2 * (precision * recall) / (precision + recall + 1e-8))) print("kws decode method: {}".format(decoder_type)) end = time.time() time_used = (end - start) / 60.0 print("time used for decode %d sentences: %.4f minutes." % (len(test_dataset), time_used))
def test(): args = parser.parse_args() try: conf = yaml.safe_load(open(args.conf, 'r')) except: print("Config file not exist!") sys.exit(1) opts = Config() for k, v in conf.items(): setattr(opts, k, v) # print('{:50}:{}'.format(k, v)) use_cuda = opts.use_gpu device = torch.device('cuda') if use_cuda else torch.device('cpu') model_path = os.path.join(opts.checkpoint_dir, opts.exp_name, 'ctc_best_model.pkl') package = torch.load(model_path) rnn_param = package["rnn_param"] add_cnn = package["add_cnn"] cnn_param = package["cnn_param"] num_class = package["num_class"] feature_type = package['epoch']['feature_type'] n_feats = package['epoch']['n_feats'] drop_out = package['_drop_out'] mel = opts.mel beam_width = opts.beam_width lm_alpha = opts.lm_alpha decoder_type = opts.decode_type vocab_file = opts.vocab_file vocab = Vocab(vocab_file) test_dataset = SpeechDataset(vocab, opts.pred_scp_path, opts.pred_lab_path, opts) test_loader = SpeechDataLoader(test_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers, pin_memory=False) model = CTC_Model(rnn_param=rnn_param, add_cnn=add_cnn, cnn_param=cnn_param, num_class=num_class, drop_out=drop_out) model.to(device) model.load_state_dict(package['state_dict']) model.eval() if decoder_type == 'Greedy': decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0) else: decoder = BeamDecoder(vocab.index2word, beam_width=beam_width, blank_index=0, space_idx=-1, lm_path=opts.lm_path, lm_alpha=opts.lm_alpha) # total_wer = 0 # total_cer = 0 # start = time.time() with torch.no_grad(): for data in test_loader: inputs, input_sizes, targets, target_sizes, utt_list = data # os.system("cp ../TIMIT/predict/{}.txt ../output/words.txt".format(utt_list[0])) inputs = inputs.to(device) # rnput_sizes = input_sizes.to(device) # target = target.to(device) # target_sizes = target_sizes.to(device) probs = model(inputs) max_length = probs.size(0) input_sizes = (input_sizes * max_length).long() probs = probs.cpu() decoded = decoder.decode(probs, input_sizes.numpy().tolist()) targets, target_sizes = targets.numpy(), target_sizes.numpy() labels = [] for i in range(len(targets)): label = [ vocab.index2word[num] for num in targets[i][:target_sizes[i]] ] labels.append(' '.join(label)) for x in range(len(targets)): with open("../output/original.txt", "a") as writer: writer.write(utt_list[x] + " " + labels[x] + "\n") with open("../output/predicted.txt", "a") as writer: writer.write(utt_list[x] + " " + decoded[x] + "\n")
def test(): args = parser.parse_args() try: conf = yaml.safe_load(open(args.conf, 'r')) except: print("Config file not exist!") sys.exit(1) opts = Config() for k, v in conf.items(): setattr(opts, k, v) print('{:50}:{}'.format(k, v)) use_cuda = opts.use_gpu separator = opts.separator if opts.separator else " " device = torch.device('cuda') if use_cuda else torch.device('cpu') model_path = os.path.join(opts.checkpoint_dir, opts.exp_name, 'ctc_best_model.pkl') package = torch.load(model_path) rnn_param = package["rnn_param"] add_cnn = package["add_cnn"] cnn_param = package["cnn_param"] feature_type = package['epoch']['feature_type'] n_feats = package['epoch']['n_feats'] drop_out = package['_drop_out'] mel = opts.mel beam_width = opts.beam_width lm_alpha = opts.lm_alpha decoder_type = opts.decode_type vocab_file = opts.data_file + "/units" if opts.universal: vocab_file = opts.data_file + "/all_units" vocab = Vocab(vocab_file) num_class = vocab.n_words test_dataset = SpeechDataset(vocab, opts.test_scp_path, opts.test_lab_path, opts) test_loader = SpeechDataLoader(test_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers, pin_memory=False) model = CTC_Model(rnn_param=rnn_param, add_cnn=add_cnn, cnn_param=cnn_param, num_class=num_class, drop_out=drop_out) model.to(device) language = opts.data_file.split("/")[1] language_dict = {} with open(opts.language_order) as f: for idx, line in enumerate(f.readlines()): line = line.strip() language_dict[line] = idx language_id = language_dict[language] if opts.from_multi: print("Load from multi") state_dict = package['state_dict'] pretrained_dict = { k: v for k, v in state_dict.items() if k in model.state_dict().keys() } prefix = "fc_list." + str(language_id) language_softmax_dict = { k: v for k, v in state_dict.items() if k.startswith(prefix) } for k, v in language_softmax_dict.items(): new_key = k.replace(prefix, "fc") pretrained_dict[new_key] = v model.load_state_dict(pretrained_dict) else: model.load_state_dict(package['state_dict']) model.eval() if opts.language_one_hot: # add size of one-hot label lid = torch.zeros(len(language_dict.items())) lid[language_id] = 1 if decoder_type == 'Greedy': decoder = GreedyDecoder(vocab.index2word, space_idx=-1, blank_index=0) else: decoder = BeamDecoder(vocab.index2word, beam_width=beam_width, blank_index=0, space_idx=-1, lm_path=opts.lm_path, lm_alpha=opts.lm_alpha) total_wer = 0 total_cer = 0 start = time.time() with torch.no_grad(): for data in test_loader: inputs, input_sizes, targets, target_sizes, utt_list = data if opts.language_one_hot: B, T, _ = inputs.shape xx = lid.repeat(B, T, 1) inputs = torch.cat((inputs, xx), dim=-1) inputs = inputs.to(device) #rnput_sizes = input_sizes.to(device) #target = target.to(device) #target_sizes = target_sizes.to(device) probs = model(inputs) max_length = probs.size(0) input_sizes = (input_sizes * max_length).long() probs = probs.cpu() decoded = decoder.decode(probs, input_sizes.numpy().tolist()) targets, target_sizes = targets.numpy(), target_sizes.numpy() labels = [] for i in range(len(targets)): label = [ vocab.index2word[num] for num in targets[i][:target_sizes[i]] ] labels.append(' '.join(label)) for x in range(len(targets)): print("origin : " + labels[x]) print("decoded: " + decoded[x]) cer = 0 wer = 0 for x in range(len(labels)): cer += decoder.cer(decoded[x], labels[x]) wer += decoder.wer(decoded[x], labels[x], separator) decoder.num_word += len(labels[x].split(separator)) decoder.num_char += len(labels[x]) total_cer += cer total_wer += wer CER = (float(total_cer) / decoder.num_char) * 100 WER = (float(total_wer) / decoder.num_word) * 100 print("Character error rate on test set: %.4f" % CER) print("Word error rate on test set: %.4f" % WER) end = time.time() time_used = (end - start) / 60.0 print("time used for decode %d sentences: %.4f minutes." % (len(test_dataset), time_used))
def main(conf): opts = Config() for k, v in conf.items(): setattr(opts, k, v) print('{:50}:{}'.format(k, v)) device = torch.device('cuda') if opts.use_gpu and torch.cuda.is_available( ) else torch.device('cpu') torch.manual_seed(opts.seed) np.random.seed(opts.seed) if opts.use_gpu: torch.cuda.manual_seed(opts.seed) datasets = os.listdir(opts.data_file) for idx, dataset in enumerate(datasets): datasets[idx] = opts.data_file + "/" + dataset train_scp = "/train/feats.scp" train_lab = "/train/lab.txt" valid_scp = "/dev/feats.scp" valid_lab = "/dev/lab.txt" vocab_f = "/units" if opts.universal: vocab_f = "/all_units" semi = False semi_loader = None #Data Loader vocab = [Vocab(dataset + vocab_f) for dataset in datasets] train_dataset = [ SpeechDataset(voc, dataset + train_scp, dataset + train_lab, opts) for dataset, voc in zip(datasets, vocab) ] dev_dataset = [ SpeechDataset(voc, dataset + valid_scp, dataset + valid_lab, opts) for dataset, voc in zip(datasets, vocab) ] train_loader = [ SpeechDataLoader(dataset, batch_size=opts.batch_size, shuffle=opts.shuffle_train, num_workers=opts.num_workers) for dataset in train_dataset ] dev_loader = [ SpeechDataLoader(dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers) for dataset in dev_dataset ] if opts.semi: semi = True semi_scp = "/train/feats_nolabel.scp" semi_train_dataset = [ UnlabelSpeechDataset(dataset + semi_scp, opts) for dataset in datasets ] semi_loader = [ UnlabelSpeechDataLoader(dataset, batch_size=opts.batch_size, shuffle=opts.shuffle_train, num_workers=opts.num_workers) for dataset in semi_train_dataset ] if opts.language_one_hot: # add size of one-hot label opts.rnn_input_size = opts.rnn_input_size + len(train_dataset) #Define Model rnn_type = supported_rnn[opts.rnn_type] rnn_param = { "rnn_input_size": opts.rnn_input_size, "rnn_hidden_size": opts.rnn_hidden_size, "rnn_layers": opts.rnn_layers, "rnn_type": rnn_type, "bidirectional": opts.bidirectional, "batch_norm": opts.batch_norm } num_class = [voc.n_words for voc in vocab] # opts.output_class_dim = vocab.n_words drop_out = opts.drop_out add_cnn = opts.add_cnn cnn_param = {} channel = eval(opts.channel) kernel_size = eval(opts.kernel_size) stride = eval(opts.stride) padding = eval(opts.padding) pooling = eval(opts.pooling) activation_function = supported_activate[opts.activation_function] cnn_param['batch_norm'] = opts.batch_norm cnn_param['activate_function'] = activation_function cnn_param["layer"] = [] for layer in range(opts.layers): layer_param = [ channel[layer], kernel_size[layer], stride[layer], padding[layer] ] if pooling is not None: layer_param.append(pooling[layer]) else: layer_param.append(None) cnn_param["layer"].append(layer_param) # Domain Adversarial Training if opts.dat_lambda != 0: dat = True else: dat = False if opts.mme_lambda != 0: mme = True else: mme = False model = Multi_CTC_Model(add_cnn=add_cnn, cnn_param=cnn_param, rnn_param=rnn_param, num_class=num_class, drop_out=drop_out, dat=opts.dat_lambda, mme=opts.mme_lambda, universal=opts.universal) model = model.to(device) num_params = 0 for name, param in model.named_parameters(): num_params += param.numel() print("Number of parameters %d" % num_params) for idx, m in enumerate(model.children()): print(idx, m) if opts.resume != '': print("Load ckp from {}".format(opts.resume)) package = torch.load(opts.resume) state_dict = package['state_dict'] pretrained_dict = { k: v for k, v in state_dict.items() if k in model.state_dict().keys() } model_dict = model.state_dict() model_dict.update(pretrained_dict) model.load_state_dict(model_dict) #Training init_lr = opts.init_lr num_epoches = opts.num_epoches end_adjust_acc = opts.end_adjust_acc decay = opts.lr_decay weight_decay = opts.weight_decay batch_size = opts.batch_size params = { 'num_epoches': num_epoches, 'end_adjust_acc': end_adjust_acc, 'mel': opts.mel, 'seed': opts.seed, 'decay': decay, 'learning_rate': init_lr, 'weight_decay': weight_decay, 'batch_size': batch_size, 'feature_type': opts.feature_type, 'n_feats': opts.feature_dim } print(params) loss_fn = nn.CTCLoss(reduction='sum', zero_infinity=True) optimizer = torch.optim.Adam(model.parameters(), lr=init_lr, weight_decay=weight_decay) #visualization for training count = 0 learning_rate = init_lr loss_best = 1e6 loss_best_true = 1e6 adjust_rate_flag = False stop_train = False adjust_time = 0 acc_best = 0 start_time = time.time() loss_results = [] dev_loss_results = [] dev_cer_results = [] advT = AT(opts) while not stop_train: if count >= num_epoches: break count += 1 advT.step() if adjust_rate_flag: learning_rate *= decay adjust_rate_flag = False for param in optimizer.param_groups: param['lr'] *= decay print("Start training epoch: %d, learning_rate: %.5f" % (count, learning_rate)) train_acc, loss = run_epoch(count, model, train_loader, loss_fn, device, opts, semi_loader, optimizer=optimizer, print_every=opts.verbose_step, is_training=True, advT=advT) loss_results.append(loss) acc, dev_loss = run_epoch(count, model, dev_loader, loss_fn, device, opts, optimizer=None, print_every=opts.verbose_step, is_training=False, advT=None) print("loss on dev set is %.4f" % dev_loss) dev_loss_results.append(dev_loss) dev_cer_results.append(acc) #adjust learning rate by dev_loss if dev_loss < (loss_best - end_adjust_acc): loss_best = dev_loss loss_best_true = dev_loss adjust_rate_count = 0 model_state = copy.deepcopy(model.state_dict()) op_state = copy.deepcopy(optimizer.state_dict()) elif (dev_loss < loss_best + end_adjust_acc): adjust_rate_count += 1 if dev_loss < loss_best and dev_loss < loss_best_true: loss_best_true = dev_loss model_state = copy.deepcopy(model.state_dict()) op_state = copy.deepcopy(optimizer.state_dict()) else: adjust_rate_count = 10 if acc > acc_best: acc_best = acc best_model_state = copy.deepcopy(model.state_dict()) best_op_state = copy.deepcopy(optimizer.state_dict()) print("adjust_rate_count:" + str(adjust_rate_count)) print('adjust_time:' + str(adjust_time)) if adjust_rate_count == 10: adjust_rate_flag = True adjust_time += 1 adjust_rate_count = 0 if loss_best > loss_best_true: loss_best = loss_best_true model.load_state_dict(model_state) optimizer.load_state_dict(op_state) if adjust_time == 8: stop_train = True time_used = (time.time() - start_time) / 60 print("epoch %d done, cv acc is: %.4f, time_used: %.4f minutes" % (count, acc, time_used)) x_axis = range(count) y_axis = [ loss_results[0:count], dev_loss_results[0:count], dev_cer_results[0:count] ] print("End training, best dev loss is: %.4f, acc is: %.4f" % (loss_best, acc_best)) model.load_state_dict(best_model_state) optimizer.load_state_dict(best_op_state) save_dir = os.path.join(opts.checkpoint_dir, opts.exp_name) if not os.path.exists(save_dir): os.makedirs(save_dir) best_path = os.path.join(save_dir, 'ctc_best_model.pkl') params['epoch'] = count torch.save( CTC_Model.save_package(model, optimizer=optimizer, epoch=params, loss_results=loss_results, dev_loss_results=dev_loss_results, dev_cer_results=dev_cer_results), best_path)