def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] #data preprocessing for Qs and As. spacy_en = spacy.load('en') def tokenizer(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, init_token='<s>', eos_token='</s>') analogies_datafields = [("abc", TEXT), ("d", TEXT)] train, val, test = TabularDataset.splits( path="data", # the root directory where the data lies train='ngram_train.csv', validation="ngram_val.csv", test='ngram_test.csv', format='csv', skip_header=False, # if your csv file has a header row, set skip_header=True so it doesn't get processed as data! fields=analogies_datafields) pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt') TEXT.build_vocab( vectors=pretrained_vecs) # specials=['<pad>', '<s>', '</s>'] if args['--cuda'] == 'cpu': torch_text_device = -1 else: torch_text_device = 0 training_iter, val_iter, test_iter = Iterator.splits( (train, val, test), sort_key=lambda x: len(x.abc), batch_sizes=(100, 20, 1), device=torch_text_device, sort_within_batch=True) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=TEXT.vocab) model.train() #sets training = True uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') writer = SummaryWriter('logs') is_better_count = 0 #TODO: Remove this and debug the nonstopping part while True: epoch += 1 for _, data in enumerate(training_iter): (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d train_iter += 1 optimizer.zero_grad() batch_size = src_sents.shape[1] example_losses = model(src_sents, src_lengths, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict
cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) writer.add_scalar('Train/AvgLoss', report_loss / report_examples, epoch) writer.add_scalar('Train/AvgPPL', math.exp(report_loss / report_tgt_words), epoch) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu dev_ppl, val_loss = evaluate_ppl( model, val_iter) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f, dev loss %f' % (train_iter, dev_ppl, val_loss), file=sys.stderr) writer.add_scalar('Val/AvgPPL', dev_ppl, epoch) writer.add_scalar('Val/AvgLoss', val_loss, epoch) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) print(hist_valid_scores) print(valid_metric) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) is_better_count = is_better_count + 1 print(is_better_count) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') if is_better_count > 3: print('reached maximum number of epochs!', file=sys.stderr) writer.close() exit(0) elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float( args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) writer.close() exit(0)
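# --- Example: TensorBoard scalar logging (sketch) ---
# A minimal, self-contained sketch of the SummaryWriter logging used in the variant above,
# assuming torch.utils.tensorboard (the original's import is not shown and may be tensorboardX
# instead). The running totals below are fabricated for illustration. Note that the original
# passes `epoch` as the global step, so several reports within one epoch land on the same
# x-value; `train_iter` may be the intended step.
import math
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs')
report_loss, report_examples, report_tgt_words = 812.0, 100, 950  # fabricated running totals
epoch = 1

writer.add_scalar('Train/AvgLoss', report_loss / report_examples, epoch)
writer.add_scalar('Train/AvgPPL', math.exp(report_loss / report_tgt_words), epoch)
writer.close()  # view with: tensorboard --logdir logs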
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] vocab = Vocab.load(args['--vocab']) existing_model = args['--existing-model-path'] start_from_existing_model = existing_model and os.path.isfile( existing_model) if start_from_existing_model: print("load model from {}".format(existing_model), file=sys.stderr) model = NMT.load(existing_model, no_char_decoder=args['--no-char-decoder']) else: print("Create a new model from hyper parameters") model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=vocab, no_char_decoder=args['--no-char-decoder']) model.train() print_model_param_count(model) # TODO: How to print all the parameters of this model? And is it useful? if not start_from_existing_model: uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') avg_train_ppls = [] avg_valid_ppls = [] # output_file_path = 'outputs/loss_%s' % datetime.datetime.now().strftime("%m-%d-%Y-%I:%M%p") output_file_path = os.path.join( args['--ppl-save-dir'], 'ppl.json') if args['--ppl-save-dir'] else 'ppl.json' while True: epoch += 1 for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True): train_iter += 1 optimizer.zero_grad() batch_size = len(src_sents) example_losses = -model(src_sents, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. 
examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, np.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) avg_train_ppls.append(np.exp(report_loss / report_tgt_words)) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_niter == 0: # The printed values are the train loss print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu dev_ppl = evaluate_ppl( model, dev_data, batch_size=128) # dev batch size can be a bit larger avg_valid_ppls.append(dev_ppl) valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) output_losses(args, log_every, valid_niter, avg_train_ppls, avg_valid_ppls, output_file_path) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float( args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # TODO: len(optimizer.param_groups) == 1 ? Or the below code seems odd # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) output_losses(args, log_every, valid_niter, avg_train_ppls, avg_valid_ppls, output_file_path) exit(0) output_losses(args, log_every, valid_niter, avg_train_ppls, avg_valid_ppls, output_file_path) if args['--is-google-colab'] and epoch % 2 == 0 and os.path.isfile( model_save_path): shutil.copy(model_save_path, args['--ppl-save-dir']) shutil.copy(model_save_path + '.optim', args['--ppl-save-dir']) print("copied model files to google drive!")
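# --- Example: saving and resuming model + optimizer state (sketch) ---
# The variant above can resume from an existing checkpoint and always stores the optimizer
# state next to the model. A minimal sketch of that pattern with a stand-in nn.Linear model;
# the NMT-specific model.save()/NMT.load() helpers are assumed and not reproduced here.
import torch
import torch.nn as nn

model = nn.Linear(8, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# save weights plus optimizer state (mirrors model.save(path) + torch.save(optimizer.state_dict(), path + '.optim'))
torch.save({'state_dict': model.state_dict()}, 'model.bin')
torch.save(optimizer.state_dict(), 'model.bin.optim')

# restore on CPU regardless of where the checkpoint was written, then continue training
params = torch.load('model.bin', map_location=lambda storage, loc: storage)
model.load_state_dict(params['state_dict'])
optimizer.load_state_dict(torch.load('model.bin.optim'))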
def train(): text = Text(config.src_corpus, config.tar_corpus) train_data = Data(config.train_path_src, config.train_path_tar) dev_data = Data(config.dev_path_src, config.dev_path_tar) train_loader = DataLoader(dataset=train_data, batch_size=config.batch_size, shuffle=True, collate_fn=utils.get_batch) dev_loader = DataLoader(dataset=dev_data, batch_size=config.dev_batch_size, shuffle=True, collate_fn=utils.get_batch) parser = OptionParser() parser.add_option("--embed_size", dest="embed_size", default=config.embed_size) parser.add_option("--hidden_size", dest="hidden_size", default=config.hidden_size) parser.add_option("--window_size_d", dest="window_size_d", default=config.window_size_d) parser.add_option("--encoder_layer", dest="encoder_layer", default=config.encoder_layer) parser.add_option("--decoder_layers", dest="decoder_layers", default=config.decoder_layers) parser.add_option("--dropout_rate", dest="dropout_rate", default=config.dropout_rate) (options, args) = parser.parse_args() device = torch.device("cuda:0" if config.cuda else "cpu") #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/01.31_drop0.3_54_21.46508598886769_checkpoint.pth" #print(f"load model from {model_path}", file=sys.stderr) #model = NMT.load(model_path) model = NMT(text, options, device) #model = model.cuda() #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_attention/result/140_164.29781984744628_checkpoint.pth" #print(f"load model from {model_path}", file=sys.stderr) #model = NMT.load(model_path) #model = torch.nn.DataParallel(model) model = model.to(device) model = model.cuda() model.train() optimizer = Optim(torch.optim.Adam(model.parameters())) #optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.hidden_size, config.warm_up_step) #print(optimizer.lr) epoch = 0 valid_num = 1 hist_valid_ppl = [] print("begin training!") while (True): epoch += 1 max_iter = int(math.ceil(len(train_data) / config.batch_size)) with tqdm(total=max_iter, desc="train") as pbar: for src_sents, tar_sents, tar_words_num_to_predict in train_loader: optimizer.zero_grad() batch_size = len(src_sents) now_loss = -model(src_sents, tar_sents) now_loss = now_loss.sum() loss = now_loss / batch_size loss.backward() _ = torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip_grad) #optimizer.updata_lr() optimizer.step_and_updata_lr() pbar.set_postfix({ "epoch": epoch, "avg_loss": loss.item(), "ppl": math.exp(now_loss.item() / tar_words_num_to_predict), "lr": optimizer.lr }) #pbar.set_postfix({"epoch": epoch, "avg_loss": loss.item(), "ppl": math.exp(now_loss.item()/tar_words_num_to_predict)}) pbar.update(1) #print(optimizer.lr) if (epoch % config.valid_iter == 0): #if (epoch >= config.valid_iter//2): if (valid_num % 5 == 0): valid_num = 0 optimizer.updata_lr() valid_num += 1 print("now begin validation ...", file=sys.stderr) eav_ppl = evaluate_ppl(model, dev_data, dev_loader) print("validation ppl %.2f" % (eav_ppl), file=sys.stderr) flag = len(hist_valid_ppl) == 0 or eav_ppl < min(hist_valid_ppl) if (flag): print("current model is the best! save to [%s]" % (config.model_save_path), file=sys.stderr) hist_valid_ppl.append(eav_ppl) model.save( os.path.join( config.model_save_path, f"02.08_window35drop0.2_{epoch}_{eav_ppl}_checkpoint.pth" )) torch.save( optimizer.optimizer.state_dict(), os.path.join( config.model_save_path, f"02.08_window35drop0.2_{epoch}_{eav_ppl}_optimizer.optim" )) if (epoch == config.max_epoch): print("reach the maximum number of epochs!", 
file=sys.stderr) return
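# --- Example: tqdm progress reporting (sketch) ---
# Sketch of the per-epoch tqdm bar used above: the postfix shows the running average loss and
# perplexity for each batch. Dummy (summed loss, target-word-count) pairs stand in for the
# real DataLoader.
import math
from tqdm import tqdm

dummy_batches = [(46.0, 20), (42.5, 19), (39.8, 21)]  # (summed batch loss, target words) per batch
batch_size = 2

with tqdm(total=len(dummy_batches), desc="train") as pbar:
    for batch_loss, tgt_words in dummy_batches:
        pbar.set_postfix({"avg_loss": batch_loss / batch_size,
                          "ppl": math.exp(batch_loss / tgt_words)})
        pbar.update(1)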
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') # print("train data") # print(train_data_src) # print(len(train_data_tgt)) dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') #print("dev data") #print(dev_data_src) #rint(len(dev_data_tgt)) train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) #print(train_data) #print(dev_data) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] vocab = Vocab.load(args['--vocab']) # print("vocab") # print(vocab.src) # print(vocab.tgt) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=vocab, no_char_decoder=args['--no-char-decoder']) model.train() uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') while True: epoch += 1 for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True): train_iter += 1 #print(src_sents) #print(src_sents.shape) #print(tgt_sents) #print(tgt_sents.shape) optimizer.zero_grad() batch_size = len(src_sents) example_losses = -model(src_sents, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_niter == 0: print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. 
ppl and bleu dev_ppl = evaluate_ppl(model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay']) print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load(model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict(torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
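# --- Example: the core update step with gradient clipping (sketch) ---
# Every variant in this section shares the same update order: zero_grad -> per-example losses
# -> mean loss -> backward -> clip_grad_norm_ -> step. Shown here on a stand-in regression
# model so the snippet runs on its own; clip_grad mirrors the --clip-grad argument above.
import torch
import torch.nn as nn

model = nn.Linear(16, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
clip_grad = 5.0

x, y = torch.randn(32, 16), torch.randn(32, 1)

optimizer.zero_grad()
example_losses = (model(x) - y).pow(2).sum(dim=1)     # (batch_size,), like -model(src, tgt)
loss = example_losses.sum() / example_losses.size(0)  # average loss per example
loss.backward()
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)  # rescales gradients in place
optimizer.step()
print('total grad norm before clipping: %.3f' % grad_norm)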
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ sample_rate = 22000 resample_rate = 8000 train_records = 8 max_epoch = 10 vocab = Vocab.load('dataset/vocab_full.json') # train_voices_files, corpus = get_voice_files_and_corpus('dataset/train/wavs', train_records) # voices = load_voices_files(train_voices_files, sample_rate, resample_rate) # train_data = list(zip(voices, corpus)) dev_files, dev_corpus = get_voice_files_and_corpus('dataset/dev', 2) dev_data = list( zip(load_voices_files(dev_files, sample_rate, resample_rate), dev_corpus)) epoch_size = 4 train_batch_size = 2 clip_grad = 5.0 valid_niter = 100 log_every = 10 model_save_path = 'model.bin' model = NMT(embed_size=1024, hidden_size=2048, vocab=vocab) model.train() vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 # device = torch.device("cuda:0") device = torch.device("cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.001) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') data_queue = Queue() batch_queue = Queue(1) loss_queue = Queue(2) train_data_to_queue_process = Process(target=load_train_data, args=('dataset/train', train_records, epoch_size, data_queue)) train_data_to_queue_process.start() batch_iter_to_queue_process = Process(target=batch_iter_to_queue2, args=(data_queue, batch_queue, loss_queue, max_epoch, train_batch_size, True)) batch_iter_to_queue_process.start() epoch, voices, tgt_sents = batch_queue.get(True) current_epoch = -1 while voices is not None and tgt_sents is not None: train_iter += 1 optimizer.zero_grad() # voices = load_voices_files(voice_files, sample_rate, resample_rate) # voices = voice_files batch_size = len(voices) example_losses = -model(voices, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size loss_queue.put(report_loss / report_examples) if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. 
ppl and bleu dev_ppl = evaluate_ppl( model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < 10: patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == 5: num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == 3: print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * 0.5 print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 epoch, voices, tgt_sents = batch_queue.get() batch_iter_to_queue_process.join() train_data_to_queue_process.join()
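# --- Example: streaming batches through a multiprocessing Queue (sketch) ---
# The variant above feeds the training loop from producer processes via a Queue and stops on a
# sentinel of None values. A toy version of that pattern; the real load_train_data /
# batch_iter_to_queue2 helpers are assumed, and the "voices" here are placeholder strings.
from multiprocessing import Process, Queue

def producer(batch_queue, n_batches):
    for i in range(n_batches):
        batch_queue.put((1, ['fake_voice_%d' % i], [['<s>', 'hello', '</s>']]))
    batch_queue.put((None, None, None))  # sentinel: no more batches

if __name__ == '__main__':
    batch_queue = Queue(2)
    p = Process(target=producer, args=(batch_queue, 5))
    p.start()
    epoch, voices, tgt_sents = batch_queue.get(True)
    while voices is not None and tgt_sents is not None:
        print('epoch %s: got a batch of %d utterances' % (epoch, len(voices)))
        epoch, voices, tgt_sents = batch_queue.get(True)
    p.join()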
def train(args): data_path = args['--data_path'] model_path = args['--model_path'] use_word2vec = args['--use_word2vec'] word2vec_fpath = args['--word2vec_fpath'] loss_info_interval = int(args['--loss_info_interval']) dev_info_interval = int(args['--dev_info_interval']) max_epoch = int(args['--max_epoch']) num_workers = int(args['--num_workers']) corpus_limit = None if args['--corpus_limit']=='None' else int(args['--corpus_limit']) vocab_size = int(args['--vocab_size']) freq_cutoff = int(args['--freq_cutoff']) batch_size = int(args['--batch_size']) hidden_size = int(args['--hidden_size']) atten_size = int(args['--atten_size']) dropout_rate = float(args['--dropout_rate']) embed_size = int(args['--embed_size']) lr = float(args['--lr']) lr_decay = float(args['--lr_decay']) grad_clip = float(args['--grad_clip']) patience = int(args['--patience']) trial = int(args['--trial']) # create vocabulary print("create source and target vocabulary...") es_corpus = load_corpus(data_path+'/train.es', 'es', corpus_limit) vocab_src = VocabEntry.build(es_corpus, vocab_size, freq_cutoff) en_corpus = load_corpus(data_path+'/train.en', 'en', corpus_limit) vocab_tgt = VocabEntry.build(en_corpus, vocab_size, freq_cutoff) vocabulary = Vocab(vocab_src, vocab_tgt) vocabulary.save() # vocabulary = Vocab.load() # create dataloader print("create dataloader...") train_dl = generate_dl(data_path, 'train', vocabulary, batch_size, num_workers, corpus_limit) dev_dl = generate_dl(data_path, 'dev', vocabulary, batch_size, num_workers, corpus_limit) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # load pretrained word2vec print("load pretrained word2vec...") if use_word2vec: word2vec_src = load_word2vec(word2vec_fpath+'/cc.es.300.vec', vocabulary.src, device) word2vec_tgt = load_word2vec(word2vec_fpath+'/cc.en.300.vec', vocabulary.tgt, device) word2vec = (word2vec_src, word2vec_tgt) print("size: src -- {} tgt -- {}".format(word2vec_src.size(), word2vec_tgt.size())) else: word2vec = None # create model and optimizer model = NMT(hidden_size, atten_size, vocabulary, dropout_rate, word2vec, embed_size) model.init() # model = NMT.load(model_path, vocabulary) model.to(device) if torch.cuda.device_count() > 1: print("- - - - - - - - >%d GPUs are being used now!!!"%torch.cuda.device_count()) model_parallel = nn.DataParallel(model) opt = torch.optim.Adam(model.parameters(), lr, betas=(0.9, 0.99)) # opt.load_state_dict(torch.load(model_path+'/model.bin.optim', map_location=lambda storage, loc: storage)) print("now we start to train this model ...... 
batch_size %d"%batch_size) start_time = train_time = time.time() epoch = iter_cum = example_cum = loss_cum = words_cum = sents_cum = 0 n_patience = n_trial = 0 hist_best_score = float('inf') while True: epoch += 1 if epoch>max_epoch: print("reached maximum number of epoches......") exit(0) for source, src_len, target, tgt_len in train_dl: model_parallel.train() # print("input size to model {}, {}, {}, {}".format(source.size(), src_len.size(), target.size(), tgt_len.size())) loss = model_parallel(source.to(device), src_len.to(device), target.to(device), tgt_len.to(device)) # print("output loss size: {}".format(loss.size())) loss = loss.sum() loss_avg = loss / len(tgt_len) loss_avg.backward() # gradient clipping torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) opt.step() opt.zero_grad() iter_cum += 1 example_cum += len(tgt_len) loss_cum += loss.item() words_cum += (tgt_len-1).sum().item() sents_cum += len(tgt_len) if iter_cum % loss_info_interval == 0: pre_time = time.time() print("epoch: %d, iter: %d, cum. example: %d, avg. loss: %.2f, avg. ppl: %.2f, speed: %.2fwords/sec, time_eclapsed: %d sec"% (epoch, iter_cum, example_cum, loss_cum/sents_cum, math.exp(loss_cum/words_cum), words_cum/(pre_time-train_time), pre_time-start_time)) train_time = time.time() loss_cum = words_cum = sents_cum = 0 if iter_cum % dev_info_interval == 0: print("validation begin ......") model_parallel.eval() with torch.no_grad(): loss_dev = words_dev = sents_dev = 0 for source, src_len, target, tgt_len in dev_dl: loss = model_parallel(source.to(device), src_len.to(device), target.to(device), tgt_len.to(device)) loss = loss.sum() loss_dev += loss.item() words_dev += (tgt_len-1).sum().item() sents_dev += len(tgt_len) print("avg. loss: %.2f, avg. ppl: %.2f"%(loss_dev/sents_dev, math.exp(loss_dev/words_dev))) # compare performance with history is_better = hist_best_score > (loss_dev/sents_dev) if is_better: print("model improved, saved to %s ......"%model_path) n_patience = 0 hist_best_score = loss_dev/sents_dev model.save(model_path) torch.save(opt.state_dict(), model_path+'/model.bin.optim') else: n_patience += 1 print("hit # %d patience" % n_patience) print("decay learning rate ......") lr = opt.param_groups[0]['lr'] * lr_decay if n_patience > patience: n_trial += 1 print("hit # %d trial" % n_trial) if n_trial > trial: print("early stop!") exit(0) n_patience = 0 print("load previous best model") params = torch.load(model_path+'/model.bin', map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model.to(device) if torch.cuda.device_count() > 1: model_parallel = nn.DataParallel(model) opt.load_state_dict(torch.load(model_path+'/model.bin.optim', map_location=lambda storage, loc: storage)) for param_group in opt.param_groups: param_group['lr'] = lr
def train(model_config, data_config, output_path, device, epoch_size, max_epoch, batch_size, repeats, decade_rate, clip_grad, log_every, valid_every, learning_rate=0.0005): print('use device: %s' % device, file=sys.stderr) vocab = Vocab.load(data_config["vacab_file"]) model = NMT(vocab=vocab, **model_config) model = model.to(torch.device(device)) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) data_config.pop("vacab_file", None) data_loader = DataLoader(**data_config) batch_queue, loss_queue = data_loader.load_train_data( epoch_size, max_epoch, batch_size, repeats, decade_rate) dev_data = data_loader.load_dev_data() hist_valid_scores = [] train_losses = [] train_iter = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 if os.path.isfile(output_path + '/speech-to-text.model'): print('loading saved model...') params = torch.load(output_path + '/speech-to-text.model', map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) print('restoring parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(output_path + '/speech-to-text.optim')) dev_ppl = evaluate_ppl( model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl hist_valid_scores.append(valid_metric) print("saved model ppl: ", dev_ppl) model.train() train_time = begin_time = time.time() epoch, voices, tgt_sents = batch_queue.get(True) while voices is not None and tgt_sents is not None: train_iter += 1 optimizer.zero_grad() # print("received voices:", len(voices)) # print("tgt_sents[0]:", len(tgt_sents[0]), tgt_sents[0]) # print("tgt_sents[1]:", len(tgt_sents[1]), tgt_sents[1]) optimizer.zero_grad() batch_size = len(voices) sample_losses = -model(voices, tgt_sents) batch_loss = sample_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size loss_queue.put(report_loss / report_examples) train_losses.append({ 'epoch': epoch, 'iter': train_iter, 'loss': report_loss / report_examples, 'ppl': math.exp(report_loss / report_tgt_words), 'cum': cum_examples, 'speed': report_tgt_words / (time.time() - train_time) }) if train_iter % log_every == 0: print( 'epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_every == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. 
ppl and bleu dev_ppl = evaluate_ppl( model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % output_path, file=sys.stderr) model.save(output_path + '/speech-to-text.model') torch.save(optimizer.state_dict(), output_path + '/speech-to-text.optim') epoch, voices, tgt_sents = batch_queue.get(True)
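# --- Example: a plausible evaluate_ppl helper (sketch) ---
# evaluate_ppl(model, dev_data, batch_size=128) is called throughout this section but never
# defined in it. A minimal version under the same assumptions the loops above make: the model
# returns per-example log-likelihoods (so the loss is their negative sum) and dev_data is a
# list of (src_sent, tgt_sent) pairs. The real helper may differ in details.
import numpy as np
import torch

def iter_batches(data, batch_size):
    # simple, non-shuffling stand-in for the batch_iter helper used in the training loops
    for i in range(0, len(data), batch_size):
        chunk = data[i:i + batch_size]
        yield [e[0] for e in chunk], [e[1] for e in chunk]

def evaluate_ppl(model, dev_data, batch_size=128):
    was_training = model.training
    model.eval()
    cum_loss, cum_tgt_words = 0.0, 0
    with torch.no_grad():
        for src_sents, tgt_sents in iter_batches(dev_data, batch_size):
            cum_loss += -model(src_sents, tgt_sents).sum().item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omit the leading <s>
    if was_training:
        model.train()
    return np.exp(cum_loss / cum_tgt_words)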
def train(args): """ Train the NMT Model. """ train_data_src = read_corpus(args.train_src, source='src') train_data_tgt = read_corpus(args.train_tgt, source='tgt') dev_data_src = read_corpus(args.dev_src, source='src') dev_data_tgt = read_corpus(args.dev_tgt, source='tgt') train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) vocab = Vocab.load(args.vocab_file) model = NMT(embed_size=args.embed_size, hidden_size=args.hidden_size, dropout_rate=args.dropout, vocab=vocab) model.train() if np.abs(args.uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (args.uniform_init, args.uniform_init)) for p in model.parameters(): p.data.uniform_(-args.uniform_init, args.uniform_init) vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args.cuda else "cpu") print('use device: %s' % device) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') while True: epoch += 1 batch_num = math.ceil(len(train_data) / args.batch_size) current_iter = 0 for src_sents, tgt_sents in batch_iter(train_data, batch_size=args.batch_size, shuffle=True): current_iter += 1 train_iter += 1 optimizer.zero_grad() batch_size = len(src_sents) example_losses = -model(src_sents, tgt_sents) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val # omitting leading `<s>` tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents) report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % args.log_every == 0: print('epoch %d (%d / %d), iter %d, avg. loss %.2f, avg. ppl %.2f ' 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, current_iter, batch_num, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time)) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % args.valid_niter == 0: print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples)) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...') # compute dev. ppl and bleu dev_ppl = evaluate_ppl(model, dev_data, batch_size=128) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. 
ppl %f' % (train_iter, dev_ppl)) is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('epoch %d, iter %d: save currently the best model to [%s]' % (epoch, train_iter, args.model_path)) model.save(args.model_path) torch.save(optimizer.state_dict(), args.model_path + '.optim') elif patience < args.patience: patience += 1 print('hit patience %d' % patience) if patience == args.patience: num_trial += 1 print('hit #%d trial' % num_trial) if num_trial == args.max_num_trial: print('early stop!') exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * args.lr_decay print('load previously best model and decay learning rate to %f' % lr) # load model params = torch.load(args.model_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers') optimizer.load_state_dict(torch.load(args.model_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == args.max_epoch: print('reached maximum number of epochs!') return
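# --- Example: the patience / trial / lr-decay schedule in isolation (sketch) ---
# The validation logic shared by most variants is a small state machine: `patience` counts
# consecutive validations without improvement; hitting the patience limit costs one trial,
# decays the learning rate, and restores the best checkpoint; exhausting max_num_trial stops
# training. A model-free sketch of that bookkeeping; the scores are fabricated negative
# perplexities (valid_metric = -dev_ppl), mirroring --patience / --max-num-trial / --lr-decay.
def early_stopping_demo(dev_scores, max_patience=2, max_num_trial=3, lr=1e-3, lr_decay=0.5):
    hist, patience, num_trial = [], 0, 0
    for it, score in enumerate(dev_scores, 1):
        is_better = len(hist) == 0 or score > max(hist)
        hist.append(score)
        if is_better:
            patience = 0
            print('iter %d: new best (%.2f), save checkpoint' % (it, score))
        else:
            patience += 1
            print('iter %d: hit patience %d' % (it, patience))
            if patience == max_patience:
                num_trial += 1
                if num_trial == max_num_trial:
                    print('early stop!')
                    return
                lr *= lr_decay
                patience = 0
                print('restore best checkpoint, decay lr to %g' % lr)

early_stopping_demo([-10.0, -9.0, -9.5, -9.6, -9.2, -9.3, -9.4, -9.1])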
def train(): torch.manual_seed(1) if (config.cuda): torch.cuda.manual_seed(1) args = dict() args['embed_size'] = config.embed_size args['d_model'] = config.d_model args['nhead'] = config.nhead args['num_encoder_layers'] = config.num_encoder_layers args['num_decoder_layers'] = config.num_decoder_layers args['dim_feedforward'] = config.dim_feedforward args['dropout'] = config.dropout args['smoothing_eps'] = config.smoothing_eps text = Text(config.src_corpus, config.tar_corpus) train_data = Data(config.train_path_src, config.train_path_tar) dev_data = Data(config.dev_path_src, config.dev_path_tar) train_loader = DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True, collate_fn=utils.get_batch) dev_loader = DataLoader(dataset=dev_data, batch_size=config.dev_batch_size, shuffle=True, collate_fn=utils.get_batch) #train_data_src, train_data_tar = utils.read_corpus(config.train_path) #dev_data_src, dev_data_tar = utils.read_corpus(config.dev_path) device = torch.device("cuda:0" if config.cuda else "cpu") model = NMT(text, args, device) #model = nn.DataParallel(model, device_ids=[0, 1]) model = model.to(device) #model = model.module #model_path = "/home/wangshuhe/shuhelearn/ShuHeLearning/NMT_transformer/result/02.01_1_344.6820465077113_checkpoint.pth" #model = NMT.load(model_path) #model = model.to(device) model.train() optimizer = Optim( torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.d_model, config.warm_up_step) #optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.warm_up_step, config.init_lr, config.lr) #optimizer = Optim(torch.optim.Adam(model.parameters())) epoch = 0 history_valid_ppl = [] print("begin training!", file=sys.stderr) while (True): epoch += 1 max_iter = int(math.ceil(len(train_data) / config.train_batch_size)) with tqdm(total=max_iter, desc="train") as pbar: #for batch_src, batch_tar, tar_word_num in utils.batch_iter(train_data_src, train_data_tar, config.train_batch_size): for batch_src, batch_tar, tar_word_num in train_loader: optimizer.zero_grad() now_batch_size = len(batch_src) batch_loss = -model(batch_src, batch_tar, smoothing=True) batch_loss = batch_loss.sum() loss = batch_loss / now_batch_size loss.backward() #optimizer.step() #optimizer.updata_lr() optimizer.step_and_updata_lr() pbar.set_postfix({ "epoch": epoch, "avg_loss": '{%.2f}' % (loss.item()), "ppl": '{%.2f}' % (math.exp(batch_loss.item() / tar_word_num)) }) pbar.update(1) if (epoch % config.valid_iter == 0): print("now begin validation...", file=sys.stderr) eval_ppl = evaluate_ppl(model, dev_data, dev_loader, config.dev_batch_size) print(eval_ppl) flag = len( history_valid_ppl) == 0 or eval_ppl < min(history_valid_ppl) if (flag): print( f"current model is the best! save to [{config.model_save_path}]", file=sys.stderr) history_valid_ppl.append(eval_ppl) model.save( os.path.join(config.model_save_path, f"02.10_{epoch}_{eval_ppl}_checkpoint.pth")) torch.save( optimizer.optimizer.state_dict(), os.path.join(config.model_save_path, f"02.10_{epoch}_{eval_ppl}_optimizer.optim")) if (epoch == config.max_epoch): print("reach the maximum number of epochs!", file=sys.stderr) return
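# --- Example: a warmup learning-rate wrapper like the Optim used above (sketch) ---
# The Optim wrapper above is constructed from d_model and warm_up_step and exposes
# step_and_updata_lr(), but its body is not shown in this section. It most likely implements
# the inverse-square-root "Noam" schedule from the Transformer paper; this is a sketch under
# that assumption (not the project's actual class), keeping the original method name.
import torch

class Optim:
    def __init__(self, optimizer, d_model, warm_up_step):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warm_up_step = warm_up_step
        self._step = 0
        self.lr = 0.0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step_and_updata_lr(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        self._step += 1
        self.lr = self.d_model ** -0.5 * min(self._step ** -0.5,
                                             self._step * self.warm_up_step ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr
        self.optimizer.step()

# usage on a throwaway parameter
p = torch.nn.Parameter(torch.zeros(4))
opt = Optim(torch.optim.Adam([p], betas=(0.9, 0.98), eps=1e-9), d_model=512, warm_up_step=4000)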
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ long_logfile = "long_logfiles/" + str(time.time()) + "long.txt" train_logfile = "train_logfiles/" + str(time.time()) + "train.txt" dev_logfile = "dev_logfiles/" + str(time.time()) + "dev.txt" f_long = open(long_logfile, "w") f_train = open(train_logfile, "w") # TODO: add hyperparameters args_tuples = [(arg, args[arg]) for arg in args] f_train.write("#args_tuples: %s\n" % args_tuples) for (arg, val) in args_tuples: f_train.write("#%s: %s\n" % (arg, val)) f_train.write("#epoch, train iter, train score\n") f_dev = open(dev_logfile, "w") f_dev.write("#epoch, train iter, dev score, dev accuracy\n") binary = int(args["--num-classes"]) == 2 train_data = load_train_data(perct=float(args["--train-perct"]), binary=binary) dev_data = load_dev_data(dev_perct=float(args["--dev-perct"]), binary=binary) train_batch_size = int(args["--batch-size"]) clip_grad = float(args["--clip-grad"]) valid_niter = int(args["--valid-niter"]) log_every = int(args["--log-every"]) model_save_path = args["--save-to"] embed_size = int(args["--embed-size"]) # TODO: load train data_augmenter based on args data_augmenter = str(args["--data-aug"]).lower() print_and_write("Using data augmentation method: %s" % data_augmenter, f_long) if data_augmenter == "gaussian": data_augmenter = GaussianNoiseDataAugmenter( float(args["--data-aug-amount"]), int(args["--data-aug-nx"])) elif data_augmenter == "identity": data_augmenter = NoisyIdentityDataAugmenter( float(args["--data-aug-amount"]), int(args["--data-aug-nx"])) elif data_augmenter == "swapdim": data_augmenter = EmbedDimensionSwapDataAugmenter( int(args["--data-aug-amount"]), int(args["--data-aug-nx"])) else: data_augmenter = BaseDataAugmenter() # perform augmentation train_data_aug = data_augmenter.augment(train_data) print_and_write( "train size: %d, after aug %d" % (len(train_data[0]), len(train_data_aug)), f_long, ) model = NMT(embed_size=embed_size, hidden_size=int(args["--hidden-size"]), num_classes=int(args["--num-classes"]), dropout_rate=float(args["--dropout"])) model.train() uniform_init = float(args["--uniform-init"]) if np.abs(uniform_init) > 0.0: print_and_write( "uniformly initialize parameters [-%f, +%f]" % (uniform_init, uniform_init), f_long, ) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) device = torch.device("cuda:0" if args["--cuda"] else "cpu") print_and_write("use device: %s" % device, f_long) model = model.to(device) print_and_write("confirming model device %s" % model.device, f_long) optimizer = torch.optim.Adam(model.parameters(), lr=float(args["--lr"])) num_trial = 0 train_iter = patience = cum_loss = report_loss = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print_and_write("begin Maximum Likelihood training", f_long) while True: epoch += 1 for sentences, sentiments in batch_iter(train_data_aug, batch_size=train_batch_size, shuffle=True): train_iter += 1 optimizer.zero_grad() example_losses = -model(sentences, sentiments) # (batch_size,) batch_size = len( example_losses) # in case data augmentation makes returned # number of examples > input batch size batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val report_examples += batch_size 
cum_examples += batch_size if train_iter % log_every == 0: # train_accuracy = model.compute_accuracy(sentences, sentiments) print_and_write( "epoch %d, iter %d, avg. loss %.2f, " "cum. examples %d, time elapsed %.2f sec" % ( epoch, train_iter, report_loss / report_examples, cum_examples, time.time() - begin_time, ), f_long, ) f_train.write( "%d, %d, %.2f\n" % (epoch, train_iter, report_loss / report_examples)) train_time = time.time() report_loss = report_examples = 0.0 # perform validation if train_iter % valid_niter == 0: cum_loss = cum_examples = 0.0 valid_num += 1 print_and_write("begin validation ...", f_long) # compute dev dev_score, dev_accuracy = evaluate_dev( model, dev_data, batch_size=5000) # dev batch size can be a bit larger valid_metric = -dev_score # maybe use accuracy instead? print_and_write( "validation: iter %d, dev. score %f, dev. accuracy %f" % (train_iter, dev_score, dev_accuracy), f_long, ) f_dev.write("%d, %d, %f, %f\n" % (epoch, train_iter, dev_score, dev_accuracy)) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) # train_score = evaluate_dev(model, train_data, batch_size=100000) if is_better: patience = 0 print_and_write( "save currently the best model to [%s]" % model_save_path, f_long, ) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + ".optim") elif patience < int(args["--patience"]): patience += 1 print_and_write("hit patience %d" % patience, f_long) if patience == int(args["--patience"]): num_trial += 1 print_and_write("hit #%d trial" % num_trial, f_long) if num_trial == int(args["--max-num-trial"]): print_and_write("early stop!", f_long) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]["lr"] * float( args["--lr-decay"]) print_and_write( "load previously best model and decay learning rate to %f" % lr, f_long, ) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params["state_dict"]) model = model.to(device) print_and_write("restore parameters of the optimizers", f_long) optimizer.load_state_dict( torch.load(model_save_path + ".optim")) # set new lr for param_group in optimizer.param_groups: param_group["lr"] = lr # reset patience patience = 0 if epoch == int(args["--max-epoch"]): print_and_write("reached maximum number of epochs!", f_long) exit(0)
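# --- Example: a Gaussian-noise data augmenter (sketch) ---
# The augmenters above (Gaussian noise, noisy identity, dimension swap) are not defined in
# this section. A plausible minimal sketch of the Gaussian variant, assuming each training
# example is an (embedding, label) pair and that `nx` extra noisy copies are added per
# example; the project's real class may differ.
import torch

class GaussianNoiseDataAugmenter:
    def __init__(self, sigma, nx):
        self.sigma = sigma   # std-dev of the additive noise
        self.nx = nx         # number of noisy copies per example

    def augment(self, data):
        augmented = list(data)
        for embedding, label in data:
            for _ in range(self.nx):
                augmented.append((embedding + self.sigma * torch.randn_like(embedding), label))
        return augmented

# toy usage: 2 examples -> 2 * (1 + nx) examples
toy = [(torch.randn(5), 1), (torch.randn(5), 0)]
print(len(GaussianNoiseDataAugmenter(0.1, 3).augment(toy)))   # 8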
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] vocab = Vocab.load(args['--vocab']) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=vocab) model.train() uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) vocab_mask = torch.ones(len(vocab.tgt)) vocab_mask[vocab.tgt['<pad>']] = 0 device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) # Set counters num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] fwd_time = train_time = begin_time = time.time() # Begin training print('begin Maximum Likelihood training') while True: epoch += 1 # Loop over all data in selection batches for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True): # Sentences must be sorted in length (that is number of words) src_sents = sorted(src_sents, key=lambda e: len(e), reverse=True) tgt_sents = sorted(tgt_sents, key=lambda e: len(e), reverse=True) train_iter += 1 # Zero out gradients, pytorch accumulates them optimizer.zero_grad() # Get loss train_batch_losses = (-model.forward(src_sents, tgt_sents)) batch_loss = train_batch_losses.sum() loss = batch_loss / train_batch_size # Get gradients loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) # step optimizer.step() # Report progress batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val # Get some report metric tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict cum_tgt_words += tgt_words_num_to_predict report_examples += train_batch_size cum_examples += train_batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # Test saving and loading the model # test_save_load_model(model=model,optimizer=optimizer) # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f, cum. 
examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu #dev_ppl = evaluate_ppl(model, dev_data, batch_size=128) # dev batch size can be a bit larger dev_ppl = evaluate_ppl(model, dev_data, batch_size=train_batch_size * 2) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float( args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model #params = torch.load(model_save_path, map_location=lambda storage, loc: storage) # See https://github.com/pytorch/pytorch/issues/7415 and # https://discuss.pytorch.org/t/on-a-cpu-device-how-to-load-checkpoint-saved-on-gpu-device/349 and # https://github.com/pytorch/pytorch/issues/9139 params = torch.load(model_save_path, map_location='cpu') model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) # optimizer.load_state_dict(torch.load(model_save_path + '.optim') optimizer.load_state_dict( torch.load(model_save_path + '.optim', map_location='cpu')) optimizer_to(optimizer, device) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) exit(0)
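# --- Example: moving restored optimizer state onto a device (sketch) ---
# This variant loads checkpoints with map_location='cpu' and then calls optimizer_to(...),
# which is not shown here. A common implementation (hedged: the original may differ) walks the
# optimizer state and moves every tensor it finds to the target device.
import torch

def optimizer_to(optimizer, device):
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)

# usage: populate some Adam state with one step, then move it
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters())
model(torch.randn(3, 4)).sum().backward()
optimizer.step()                                  # creates exp_avg / exp_avg_sq state tensors
optimizer_to(optimizer, torch.device('cpu'))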
def experiement(args: Dict, test_only, device):
    """ Train and Test the NMT Model.
    @param args (Dict): args from cmd line
    """
    # train_data_src = read_corpus(args['--train-src'], source='src')
    # train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
    #
    # dev_data_src = read_corpus(args['--dev-src'], source='src')
    # dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')
    #
    # train_data = list(zip(train_data_src, train_data_tgt))
    # dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    use_pos_embed = False
    if args['--use-pos-embed']:
        use_pos_embed = True

    use_copy = False
    if args['--use-copy']:
        use_copy = True

    SRC, TRG, train_iterator, dev_iterator, test_iterator = load_data(
        args['--train-data'], args['--dev-data'], args['--test-data'],
        device, train_batch_size, (use_pos_embed or use_copy))
    vocab = Vocab(SRC, TRG)

    model = NMT(src_embed_size=int(args['--src-embed-size']),
                dst_embed_size=int(args['--dst-embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                use_pos_embed=use_pos_embed,
                use_copy=use_copy)
    model.load_pretrained_embeddings(vocab)
    # print("args: {}".format(args))

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # def init_weights(m):
    #     for name, param in m.named_parameters():
    #         if 'weight' in name:
    #             nn.init.normal_(param.data, mean=0, std=0.01)
    #         else:
    #             nn.init.constant_(param.data, 0)
    #
    # model.apply(init_weights)

    # vocab_mask = torch.ones(len(vocab.tgt))
    # vocab_mask[vocab.tgt['<pad>']] = 0

    print('use device: %s' % device, file=sys.stderr)
    print(model)
    para_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'The model has {para_count:,} trainable parameters')
    print("file path: {}".format(model_save_path))

    if test_only:
        model.eval()
        decode(args, test_iterator, vocab, device)
        exit(0)

    # perform training
    model.train()
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        # perform training
        model.train()

        # for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        for i, batch in enumerate(train_iterator):
            train_iter += 1

            optimizer.zero_grad()

            src_sents, src_sents_lens = batch.src
            tgt_sents = batch.trg
            batch_size = src_sents.shape[1]

            example_losses = -model(src_sents, src_sents_lens, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

        # if train_iter % log_every == 0:
        print("")
        print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
              'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
              (epoch, train_iter, report_loss / report_examples,
               math.exp(report_loss / report_tgt_words), cum_examples,
               report_tgt_words / (time.time() - train_time),
               time.time() - begin_time), file=sys.stderr)

        train_time = time.time()
        report_loss = report_tgt_words = report_examples = 0.

        # perform validation
        # model.eval()
        # if train_iter % valid_niter == 0:
        #     print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
        #           (epoch, train_iter, cum_loss / cum_examples,
        #            np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr)

        cum_loss = cum_examples = cum_tgt_words = 0.
        valid_num += 1

        # print('begin validation ...', file=sys.stderr)

        # compute dev. ppl and bleu
        dev_ppl = evaluate_ppl(model, dev_iterator, batch_size=128)  # dev batch size can be a bit larger
        valid_metric = -dev_ppl

        print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

        is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
        hist_valid_scores.append(valid_metric)

        if is_better:
            patience = 0
            # print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
            model.save(model_save_path)

            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif patience < int(args['--patience']):
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

            if patience == int(args['--patience']):
                num_trial += 1
                print('hit #%d trial' % num_trial, file=sys.stderr)
                if num_trial == int(args['--max-num-trial']):
                    print('early stop!', file=sys.stderr)
                    # exit(0)
                    break

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                # load model
                params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                patience = 0

        if epoch == int(args['--max-epoch']):
            print('reached maximum number of epochs!', file=sys.stderr)
            break

    # perform testing
    model.eval()
    decode(args, test_iterator, vocab, device)
def train(index):
    torch.manual_seed(1)
    if (config.cuda):
        torch.cuda.manual_seed(1)
    device = torch.device(f"cuda:{index}" if config.cuda else "cpu")

    dist_rank = index
    torch.distributed.init_process_group(backend='nccl', init_method='tcp://localhost:23456',
                                         rank=dist_rank, world_size=1)
    is_master_node = (dist_rank == 0)

    args = dict()
    args['embed_size'] = config.embed_size
    args['d_model'] = config.d_model
    args['nhead'] = config.nhead
    args['num_encoder_layers'] = config.num_encoder_layers
    args['num_decoder_layers'] = config.num_decoder_layers
    args['dim_feedforward'] = config.dim_feedforward
    args['dropout'] = config.dropout
    args['smoothing_eps'] = config.smoothing_eps

    text = Text(config.src_corpus, config.tar_corpus)
    model = NMT(text, args, device)
    model = make_data_parallel(model, device)

    train_data = Data(config.train_path_src, config.train_path_tar)
    dev_data = Data(config.dev_path_src, config.dev_path_tar)
    train_sampler = DistributedSampler(train_data)
    dev_sampler = DistributedSampler(dev_data)
    train_loader = DataLoader(dataset=train_data, batch_size=int(config.train_batch_size/8), shuffle=False,
                              num_workers=9, pin_memory=True, sampler=train_sampler, collate_fn=utils.get_batch)
    dev_loader = DataLoader(dataset=dev_data, batch_size=int(config.dev_batch_size/8), shuffle=False,
                            num_workers=9, pin_memory=True, sampler=dev_sampler, collate_fn=utils.get_batch)

    model.train()

    optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9),
                      config.d_model, config.warm_up_step)

    epoch = 0
    history_valid_ppl = []
    print("begin training!", file=sys.stderr)
    while (True):
        epoch += 1
        train_loader.sampler.set_epoch(epoch)
        max_iter = int(math.ceil(len(train_data)/config.train_batch_size))
        with tqdm(total=max_iter, desc="train") as pbar:
            for batch_src, batch_tar, tar_word_num in train_loader:
                optimizer.zero_grad()
                now_batch_size = len(batch_src)
                batch_loss = -model(batch_src, batch_tar, smoothing=True)
                batch_loss = batch_loss.sum()
                loss = batch_loss / now_batch_size
                loss.backward()
                torch.distributed.barrier()
                optimizer.step_and_updata_lr()
                if (is_master_node):
                    pbar.set_postfix({"epoch": epoch,
                                      "avg_loss": '{%.2f}' % (loss.item()),
                                      "ppl": '{%.2f}' % (batch_loss.item()/tar_word_num)})
                    pbar.update(1)

        if (epoch % config.valid_iter == 0):
            print("now begin validation...", file=sys.stderr)
            torch.distributed.barrier()
            eval_ppl = evaluate_ppl(model, dev_data, dev_loader, config.dev_batch_size, is_master_node)
            print(eval_ppl)
            flag = len(history_valid_ppl) == 0 or eval_ppl < min(history_valid_ppl)
            if (flag):
                print(f"current model is the best! save to [{config.model_save_path}]", file=sys.stderr)
                history_valid_ppl.append(eval_ppl)
                model.save(os.path.join(config.model_save_path, f"02.19_{epoch}_{eval_ppl}_checkpoint.pth"))
                torch.save(optimizer.optimizer.state_dict(),
                           os.path.join(config.model_save_path, f"02.19_{epoch}_{eval_ppl}_optimizer.optim"))

        if (epoch == config.max_epoch):
            print("reached the maximum number of epochs!", file=sys.stderr)
            return
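
# The `Optim` wrapper above (constructed with the raw Adam optimizer, d_model and
# warm_up_step) is not defined in this file. The sketch below is one plausible
# implementation, assuming it applies the inverse-square-root warm-up schedule from
# "Attention Is All You Need"; the method names are copied from the call sites above,
# while the schedule itself is an assumption.
class Optim(object):
    """Thin optimizer wrapper that rescales the learning rate on every step."""

    def __init__(self, optimizer, d_model, warm_up_step):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warm_up_step = warm_up_step
        self.step_num = 0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step_and_updata_lr(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warm_up_step^-1.5)
        self.step_num += 1
        lr = (self.d_model ** -0.5) * min(self.step_num ** -0.5,
                                          self.step_num * self.warm_up_step ** -1.5)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        self.optimizer.step()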
def train(args: Dict):
    """ Train the NMT Model.
    :param Dict args: arguments from command line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    # Lists of (src_sent, tgt_sent) tuples
    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                no_char_decoder=args['--no-char-decoder'])
    model.train()  # Set to train mode

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)  # Initialize in-place

    # vocab_mask = torch.ones(len(vocab.tgt))
    # vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    print('device available: ', torch.cuda.is_available())
    if device.type == 'cuda':
        # get_device_name raises an error for CPU devices, so only query it on CUDA
        print('device name: ', torch.cuda.get_device_name(device))

    model = model.to(device)  # Send model parameters to the chosen device

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    # Initialize necessary variables
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0

    # To keep track of previous scores
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    # Load the previous model parameters if they exist
    if os.path.isfile('model.bin'):
        print('Load previous best model...', file=sys.stderr)

        # load model
        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        model = model.to(device)

        print('restore parameters of the optimizers', file=sys.stderr)
        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

        # score the loaded previous best model as the current best - otherwise it would be overwritten
        dev_ppl = evaluate_ppl(model, dev_data, batch_size=64)  # dev batch size can be a bit larger
        valid_metric = -dev_ppl
        hist_valid_scores.append(valid_metric)

    while True:
        epoch += 1

        # Iterate over lazily generated batches (lists) of sentences (each sentence is a list of words)
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            # Zero gradients, otherwise they would be accumulated across batches
            optimizer.zero_grad()

            batch_size = len(src_sents)

            # Calculate the losses for each example in the batch (i.e. forward propagation)
            example_losses = -model(src_sents, tgt_sents)  # Dim: (batch_size,)

            # Average losses over the entire batch
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            # Compute gradients
            loss.backward()

            # Clip gradients
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            # Update parameters
            optimizer.step()

            # Get a number from a tensor containing a single scalar
            batch_losses_val = batch_loss.item()

            # Add to the ``report_loss`` (zeroed every ``log_every`` iterations - for logging)
            report_loss += batch_losses_val
            # Add to the ``cum_loss`` (zeroed every ``valid_niter`` iterations - for validation)
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words), cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # Perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=64)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                # Lower perplexity is better
                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
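
# `batch_iter` is used by the list-based training loops in this file but defined
# elsewhere. The sketch below captures its assumed behaviour: optionally shuffle, then
# yield batches of (src_sents, tgt_sents) sorted by descending source length, which is
# what packed-sequence RNN encoders expect. Treat it as a reference sketch, not the
# project's actual utility.
def batch_iter(data, batch_size, shuffle=False):
    """Yield batches of (src_sents, tgt_sents) from a list of (src, tgt) sentence pairs."""
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))

    if shuffle:
        np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = [data[idx] for idx in indices]

        # longest source sentence first, as required when packing padded sequences
        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
        src_sents = [e[0] for e in examples]
        tgt_sents = [e[1] for e in examples]

        yield src_sents, tgt_sents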
def train(train_data, dev_data, vocab, embed_size=256, hidden_size=256, dropout_rate=0.2,
          uniform_init=0.1, device='cpu', lr=0.001, batch_size=32, clip_grad=5.0, log_every=10,
          valid_niter=2000, save_path='model.bin', patience=5, lr_decay=0.5, max_trials=5, max_epochs=30):
    """ Train the NMT model

    Params:
        train_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentences for training
        dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentences for dev
        vocab (Vocab): Vocab object for source and target
        embed_size (int): Embedding dimensionality. Default = 256
        hidden_size (int): Dimensionality for hidden states. Default = 256
        dropout_rate (float): Dropout probability. Default = 0.2
        uniform_init (float): If > 0: uniformly initialize all parameters
        device (str): device to perform the calc on. Default = 'cpu'
        lr (float): learning rate. Default = 0.001
        batch_size (int): batch size. Default = 32
        clip_grad (float): used in gradient clipping. Default = 5.0
        log_every (int): number of iterations between printing stats. Default = 10
        valid_niter (int): number of iterations between validations. Default = 2000
        save_path (str): path to save the best model. Default: 'model.bin' in current dir
        patience (int): number of validations without improvement before decaying the learning rate. Default = 5
        lr_decay (float): learning rate decay. Default = 0.5
        max_trials (int): terminate training after how many trials. Default = 5
        max_epochs (int): max number of epochs. Default = 30
    Return:
    """
    # Create NMT model and put it in train mode
    model = NMT(embed_size, hidden_size, vocab, dropout_rate)
    model.train()

    # Uniformly initialize model parameters if required
    if np.abs(uniform_init) > 0.:
        print(f'uniformly init parameters [-{uniform_init}, +{uniform_init}]', file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # Create target vocab mask with 0 for 'padding' index and 1 otherwise
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    # Set model device
    device = torch.device(device)
    model = model.to(device)
    print(f'Using device: {device}', file=sys.stderr)

    # Choose optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Initializations
    num_trial = 0
    train_iter = 0
    current_patience = 0
    cum_loss = 0
    report_loss = 0
    cum_tgt_words = 0
    report_tgt_words = 0
    cum_examples = 0
    report_examples = 0
    epoch = 0
    valid_num = 0
    hist_valid_scores = []
    train_time = time.time()
    begin_time = time.time()

    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        # Iterate over the batches in the training data
        for src_sents, tgt_sents in batch_iter(train_data, batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            current_batch_size = len(src_sents)

            # Calculate loss and backpropagate
            example_losses = -model(src_sents, tgt_sents)  # (current_batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / current_batch_size  # average loss
            loss.backward()

            # clip gradient and update parameters
            _ = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += current_batch_size  # use the actual batch size (the last batch may be smaller)
            cum_examples += current_batch_size

            if train_iter % log_every == 0:
                print(f'epoch {epoch}, iter {train_iter}, '
                      f'avg. loss {report_loss / report_examples:.2f}, '
                      f'avg. ppl {math.exp(report_loss / report_tgt_words):.2f}, '
                      f'cum. examples {cum_examples}, '
                      f'speed {report_tgt_words / (time.time() - train_time):.2f} words/sec, '
                      f'time elapsed {(time.time() - begin_time):.2f} sec', file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(f'epoch {epoch}, iter {train_iter}, cum. loss {cum_loss / cum_examples:.2f}, '
                      f'cum. ppl {np.exp(cum_loss / cum_tgt_words):.2f} cum. examples {cum_examples}', file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print(f'validation: iter {train_iter}, dev. ppl {dev_ppl}', file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    # save model and optimizer state
                    print(f'save the best model to [{save_path}]', file=sys.stderr)
                    model.save(save_path)
                    torch.save(optimizer.state_dict(), save_path + '.optim')
                    current_patience = 0
                elif current_patience < patience:
                    current_patience += 1
                    print(f'hit patience {current_patience}', file=sys.stderr)

                    if current_patience == patience:
                        num_trial += 1
                        print(f'hit #{num_trial} trial', file=sys.stderr)
                        if num_trial == max_trials:
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * lr_decay
                        print(f'load previously best model and decay learning rate to {lr}', file=sys.stderr)

                        # load model
                        params = torch.load(save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        current_patience = 0

            if epoch == max_epochs:
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)
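
# Every loop above restores checkpoints via params = torch.load(...) followed by
# model.load_state_dict(params['state_dict']), which implies that model.save writes a
# dict containing at least a 'state_dict' entry. The pair below sketches such a format;
# the extra 'args' and 'vocab' entries are assumptions about what else a checkpoint might
# carry, not a description of the project's actual NMT.save implementation.
def save_checkpoint(model, path):
    """Persist model weights (plus optional metadata) in a single file."""
    torch.save({
        'args': getattr(model, 'args', None),    # constructor arguments, if the model keeps them
        'vocab': getattr(model, 'vocab', None),  # vocabulary object, if picklable
        'state_dict': model.state_dict(),
    }, path)


def load_checkpoint(model, path, device='cpu'):
    """Restore weights saved by save_checkpoint onto an already-constructed model."""
    params = torch.load(path, map_location='cpu')
    model.load_state_dict(params['state_dict'])
    return model.to(device)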