def trainEpochs(encoder, decoder, encoder_optimizer, decoder_optimizer,
                encoder_scheduler, decoder_scheduler, criterion, dataiter, args):
    n_epochs = args.n_epochs
    print_every = args.print_every
    plot_every = args.plot_every
    start = time.time()
    batch_i = 0
    n_batches = n_epochs * len(dataiter)
    plot_losses = []
    epoch_loss = 0  # Reset every epoch
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    for epoch in range(args.n_epochs):
        for input_tensor, input_lengths, target_tensor, target_lengths in dataiter:
            batch_i += 1
            loss = train(input_tensor, input_lengths, target_tensor, target_lengths,
                         encoder, decoder, encoder_optimizer, decoder_optimizer,
                         criterion, args)
            epoch_loss += loss
            print_loss_total += loss
            plot_loss_total += loss

            if batch_i % args.print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, batch_i / n_batches),
                                             batch_i, batch_i / n_batches * 100,
                                             print_loss_avg))

            if batch_i % args.plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

            if (epoch + 1) % args.save_every == 0:
                checkpoint = {
                    'epoch': epoch,
                    'encoder_state_dict': encoder.state_dict(),
                    'decoder_state_dict': decoder.state_dict(),
                    'encoder_optim_state': encoder_optimizer.state_dict(),
                    'decoder_optim_state': decoder_optimizer.state_dict(),
                }
                torch.save(checkpoint,
                           args.save_data_path + "/epoch{}_checkpoint.pt".format(epoch))

            # for testing only
            if args.n_batches > 0 and batch_i == args.n_batches:
                break

        encoder_scheduler.step(epoch_loss)
        decoder_scheduler.step(epoch_loss)
        epoch_loss = 0
        dataiter.reset()
        print("Epoch {}/{} finished".format(epoch, args.n_epochs - 1))

    showPlot(plot_losses, args)
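# --- Illustrative sketch (not from the original sources): one plausible way to build the
# --- optimizers, plateau schedulers, and criterion that trainEpochs() above expects.
# --- `args.learning_rate` and the patience value are assumptions; trainEpochs() only
# --- requires that each scheduler accept a metric via scheduler.step(epoch_loss).
import torch.nn as nn
import torch.optim as optim


def build_training_setup(encoder, decoder, args):
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=args.learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=args.learning_rate)
    # ReduceLROnPlateau matches the metric-driven scheduler.step(epoch_loss) calls above.
    encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer,
                                                             mode='min', patience=1)
    decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer,
                                                             mode='min', patience=1)
    criterion = nn.NLLLoss()
    return (encoder_optimizer, decoder_optimizer,
            encoder_scheduler, decoder_scheduler, criterion)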
def save_model(encoder, decoder, plot_losses, model_name):
    stamp = str(time.time())
    savepath = utils.prepare_dir(model_name, stamp)
    torch.save(encoder.state_dict(), savepath + "/%s.encoder" % stamp)
    torch.save(decoder.state_dict(), savepath + "/%s.decoder" % stamp)
    try:
        utils.showPlot(plot_losses, model_name, stamp)
    except Exception:
        # Plotting is best-effort; a failure here should not lose the saved weights.
        pass
    print(" * model saved with time stamp: ", stamp)
def trainIters(pairs, input_lang, output_lang, encoder, decoder, n_iters,
               print_every=100, plot_every=1000, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    # define criterion and optimization algorithm
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [
        variablesFromPair(random.choice(pairs), input_lang, output_lang)
        for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()

    # now proceed one iteration at a time
    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        # train on one example
        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (utils.timeSince(start, float(iter) / float(n_iters)), iter,
                   float(iter) / float(n_iters) * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / float(plot_every)
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # plot the learning curve
    utils.showPlot(plot_losses)
def trainIters(lang, dataSet, pairs, encoder, decoder, n_iters,
               print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Randomly sample the training pairs
    training_pairs = [random.choice(pairs) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        # if print_loss_total / print_every <= 0.0003:
        #     break

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, float(iter / n_iters)),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    torch.save(encoder, setting.MODEL_HOME + "/%s.%s.encoder.pkl" % (dataSet, lang))
    torch.save(decoder, setting.MODEL_HOME + "/%s.%s.decoder.pkl" % (dataSet, lang))
    showPlot(plot_losses)
def train(self, pairs, n_iters, max_length=1000, teacher_forcing_ratio=0.5,
          print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(self.encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(self.decoder.parameters(), lr=learning_rate)
    training_pairs = [
        tensorsFromPair(self.input_lang, self.output_lang, random.choice(pairs),
                        self.device) for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = self.step(input_tensor, target_tensor, encoder_optimizer,
                         decoder_optimizer, criterion, max_length,
                         teacher_forcing_ratio)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter,
                                         iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
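# --- Illustrative sketch (an assumption, not the actual body of self.step above): the
# --- per-example teacher-forcing decision implied by the teacher_forcing_ratio argument.
import random


def decide_teacher_forcing(teacher_forcing_ratio):
    # With probability teacher_forcing_ratio the decoder is fed the ground-truth target
    # token at each step; otherwise it is fed its own previous prediction.
    return random.random() < teacher_forcing_ratio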
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100,
               learning_rate=0.01, lang_pack=None):
    assert lang_pack is not None, "None shall pass"
    input_lang, output_lang, pairs = lang_pack

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [
        tensorsFromPair(random.choice(pairs), langs=[input_lang, output_lang])
        for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter,
                                         iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
def trainIters(self, pairs, input_lang, output_lang, n_iters, print_every=1000,
               plot_every=100, char=False):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    self.input_lang = input_lang
    self.output_lang = output_lang
    self.encoder_optimizer = optim.SGD(self.encoder.parameters(),
                                       lr=self.learning_rate)
    self.decoder_optimizer = optim.SGD(self.decoder.parameters(),
                                       lr=self.learning_rate)

    selected_pairs = [random.choice(pairs) for i in range(n_iters)]
    training_pairs = [
        self.tensorsFromPair(pair, char) for pair in selected_pairs
    ]
    self.criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = self.train(input_tensor, target_tensor)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), iter,
                                         iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
def trainIters(pairs, input_lang, output_lang, encoder, decoder, n_iters,
               print_every=100, plot_every=1000, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [
        variablesFromPair(random.choice(pairs), input_lang, output_lang)
        for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()
    reward = 0

    for iter in range(1, n_iters + 1):
        # print("iter", iter)
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        # train on one example
        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion, reward)
        # print("loss", loss)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (utils.timeSince(start, float(iter) / float(n_iters)), iter,
                   float(iter) / float(n_iters) * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / float(plot_every)
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

        # if iter % 80 == 0:
        #     train_edit, pair_rand = generateRandomlytrain(encoder, decoder)
        #     if train_edit > 1:
        #         trainItersreinforce(pair_rand, input_lang, output_lang, encoder,
        #                             decoder, 1, 100, 1000, 0.01)
        #     else:
        #         continue

        # Every 100 iterations, query generateRandomlytrain and use the returned score
        # to flip the sign of a loss-scaled reward passed to subsequent train() calls.
        if iter % 100 == 0:
            train_edit = generateRandomlytrain(encoder, decoder)
            if train_edit > 2:
                reward = -(loss * 0.75)
            else:
                reward = (loss * 0.75)
            # print("loss", loss)
            # print("loss2", reward)

    # plot the learning curve
    utils.showPlot(plot_losses)
def trainModel(n_iters=100000, teacher_forcing_ratio=0., print_every=1000,
               plot_every=100, learning_rate=0.01, max_length=MAX_LENGTH):
    training_pairs, vocab_size, word2ix, ix2word = loadDataset()
    encoder, decoder = loadModel(vocab_size)

    print("Training the model ... ")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair['input']
        target_variable = training_pair['target']
        input_variable = Variable(torch.LongTensor(input_variable).view(-1, 1))
        target_variable = Variable(torch.LongTensor(target_variable).view(-1, 1))
        if USE_CUDA:
            input_variable = input_variable.cuda()
            target_variable = target_variable.cuda()
        print(input_variable)

        loss = trainIter(input_variable, target_variable, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion,
                         max_length=max_length,
                         teacher_forcing_ratio=teacher_forcing_ratio)
        print_loss_total += loss
        plot_loss_total += loss

        # Keeping track of average loss and printing results on screen
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (utils.timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        # Keeping track of average loss and plotting in figure
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            if min(plot_losses) == plot_loss_avg:
                # we save this version of the model
                torch.save(encoder.state_dict(), "encoder.ckpt")
                torch.save(decoder.state_dict(), "decoder.ckpt")
            plot_loss_total = 0

    utils.showPlot(plot_losses)
def train(model, tokenizer, epochs, batch_size, save_every=1000, plot_every=100,
          learning_rate=0.01):
    start = time.time()
    model_save_dir = os.path.join(args.save_dir, 'seq2seq_attn',
                                  datetime.now().strftime('%Y-%m-%d_%H%M'))
    tb_writer = SummaryWriter(model_save_dir)

    plot_losses = []
    save_every_total = 0  # Reset every save_every
    plot_loss_total = 0  # Reset every plot_every

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)
    model.apply(weight_init)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=batch_size)
    train_iterator = trange(int(epochs), desc="Epoch")

    best_rouge = 0.
    early_stopping_steps = 0
    global_step = 1
    n_iters = len(train_dataloader)
    logger.info('train and eval')
    model.zero_grad()

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            text_tensor = batch[0].to(device)
            token_type_tensor = batch[1].to(device)
            question_tensor = batch[2].to(device)
            output, attention, result = model(text_tensor, token_type_tensor,
                                              question_tensor,
                                              teacher_forcing_ratio=0.5)
            # output = [batch size, trg sen len, output dim]
            # trg = [batch size, trg sen len]
            output = output[1:].view(-1, output.shape[-1])
            question_tensor = question_tensor[:, 1:]
            trg = question_tensor.reshape(-1)
            # output = [(trg sent len - 1) * batch size, output dim]
            # trg = [(trg sent len - 1) * batch size]
            loss = loss_calc(output, trg)

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            plot_loss_total += loss.cpu().detach().item()

            optimizer.step()
            model.zero_grad()

            if global_step % save_every == 0:
                valid_loss, rouge_l = evaluate(model, tokenizer, batch_size,
                                               max_question_len)
                tb_writer.add_scalar('valid_loss', valid_loss, global_step)
                tb_writer.add_scalar('valid_rouge_l', rouge_l, global_step)
                print('%s (%d %d%%) loss: %.4f rouge_l: %.4f' %
                      (timeSince(start, step / n_iters), step,
                       step / n_iters * 100, valid_loss, rouge_l))
                if best_rouge < rouge_l:
                    logger.info('save best weight')
                    best_rouge = rouge_l
                    torch.save(model.state_dict(),
                               os.path.join(model_save_dir, 'pytorch_model.bin'))
                    early_stopping_steps = 0
                else:
                    early_stopping_steps += 1
                    if args.early_stopping > 0 and early_stopping_steps >= args.early_stopping:
                        break

            if global_step % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                tb_writer.add_scalar('train_loss', plot_loss_avg, global_step)
                plot_loss_total = 0
                showPlot(plot_losses)

            global_step += 1

        if args.early_stopping > 0 and early_stopping_steps >= args.early_stopping:
            break

    tb_writer.close()
    return global_step, loss
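# --- Illustrative sketch of what loss_calc() in the function above is assumed to compute:
# --- token-level cross entropy over the flattened decoder outputs. The padding id of 0
# --- is an assumption; the real implementation is not shown in these sources.
import torch.nn.functional as F


def loss_calc(output, trg, pad_token_id=0):
    # output: [(trg len - 1) * batch size, vocab size], trg: [(trg len - 1) * batch size]
    return F.cross_entropy(output, trg, ignore_index=pad_token_id)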