def evaluate(split):
    # Turn on evaluation mode which disables dropout.
    global ntokens
    model.eval()
    total_loss, nbatches = 0, 0
    # ntokens = len(corpus.dictionary.idx2word) if not args.lm1b else ntokens
    hidden = model.init_hidden(args.eval_batch_size)
    if not args.lm1b:
        data_gen = corpus.iter(split, args.eval_batch_size, args.bptt, use_cuda=args.cuda)
    else:
        data_gen = test_corpus.batch_generator(seq_length=args.bptt,
                                               batch_size=args.eval_batch_size,
                                               shuffle=False)
    for item in data_gen:
        if args.lm1b:
            source, target, word_cnt, batch_num = get_batch(item)
        else:
            source, target = item
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        total_loss += criterion(output, target.view(-1)).data.sum()
        hidden = repackage_hidden(hidden)
        nbatches += 1
    return total_loss / nbatches
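
# Both evaluate() and train() detach the recurrent state via a `repackage_hidden`
# helper that is not defined in this file. A minimal sketch of such a helper,
# assuming the same Variable-era PyTorch style used above and that `hidden` is a
# Variable or a nested tuple of Variables (e.g. an LSTM's (h, c)):
from torch.autograd import Variable

def repackage_hidden(h):
    # Wrap hidden states in new Variables to cut them off from their history,
    # so truncated BPTT never backpropagates across batch boundaries.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)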
def train():
    global lr, best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss, nbatches = 0, 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    if not args.lm1b:
        data_gen = corpus.iter('train', args.batch_size, args.bptt, use_cuda=args.cuda)
    else:
        data_gen = train_corpus.batch_generator(seq_length=args.bptt, batch_size=args.batch_size)
    for b, batch in enumerate(data_gen):
        model.train()
        if args.lm1b:
            source, target, word_cnt, batch_len = get_batch(batch)
        else:
            source, target = batch
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        # optimizer.zero_grad()
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        loss = criterion(output, target.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()
        # for p in model.parameters():
        #     if p.grad is not None:
        #         p.data.add_(-lr, p.grad.data)

        total_loss += loss.data.cpu()
        # logging.info(total_loss)

        if b % args.log_interval == 0 and b > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            if not args.valid_per_epoch:
                val_loss = evaluate('valid')
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                             'loss {:5.2f} | ppl {:8.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                                 epoch, b, lr, elapsed * 1000 / args.log_interval, cur_loss,
                                 math.exp(cur_loss), val_loss, math.exp(val_loss)))
            else:
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                             'loss {:5.2f} | ppl {:8.2f} '.format(
                                 epoch, b, lr, elapsed * 1000 / args.log_interval, cur_loss,
                                 math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
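
# train() above steps a global `optimizer` that is created elsewhere in the script.
# A minimal sketch of that setup, assuming plain SGD driven by the global learning
# rate `lr` (the actual optimizer choice and its arguments are not shown here):
optimizer = torch.optim.SGD(model.parameters(), lr=lr)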
def train():
    global lr, best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss, nbatches = 0, 0
    start_time = time.time()
    ntokens = len(corpus.dictionary.idx2word)
    hidden = model.init_hidden(args.batch_size)
    for b, batch in enumerate(
            corpus.iter('train', args.batch_size, args.bptt, use_cuda=args.cuda)):
        model.train()
        source, target = batch
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        loss = criterion(output, target.view(-1))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            if p.grad is not None:
                p.data.add_(-lr, p.grad.data)

        total_loss += loss.data.cpu()

        if b % args.log_interval == 0 and b > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            val_loss = evaluate('valid')
            print('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'
                  .format(epoch, b, lr, elapsed * 1000 / args.log_interval, cur_loss,
                          math.exp(cur_loss), val_loss, math.exp(val_loss)))
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr *= args.ar
            total_loss = 0
            start_time = time.time()
def evaluate(split):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss, nbatches = 0, 0
    ntokens = len(corpus.dictionary.idx2word)
    hidden = model.init_hidden(args.eval_batch_size)
    for source, target in corpus.iter(split, args.eval_batch_size, args.bptt,
                                      use_cuda=args.cuda):
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        total_loss += criterion(output, target.view(-1)).data.sum()
        hidden = repackage_hidden(hidden)
        nbatches += 1
    return total_loss / nbatches
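
# The functions above rely on the globals `epoch`, `lr`, and `best_val_loss`.
# A hypothetical driver loop that would set them up, assuming `args.epochs` and
# `args.lr` exist (illustrative sketch only, not part of the original script):
best_val_loss = None
lr = args.lr
for epoch in range(1, args.epochs + 1):
    train()
    val_loss = evaluate('valid')
    logging.info('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))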