def valid(model, device, val_dataloader, epoch, start_epoch):
    model.eval()
    val_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, labels in val_dataloader:
            inputs, labels = data.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            # `criterion` is assumed to be defined at module scope, e.g. nn.CrossEntropyLoss()
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    val_loss /= len(val_dataloader.dataset)
    val_accuracy = 100. * correct / len(val_dataloader.dataset)
    print('\nValid set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        val_loss, correct, len(val_dataloader.dataset), val_accuracy))

    # Logging validation metrics to VESSL
    vessl.log(step=epoch + start_epoch + 1, row={
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
    })
    return val_accuracy
def test(model, device, test_loader, save_image):
    model.eval()
    test_loss = 0
    correct = 0
    test_images = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            test_images.append(vessl.Image(
                data[0],
                caption="Pred: {} Truth: {}".format(pred[0].item(), target[0])))

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy))

    # Logging example images to VESSL
    if save_image:
        vessl.log({
            "Examples": test_images,
        })
    return test_accuracy
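# The MNIST-style test loop above assumes the following imports; the commented call
# shows one way it might be driven, with image logging only on the final epoch
# (a sketch, not part of the original snippet):

import torch
import torch.nn.functional as F

import vessl

# for epoch in range(1, epochs + 1):
#     train(...)
#     test_accuracy = test(model, device, test_loader, save_image=(epoch == epochs))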
def train(model_type, model, corpus, train_data, batch_size, bptt, clip,
          log_interval, dry_run, epoch):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    train_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if model_type != 'Transformer':
        hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        model.zero_grad()
        if model_type == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Manual SGD update; `lr` and `criterion` are assumed to be defined at module scope.
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()
        train_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // bptt, lr,
                      elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if dry_run:
            break

    # Logging metrics to VESSL
    loss = train_loss / (len(train_data) // bptt)
    vessl.log(step=epoch, row={'loss': loss, 'ppl': math.exp(loss)})
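# The training function above also assumes the two helpers from the PyTorch
# word-language-model example, `get_batch` and `repackage_hidden`. A minimal sketch
# of what they are expected to do (the exact batching logic in the original script
# may differ, e.g. `bptt` is typically read from a module-level setting):

def repackage_hidden(h):
    """Wrap hidden states in new Tensors to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)


def get_batch(source, i, bptt=35):
    """Return a (data, target) pair of length up to `bptt`, starting at row `i`."""
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target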
def train(model, device, train_dataloader, optimizer, epoch, start_epoch):
    model.train()
    loss = 0
    for batch_idx, (data, labels) in enumerate(train_dataloader):
        inputs, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if batch_idx % 128 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, batch_idx * len(data), len(train_dataloader.dataset),
                100. * batch_idx / len(train_dataloader), loss.item()))

    # Logging loss metrics to VESSL
    vessl.log(step=epoch + start_epoch + 1, row={'loss': loss.item()})
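# Both the CIFAR-style `train` and `valid` functions rely on a module-level
# `criterion`. A minimal, assumed setup and driver loop (`run_training` and its
# hyperparameters are illustrative, not from the original code):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()  # assumed global used by train() and valid()


def run_training(model, device, train_dataloader, val_dataloader,
                 epochs=10, start_epoch=0):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    best_accuracy = 0.0
    for epoch in range(epochs):
        train(model, device, train_dataloader, optimizer, epoch, start_epoch)
        accuracy = valid(model, device, val_dataloader, epoch, start_epoch)
        best_accuracy = max(best_accuracy, accuracy)
    return best_accuracy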
best_val_loss = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model_type, model, corpus, train_data, batch_size, args.bptt, clip,
          args.log_interval, args.dry_run, epoch)
    val_loss = evaluate(model_type, model, corpus, val_data, args.bptt)
    val_ppl = math.exp(val_loss)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, val_ppl))
    print('-' * 89)

    # Logging metrics to VESSL
    vessl.log(step=epoch, row={'val_loss': val_loss, 'val_ppl': val_ppl})

    # Save the model if the validation loss is the best we've seen so far.
    if not best_val_loss or val_loss < best_val_loss:
        save(model, args.output_path)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen on the validation set.
        lr /= 4.0

# Load the best saved model.
with open(os.path.join(args.output_path, 'model.pt'), 'rb') as f:
    model = torch.load(f)
    # After loading, the RNN parameters are not a contiguous chunk of memory;
    # flattening them speeds up the forward pass.
    # Currently, only the RNN models support flatten_parameters().
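    # The snippet ends at the comment above; the call it refers to is presumably
    # along these lines (guarded, since the Transformer model has no `rnn` attribute):
    if model_type != 'Transformer':
        model.rnn.flatten_parameters()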
def after_step(self):
    vessl.log(step=self.trainer.iter,
              row={'loss': self.trainer.storage.history('total_loss').latest()})
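# `after_step` above reads like a detectron2 training hook. A minimal, assumed
# wrapper showing how such a hook could be declared and attached to a trainer
# (the class name and the prepared `cfg` are illustrative):

import vessl
from detectron2.engine import DefaultTrainer, HookBase


class VesslLossHook(HookBase):
    def after_step(self):
        # Log the latest total_loss value to VESSL after every training step.
        vessl.log(step=self.trainer.iter,
                  row={'loss': self.trainer.storage.history('total_loss').latest()})

# Usage sketch:
# trainer = DefaultTrainer(cfg)
# trainer.register_hooks([VesslLossHook()])
# trainer.train()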