'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Acc@1 {top1.val:.2f} ({top1.avg:.2f})\t'.format( step, len(val_loader), batch_time=batch_time, loss=losses, top1=top1)) # vis.log('Test: [{0}/{1}]\t' # 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' # 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' # 'Acc@1 {top1.val:.2f} ({top1.avg:.2f})\t'.format( # step, len(val_loader), batch_time=batch_time, loss=losses, top1=top1)) test_loss = losses.avg test_acc = top1.avg test_losses.append(test_loss) test_accs.append(test_acc) print(' * Acc@1 {top1.avg:.2f}%'.format(top1=top1)) # vis.log(' * Acc@1 {top1.avg:.2f}%'.format(top1=top1)) # vis.plot_many({'test loss': test_loss, 'test acc': test_acc}) is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) state = {'args': args, 'model': net.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch + 1, 'train_losses': train_losses, 'train_accs': train_accs, 'test_losses': test_losses, 'test_accs': test_accs, 'best_acc': best_acc} save_checkpoint(state, is_best) with open(log_name, 'a') as log_file: log_writer = csv.writer(log_file, delimiter=',') log_writer.writerow([epoch + 1, train_loss, train_acc, test_loss, test_acc, best_acc]) print('\nBest accuracy: {:.2f}%'.format(best_acc)) # vis.log('\nBest accuracy: {:.2f}%'.format(best_acc))
def train_val(im_dir, train_file_path, val_file_path, hidden_size, n_layers,
              act_type, norm_size, n_epochs, batch_size, n_letters, lr,
              optim_type, momentum, weight_decay, valInterval, device='cpu'):
    '''
    The main training procedure
    ----------------------------
    Builds an MLP, trains it for ``n_epochs`` epochs, periodically reports
    the validation accuracy, saves the final weights together with the model
    configuration to ``saved_models/recognition.pth`` and finally plots the
    per-epoch training loss via ``plot_loss``.

    :param im_dir: path to directory with images
    :param train_file_path: file list of training image paths and labels
    :param val_file_path: file list of validation image paths and labels
    :param hidden_size: a list of hidden size for each hidden layer
    :param n_layers: number of layers in the MLP
    :param act_type: type of activation function, can be none, sigmoid, tanh, or relu
    :param norm_size: image normalization size, (height, width)
    :param n_epochs: number of training epochs
    :param batch_size: batch size of training and validation
    :param n_letters: number of classes, in this task it is 26 English letters
    :param lr: learning rate
    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: the factor of L2 penalty on network weights
    :param valInterval: the frequency of validation, e.g., if valInterval = 5,
        then do validation after each 5 training epochs
    :param device: 'cpu' or 'cuda'
    :raises NotImplementedError: if ``optim_type`` is not one of the supported names
    '''
    import os  # local import: only needed to create the checkpoint directory

    # training and validation data loader
    trainloader = dataLoader(im_dir, train_file_path, norm_size, batch_size)
    valloader = dataLoader(im_dir, val_file_path, norm_size, batch_size)

    # each image of shape [norm_size[0], norm_size[1]] is flattened into a
    # vector, so the MLP input size is height * width
    input_size = norm_size[0] * norm_size[1]
    model = MLP(input_size, n_letters, hidden_size, n_layers, act_type)

    # loss function (project-defined autograd function)
    cal_loss = CrossEntropyLoss.apply

    # put the model on CPU or GPU
    model = model.to(device)

    # optimizer
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr,
                              momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr,
                                  weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr,
                                  weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr,
                               weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr,
                                   weight_decay=weight_decay)
    else:
        # carry the explanation in the exception itself instead of printing it
        raise NotImplementedError(
            '[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta'
        )

    # average training loss of every epoch, in order
    losses = []

    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()

        # total loss accumulated over the iterations of this epoch
        total_loss = 0.

        for ims, labels in trainloader:
            # move the batch to the target device
            ims = ims.to(device)
            labels = labels.to(device)

            # flatten every image into a vector; use the actual batch size
            # because the last batch may hold fewer than batch_size images
            ims = ims.view(ims.size(0), -1)

            # clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # forward pass, loss and backward pass
            pred = model(ims)
            loss = cal_loss(pred, labels)
            loss.backward()

            # loss.item() returns a plain python number (not differentiable)
            total_loss += loss.item()

            # update the parameters of the model
            optimizer.step()

        # average of the total loss over the iterations
        avg_loss = total_loss / len(trainloader)
        losses.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))

        # validation
        if (epoch + 1) % valInterval == 0:
            # set the model in evaluation mode
            model.eval()

            n_correct = 0  # number of images that are correctly classified
            n_ims = 0  # number of total images

            # gradients are not needed during validation
            with torch.no_grad():
                for ims, labels in valloader:
                    ims, labels = ims.to(device), labels.to(device)

                    # convert an image to a vector as the input of the MLP
                    flat_ims = ims.view(ims.size(0), -1)

                    # the predicted class is the index of the largest output
                    predictions = model(flat_ims).argmax(1)

                    # accumulate plain python ints rather than tensors
                    n_correct += (predictions == labels).sum().item()
                    n_ims += ims.size(0)

            # an empty validation set reports 0% instead of dividing by zero
            accuracy = 100 * n_correct / n_ims if n_ims else 0.
            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(
                epoch + 1, accuracy))

    # save model parameters together with the configuration needed to
    # rebuild the network
    model_save_path = 'saved_models/recognition.pth'
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    torch.save(
        {
            'state_dict': model.state_dict(),
            'configs': {
                'norm_size': norm_size,
                'output_size': n_letters,
                'hidden_size': hidden_size,
                'n_layers': n_layers,
                'act_type': act_type
            }
        }, model_save_path)
    print('Model saved in {}\n'.format(model_save_path))

    # draw the loss curve
    plot_loss(losses)
# 'Acc@1 {top1.val:.2f} ({top1.avg:.2f})\t'.format( # step, len(val_loader), batch_time=batch_time, loss=losses, top1=top1)) test_loss = losses.avg test_acc = top1.avg test_losses.append(test_loss) test_accs.append(test_acc) print(' * Acc@1 {top1.avg:.2f}%'.format(top1=top1)) # vis.log(' * Acc@1 {top1.avg:.2f}%'.format(top1=top1)) # vis.plot_many({'test loss': test_loss, 'test acc': test_acc}) is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) state = { 'args': args, 'model': net.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch + 1, 'train_losses': train_losses, 'train_accs': train_accs, 'test_losses': test_losses, 'test_accs': test_accs, 'best_acc': best_acc } save_checkpoint(state, is_best) with open(log_name, 'a') as log_file: log_writer = csv.writer(log_file, delimiter=',') log_writer.writerow( [epoch + 1, train_loss, train_acc, test_loss, test_acc, best_acc])
acc_val[epoch], loss_val[epoch] = evaluate(net, valloader, device, classes) acc_train[epoch], _ = evaluate(net, trainloader, device, classes, loss_compute=False) # save epoch if epoch % 10 == 9: path_checkpoint = os.path.join(proj_path, "checkpoint", "MLPTwoStep{}".format(option), 'cp_{:03d}.pth'.format(epoch)) torch.save( { 'epoch': epoch, 'model_state_dict': net.state_dict(), 'optimizer_state_dict': optimizer.state_dict() }, path_checkpoint) # save the loss and accuracy of train and validation into numpy array npz file. np.savez(os.path.join(proj_path, "checkpoint", "MLPTwoStep{}".format(option), "training_result_{:03d}.npz".format(epoch_num - 1)), acc_val=acc_val, acc_train=acc_train, loss_val=loss_val, loss_train=loss_train) end_t = time.time() print("Time for training {:.03f} hrs.".format((end_t - start_t) / 3600))
def main():
    """Train an MLP on MNIST and plot the per-epoch accuracy.

    Parses the command-line options, trains the project-defined ``MLP`` with
    SGD on a 90/10 train/validation split of the MNIST training set, writes
    model snapshots into the output directory and saves an accuracy plot
    (``accuracy_mnist_mlp.png``) there.

    Command-line options:
        --batchsize/-b  number of images in each mini-batch (default 100)
        --epoch/-e      number of sweeps over the training data (default 20)
        --frequency/-f  save a snapshot every this many epochs; a value
                        <= 0 (the default, -1) disables periodic snapshots
        --gpu/-g        GPU ID, a negative value selects the CPU (default -1)
        --out/-o        directory to output the result (default 'result')
        --resume/-r     path of a state-dict snapshot to resume from
        --unit/-u       number of units (default 1000)
    """
    import os  # local import: used for the output directory and its paths

    parser = argparse.ArgumentParser(description='Pytorch example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the training data')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # The snapshots and the plot are written here; create it up front so the
    # first save does not fail on a fresh run.
    os.makedirs(args.out, exist_ok=True)

    # Always define the device so the rest of the code needs no GPU branches.
    if args.gpu >= 0:
        device = torch.device('cuda:' + str(args.gpu))
    else:
        device = torch.device('cpu')

    # Set up a neural network to train
    # NOTE(review): argument order is taken as-is from the original call;
    # confirm against the MLP constructor.
    net = MLP(args.unit, 28*28, 10)

    # Load designated network weight; map to CPU first so a snapshot written
    # on a GPU can also be resumed on a CPU-only machine.
    if args.resume:
        net.load_state_dict(torch.load(args.resume, map_location='cpu'))

    net = net.to(device)

    # Setup a loss and an optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # Load the MNIST
    transform = transforms.Compose([transforms.ToTensor()])
    trainvalset = datasets.MNIST(root='./data', train=True, download=True,
                                 transform=transform)

    # Split train/val (90% / 10%)
    n_samples = len(trainvalset)
    trainsize = int(n_samples * 0.9)
    valsize = n_samples - trainsize
    trainset, valset = torch.utils.data.random_split(
        trainvalset, [trainsize, valsize])

    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=args.batchsize, shuffle=True, num_workers=2)
    # Accuracy does not depend on sample order, so validation is not shuffled.
    valloader = torch.utils.data.DataLoader(
        valset, batch_size=args.batchsize, shuffle=False, num_workers=2)

    # Setup result holder
    x = []
    ac_train = []
    ac_val = []

    # Train
    for ep in range(args.epoch):  # Loop over the dataset multiple times
        running_loss = 0.0
        correct_train = 0
        total_train = 0
        correct_val = 0
        total_val = 0

        net.train()
        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Reshape the input
            inputs = inputs.view(-1, 28*28)

            # Reset the parameter gradients
            optimizer.zero_grad()

            # Forward
            outputs = net(inputs)

            # Predict the label and count the hits for the whole batch at
            # once; this also works for a final batch with a single sample.
            _, predicted = torch.max(outputs, 1)
            correct_train += (predicted == labels).sum().item()
            total_train += labels.size(0)

            # Backward + Optimize
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Add loss
            running_loss += loss.item()

        # Report loss of the epoch (sum of the per-batch losses)
        print('[epoch %d] loss: %.3f' % (ep + 1, running_loss))

        # Save the model; a non-positive frequency disables periodic
        # snapshots (x % -1 == 0 holds for every x, and x % 0 raises).
        if args.frequency > 0 and (ep + 1) % args.frequency == 0:
            path = os.path.join(args.out, 'model_' + str(ep + 1))
            torch.save(net.state_dict(), path)

        # Validation
        net.eval()
        with torch.no_grad():
            for images, labels in valloader:
                images = images.to(device)
                labels = labels.to(device)

                # Reshape the input
                images = images.view(-1, 28*28)

                # Forward
                outputs = net(images)

                # Predict the label and count the hits
                _, predicted = torch.max(outputs, 1)
                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)

        # Record result
        x.append(ep + 1)
        ac_train.append(100 * correct_train / total_train)
        ac_val.append(100 * correct_val / total_val)

    print('Finished Training')
    path = os.path.join(args.out, 'model_final')
    torch.save(net.state_dict(), path)

    # Draw graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(x, ac_train, label='Training')
    ax.plot(x, ac_val, label='Validation')
    ax.legend()
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Accuracy [%]")
    ax.set_ylim(80, 100)

    plt.savefig(os.path.join(args.out, 'accuracy_mnist_mlp.png'))