def train(output_filename, model_type, hidden_size, loss_type, norm_type, sigma_noise):
    """Train an autoencoder variant on MNIST and save its weights.

    Args:
        output_filename: checkpoint file name, written under ./saved_models/.
        model_type: one of 'AE', 'LTAE', 'VAE'.
        hidden_size: dimensionality of the latent space.
        loss_type: 'l2' (MSE) or 'cross_entropy' (binary cross entropy).
        norm_type: normalization scheme for the latent autoencoder (LTAE only).
        sigma_noise: latent noise scale (LTAE only).

    Raises:
        ValueError: if ``loss_type`` or ``model_type`` is not recognized.
    """
    train_data = torchvision.datasets.MNIST(
        root='datasets/mnist/',
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=False,
    )
    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True)

    # Fail fast on bad arguments; previously an unknown loss_type/model_type
    # left loss_func/model unbound and surfaced later as a NameError.
    if loss_type == 'l2':
        loss_func = nn.MSELoss()
    elif loss_type == 'cross_entropy':
        loss_func = F.binary_cross_entropy
    else:
        raise ValueError(f'unknown loss_type: {loss_type!r}')

    if model_type == 'AE':
        model = AutoEncoder(hidden_size).cuda()
    elif model_type == 'LTAE':
        model = LatentAutoEncoder(hidden_size, norm_type, sigma=sigma_noise).cuda()
        model.set_device()
    elif model_type == 'VAE':
        model = VariationalAE(hidden_size).cuda()
    else:
        raise ValueError(f'unknown model_type: {model_type!r}')

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(EPOCH):
        for step, (x, _) in enumerate(train_loader):
            optimizer.zero_grad()
            # Flatten 28x28 images; input and reconstruction target coincide.
            x_batch = x.view(-1, 28 * 28).cuda()
            y_batch = x.view(-1, 28 * 28).cuda()
            if model_type == 'AE':
                _, decoded = model(x_batch)
                loss = loss_func(decoded, y_batch)
            elif model_type == 'LTAE':
                _, latent, transformed, decoded = model(x_batch)
                loss = loss_func(decoded, y_batch)
                # Keep the transformed code close to the latent code.
                loss += torch.nn.functional.mse_loss(transformed, latent)
            elif model_type == 'VAE':
                decoded, mu, logvar = model(x_batch)
                loss = loss_func_vae(decoded, x_batch, mu, logvar, loss_type)
            loss.backward()
            optimizer.step()
        if epoch % 10 == 0:
            # Logs the last mini-batch loss of the epoch, not an epoch average.
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.detach().cpu())
    torch.save({'state_dict': model.state_dict()},
               f'./saved_models/{output_filename}')
def main(args):
    """Train an autoencoder, optionally under DistributedDataParallel.

    World size and rank come from the environment variables named by
    ``args.env_size`` / ``args.env_rank`` (defaulting to a single rank-0
    process). Each epoch trains then evaluates; the model is checkpointed
    under ``model/`` whenever the test loss reaches a new minimum.
    """
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')
    print('Using %s device.' % device)

    world_size = int(
        os.environ[args.env_size]) if args.env_size in os.environ else 1
    local_rank = int(
        os.environ[args.env_rank]) if args.env_rank in os.environ else 0
    if local_rank == 0:
        print(vars(args))
    if world_size > 1:
        print('rank: {}/{}'.format(local_rank + 1, world_size))
        # File-based rendezvous: works without a TCP master address.
        torch.distributed.init_process_group(backend='gloo',
                                             init_method='file://%s' % args.tmpname,
                                             rank=local_rank,
                                             world_size=world_size)

    train_dataloader, test_dataloader = load_dataset(args, device, world_size)
    net = AutoEncoder(input_dim=1900, nlayers=args.nlayers,
                      latent=100).to(device)
    if world_size > 1:
        net = torch.nn.parallel.DistributedDataParallel(net)
    if args.modelfile:
        net.load_state_dict(torch.load(args.modelfile))

    # define our optimizer and loss function
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    loss_func = nn.MSELoss(reduction='mean')

    # torch.save below requires the target directory to exist.
    os.makedirs('model', exist_ok=True)

    # Track the best loss directly instead of recomputing min() over a
    # growing list twice per epoch; the save condition is equivalent.
    best_loss = float('inf')
    for epoch in range(args.epochs):
        epoch_start = timeit.default_timer()
        train(train_dataloader, net, optimizer, loss_func, epoch)
        test_loss = test(test_dataloader, net, loss_func)
        print(' %5.2f sec' % (timeit.default_timer() - epoch_start))
        if test_loss <= best_loss:
            best_loss = test_loss
            # File name encodes the best loss seen so far.
            torch.save(net.state_dict(), 'model/%5.3f.pth' % best_loss)
def main(args):
    """NVAE-style training driver.

    Seeds all RNGs, builds data loaders, the hierarchical ``AutoEncoder``,
    an Adamax optimizer with cosine LR decay and AMP grad scaling, then
    optionally resumes from a checkpoint and runs the epoch loop:
    train -> periodic sampling/validation -> periodic checkpointing,
    finishing with a large-sample validation pass.
    """
    # ensures that weight initializations are all the same
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Rank-aware logger and TensorBoard writer.
    # NOTE(review): presumably these no-op on non-zero ranks — confirm in utils.
    logging = utils.Logger(args.global_rank, args.save)
    writer = utils.Writer(args.global_rank, args.save)

    # Get data loaders.
    train_queue, valid_queue, num_classes, _ = datasets.get_loaders(args)
    args.num_total_iter = len(train_queue) * args.epochs
    warmup_iters = len(train_queue) * args.warmup_epochs
    swa_start = len(train_queue) * (args.epochs - 1)

    arch_instance = utils.get_arch_cells(args.arch_instance)

    model = AutoEncoder(args, writer, arch_instance)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))
    logging.info('groups per scale: %s, total_groups: %d',
                 model.groups_per_scale, sum(model.groups_per_scale))

    if args.fast_adamax:
        # Fast adamax has the same functionality as torch.optim.Adamax,
        # except it is faster.
        cnn_optimizer = Adamax(model.parameters(), args.learning_rate,
                               weight_decay=args.weight_decay, eps=1e-3)
    else:
        cnn_optimizer = torch.optim.Adamax(model.parameters(),
                                           args.learning_rate,
                                           weight_decay=args.weight_decay,
                                           eps=1e-3)

    # Cosine annealing over the post-warmup epochs only.
    cnn_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        cnn_optimizer, float(args.epochs - args.warmup_epochs - 1),
        eta_min=args.learning_rate_min)
    grad_scalar = GradScaler(2**10)

    num_output = utils.num_output(args.dataset, args)
    # Conversion factor from nats-per-image to bits-per-dimension.
    bpd_coeff = 1. / np.log(2.) / num_output

    # if load
    checkpoint_file = os.path.join(args.save, 'checkpoint.pt')
    if args.cont_training:
        logging.info('loading the model.')
        # Load on CPU first, then move to GPU, so resume also works when the
        # checkpoint was written from a different device.
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        init_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        model = model.cuda()
        cnn_optimizer.load_state_dict(checkpoint['optimizer'])
        grad_scalar.load_state_dict(checkpoint['grad_scalar'])
        cnn_scheduler.load_state_dict(checkpoint['scheduler'])
        global_step = checkpoint['global_step']
    else:
        global_step, init_epoch = 0, 0

    for epoch in range(init_epoch, args.epochs):
        # update lrs.
        if args.distributed:
            # Re-shuffle training shards each epoch; keep validation fixed.
            train_queue.sampler.set_epoch(global_step + args.seed)
            valid_queue.sampler.set_epoch(0)

        if epoch > args.warmup_epochs:
            cnn_scheduler.step()

        # Logging.
        logging.info('epoch %d', epoch)

        # Training.
        train_nelbo, global_step = train(train_queue, model, cnn_optimizer,
                                         grad_scalar, global_step,
                                         warmup_iters, writer, logging)
        logging.info('train_nelbo %f', train_nelbo)
        writer.add_scalar('train/nelbo', train_nelbo, global_step)

        model.eval()
        # generate samples less frequently
        eval_freq = 1 if args.epochs <= 50 else 20
        if epoch % eval_freq == 0 or epoch == (args.epochs - 1):
            with torch.no_grad():
                num_samples = 16
                n = int(np.floor(np.sqrt(num_samples)))
                # Sample an n-by-n image grid at several temperatures.
                for t in [0.7, 0.8, 0.9, 1.0]:
                    logits = model.sample(num_samples, t)
                    output = model.decoder_output(logits)
                    # Bernoulli decoders are visualized by their mean; other
                    # decoders by a temperature-scaled sample.
                    output_img = output.mean if isinstance(
                        output, torch.distributions.bernoulli.Bernoulli
                    ) else output.sample(t)
                    output_tiled = utils.tile_image(output_img, n)
                    writer.add_image('generated_%0.1f' % t, output_tiled,
                                     global_step)

            valid_neg_log_p, valid_nelbo = test(valid_queue, model,
                                                num_samples=10, args=args,
                                                logging=logging)
            logging.info('valid_nelbo %f', valid_nelbo)
            logging.info('valid neg log p %f', valid_neg_log_p)
            logging.info('valid bpd elbo %f', valid_nelbo * bpd_coeff)
            logging.info('valid bpd log p %f', valid_neg_log_p * bpd_coeff)
            writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch)
            writer.add_scalar('val/nelbo', valid_nelbo, epoch)
            writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff,
                              epoch)
            writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch)

        # Checkpoint roughly 100 times over the run, and always on the last
        # epoch; only rank 0 writes.
        save_freq = int(np.ceil(args.epochs / 100))
        if epoch % save_freq == 0 or epoch == (args.epochs - 1):
            if args.global_rank == 0:
                logging.info('saving the model.')
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': cnn_optimizer.state_dict(),
                        'global_step': global_step,
                        'args': args,
                        'arch_instance': arch_instance,
                        'scheduler': cnn_scheduler.state_dict(),
                        'grad_scalar': grad_scalar.state_dict()
                    }, checkpoint_file)

    # Final validation with many importance samples for a tighter bound.
    valid_neg_log_p, valid_nelbo = test(valid_queue, model, num_samples=1000,
                                        args=args, logging=logging)
    logging.info('final valid nelbo %f', valid_nelbo)
    logging.info('final valid neg log p %f', valid_neg_log_p)
    writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch + 1)
    writer.add_scalar('val/nelbo', valid_nelbo, epoch + 1)
    writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff, epoch + 1)
    writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch + 1)
    writer.close()
def main():
    """Train the audio autoencoder.

    Optionally restores a previous model, trains for the configured number
    of epochs, evaluates after every epoch, and checkpoints both the latest
    and the best (lowest evaluation loss) weights under ``restore_dir``.
    """
    cuda_available = torch.cuda.is_available()
    train_params, dataset_params = get_arguments()

    net = AutoEncoder()
    epoch_trained = 0
    if train_params['restore_model']:
        net = load_model(net, train_params['restore_dir'],
                         train_params['restore_model'])
        # load_model returns None on failure; fall back to a fresh network.
        # (The previous redundant `else: epoch_trained = 0` branch is dropped;
        # epoch_trained is already 0.)
        if net is None:
            print("Initialize network and train from scratch.")
            net = AutoEncoder()

    train_loader, validation = audio_data_loader(**dataset_params)

    if not cuda_available:
        warnings.warn(
            "Cuda is not available, can not train model using multi-gpu.")
    if cuda_available:
        # Remove train_params["device_ids"] for single GPU
        if train_params["device_ids"]:
            batch_size = dataset_params["batch_size"]
            num_gpu = len(train_params["device_ids"])
            # DataParallel splits each batch evenly across the GPUs.
            assert batch_size % num_gpu == 0
            net = nn.DataParallel(net, device_ids=train_params['device_ids'])
        torch.backends.cudnn.benchmark = True
        net = net.cuda()

    criterion = nn.MSELoss()
    optimizer = get_optimizer(net, train_params['optimizer'],
                              train_params['learning_rate'],
                              train_params['momentum'])
    if cuda_available:
        criterion = criterion.cuda()

    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(train_params['log_dir'], exist_ok=True)
    os.makedirs(train_params['restore_dir'], exist_ok=True)

    # Context managers guarantee the log files are closed even if training
    # raises; the original left them open on any exception.
    with open(train_params['log_dir'] + 'train_loss_log.log', 'a') as train_loss_log_file, \
            open(train_params['log_dir'] + 'test_loss_log.log', 'a') as test_loss_log_file:
        # Add print for start of training time
        time = str(datetime.now())
        line = 'Training Started at' + str(time) + ' !!! \n'
        train_loss_log_file.writelines(line)
        train_loss_log_file.flush()

        # Keep track of losses
        train_losses = []
        eval_losses = []
        best_eval = float('inf')

        # Begin!
        for epoch in range(train_params['num_epochs']):
            train(net, criterion, optimizer, train_losses, train_params,
                  train_loss_log_file, train_loader, cuda_available)
            eval_loss = evaluate(net, criterion, epoch, eval_losses,
                                 validation, test_loss_log_file,
                                 cuda_available)
            # Snapshot the best model separately from the rolling checkpoint.
            if eval_loss < best_eval:
                save_model(net, 1, train_params['restore_dir'])
                torch.save(net.state_dict(),
                           train_params['restore_dir'] + 'bestmodel.pth')
                best_eval = eval_loss
            save_model(net, epoch_trained + epoch + 1,
                       train_params['restore_dir'])
            torch.save([train_losses, eval_losses, epoch],
                       train_params['restore_dir'] + 'data_params')

        # Add print for end of training time
        time = str(datetime.now())
        line = 'Training Ended at' + str(time) + ' !!! \n'
        train_loss_log_file.writelines(line)
        train_loss_log_file.flush()
shuffle=True) dataloader_valid = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True) for epoch in range(50): train_loss = [] valid_loss = [] for data in tqdm(dataloader_train): img = data[0] img = img.view(img.shape[0], -1) output = model(img) loss = criterion(output, img) train_loss.append(loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() for data in tqdm(dataloader_valid): img = data[0] img = img.view(img.shape[0], -1) output = model(img) loss = criterion(output, img) valid_loss.append(loss.item()) print(f"epoch: {epoch}, Train Loss: {np.mean(train_loss)}") print(f"epoch: {epoch}, Valid Loss: {np.mean(valid_loss)}") writer.add_scalar("Train loss", np.mean(train_loss), epoch) writer.add_scalar("Valid loss", np.mean(valid_loss), epoch) torch.save(model.state_dict(), './autoencoder.pth')
for i, data in enumerate(val_dataloader, 1): partial_input, coarse_gt, dense_gt = data partial_input = partial_input.to(DEVICE) coarse_gt = coarse_gt.to(DEVICE) dense_gt = dense_gt.to(DEVICE) partial_input = partial_input.permute(0, 2, 1) v, y_coarse, y_detail = network(partial_input) y_coarse = y_coarse.permute(0, 2, 1) y_detail = y_detail.permute(0, 2, 1) loss = loss_d1(coarse_gt, y_coarse) + args.alpha * loss_d2( dense_gt, y_detail) total_loss += loss.item() iter_count += 1 mean_loss = total_loss / iter_count print("\033[31mValidation epoch {}/{}, loss is {}\033[0m".format( epoch, args.epochs, mean_loss)) # records the best model and epoch if mean_loss < minimum_loss: best_epoch = epoch minimum_loss = mean_loss torch.save(network.state_dict(), args.log_dir + '/lowest_loss.pth') print("\033[31mBest model (lowest loss) in epoch {}\033[0m".format( best_epoch))
# Load data data_loader_train = load_data(args.data_dir, args.batch_size) lowest_loss = float("inf") history_train_loss = [] try: for epoch in range(args.epochs): t0 = time.time() train_loss = train(model, data_loader_train, args.device) print("\nTraining Epoch: %d, Train Loss: %.4f, Elapsed: %.1fs" % (epoch + 1, train_loss, time.time() - t0)) history_train_loss.append(train_loss) if train_loss < lowest_loss: torch.save(model.state_dict(), 'weights_unsup.pth') print("Weight Saved") lowest_loss = train_loss exp_lr_scheduler.step(train_loss) except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') plt.plot(history_train_loss, label='Train Loss') plt.title('Training Loss') plt.legend() plt.show()
def main():
    """Command-line training entry point for AvatarNet style transfer.

    Parses hyper-parameters, trains the AutoEncoder on content/style image
    pairs, periodically snapshots stylized test images, saves per-epoch
    model states, and writes the loss curve and log under ``save_dir``.
    """
    parser = argparse.ArgumentParser(description='AvatarNet by Pytorch')
    parser.add_argument('--batch_size', '-b', type=int, default=4,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=2,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--patch_size', '-p', type=int, default=5,
                        help='Size of extracted patches from style features')
    parser.add_argument('--alpha', '-a', type=float, default=0.8,
                        help='alpha control the fusion degree')
    parser.add_argument('--lam1', type=float, default=0.01,
                        help='lambda1 for perceptual loss')
    parser.add_argument('--lam2', type=float, default=0.01,
                        help='lambda2 for tv loss')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID(nagative value indicate CPU)')
    parser.add_argument('--learning_rate', '-lr', type=float, default=1e-4,
                        help='learning rate for Adam')
    parser.add_argument('--snapshot_interval', type=int, default=10,
                        help='Interval of snapshot to generate image')
    parser.add_argument('--train_content_dir', type=str,
                        default='/data/chen/content',
                        help='content images directory for train')
    parser.add_argument('--train_style_dir', type=str,
                        default='/data/chen/style',
                        help='style images directory for train')
    parser.add_argument('--test_content_dir', type=str,
                        default='/data/chen/content',
                        help='content images directory for test')
    parser.add_argument('--test_style_dir', type=str,
                        default='/data/chen/style',
                        help='style images directory for test')
    parser.add_argument('--save_dir', type=str, default='result',
                        help='save directory for result and loss')
    parser.add_argument('--reuse', default=None,
                        help='model state path to load for reuse')
    args = parser.parse_args()

    # create directory to save
    # makedirs(exist_ok=True) is race-free and creates missing parents,
    # unlike the previous exists()+mkdir() pairs.
    os.makedirs(args.save_dir, exist_ok=True)
    loss_dir = f'{args.save_dir}/loss'
    model_state_dir = f'{args.save_dir}/model_state'
    image_dir = f'{args.save_dir}/image'
    os.makedirs(loss_dir, exist_ok=True)
    os.makedirs(model_state_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    # set device on GPU if available, else CPU
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device(f'cuda:{args.gpu}')
        print(f'# CUDA available: {torch.cuda.get_device_name(0)}')
    else:
        device = 'cpu'
    print(f'# Minibatch-size: {args.batch_size}')
    print(f'# epoch: {args.epoch}')
    print('')

    # prepare dataset and dataLoader
    train_dataset = PreprocessDataset(args.train_content_dir,
                                      args.train_style_dir)
    test_dataset = PreprocessDataset(args.test_content_dir,
                                     args.test_style_dir)
    iters = len(train_dataset)
    print(f'Length of train image pairs: {iters}')

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                             shuffle=False)
    test_iter = iter(test_loader)

    # set model and optimizer
    model = AutoEncoder().to(device)
    if args.reuse is not None:
        model.load_state_dict(torch.load(args.reuse))
    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    # start training
    loss_list = []
    for e in range(1, args.epoch + 1):
        print(f'Start {e} epoch')
        for i, (content, style) in tqdm(enumerate(train_loader, 1)):
            content = content.to(device)
            style = style.to(device)
            loss = model(content, style, args.patch_size, args.alpha,
                         args.lam1, args.lam2)
            loss_list.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(
                f'[{e}/total {args.epoch} epoch],[{i} /'
                f'total {round(iters/args.batch_size)} iteration]: {loss.item()}'
            )

            if i % args.snapshot_interval == 0:
                # BUG FIX: a plain next(test_iter) raises StopIteration once
                # the test loader is exhausted; restart the iterator instead.
                try:
                    content, style = next(test_iter)
                except StopIteration:
                    test_iter = iter(test_loader)
                    content, style = next(test_iter)
                content = content.to(device)
                style = style.to(device)
                with torch.no_grad():
                    out = model.generate(content, style, args.patch_size,
                                         args.alpha)
                content = denorm(content, device)
                style = denorm(style, device)
                out = denorm(out, device)
                res = torch.cat([content, style, out], dim=0)
                res = res.to('cpu')
                save_image(res,
                           f'{image_dir}/{e}_epoch_{i}_iteration.png',
                           nrow=args.batch_size)
        torch.save(model.state_dict(), f'{model_state_dir}/{e}_epoch.pth')

    # persist the loss curve and raw values
    plt.plot(range(len(loss_list)), loss_list)
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.title('train loss')
    plt.savefig(f'{loss_dir}/train_loss.png')
    with open(f'{loss_dir}/loss_log.txt', 'w') as f:
        for l in loss_list:
            f.write(f'{l}\n')
    print(f'Loss saved in {loss_dir}')
class Trainer(object):
    """Trains a denoising autoencoder: Gaussian-noised images in, clean out.

    The BCE loss drives the weight updates; the MSE loss is computed for
    monitoring only. Fixed input/noisy samples are saved once, then
    reconstructions are snapshotted periodically during training.
    """

    def __init__(self, train_loader, test_loader, config):
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # hyper-parameters and bookkeeping pulled from the config
        self.num_epochs = config.num_epochs
        self.lr = config.lr
        self.in_channel = config.in_channel
        self.image_size = config.image_size
        self.hidden_dim = config.hidden_dim
        self.output_dim = config.output_dim
        self.log_interval = config.log_interval
        self.sample_interval = config.sample_interval
        self.ckpt_interval = config.ckpt_interval
        self.sample_folder = config.sample_folder
        self.ckpt_folder = config.ckpt_folder

        self.build_net()
        self.vis = Visualizer()

    def build_net(self):
        # define network
        self.net = AutoEncoder(self.in_channel, self.image_size,
                               self.hidden_dim, self.output_dim)

        if self.config.mode == 'test' and self.config.training_path == '':
            print("[*] Enter model path!")
            exit()

        # if training model exists
        if self.config.training_path != '':
            # map_location keeps GPU-trained checkpoints loadable on CPU
            self.net.load_state_dict(
                torch.load(self.config.training_path,
                           map_location=lambda storage, loc: storage))
            print("[*] Load weight from {}!".format(self.config.training_path))

        self.net.to(self.device)

    # add noise to image
    def add_noise(self, imgs):
        # BUG FIX: randn_like keeps the noise on the same device/dtype as
        # imgs; torch.randn(imgs.size()) is CPU-only and crashed when imgs
        # lived on CUDA.
        noise = torch.randn_like(imgs) * 0.4
        noisy_imgs = noise + imgs
        return noisy_imgs

    def train(self):
        # define loss function
        bce_criterion = nn.BCELoss().to(self.device)
        mse_criterion = nn.MSELoss().to(self.device)

        # define optimizer
        optimizer = Adam(self.net.parameters(), self.lr)

        step = 0
        print("[*] Learning started!")

        # get fixed sample
        temp_iter = iter(self.train_loader)
        fixed_imgs, _ = next(temp_iter)
        fixed_imgs = fixed_imgs.to(self.device)

        # save fixed sample image
        x_path = os.path.join(self.sample_folder, 'fixed_input.png')
        save_image(fixed_imgs, x_path, normalize=True)
        print("[*] Save fixed input image!")

        # make fixed noisy sample and save
        fixed_noisy_imgs = self.add_noise(fixed_imgs)
        noisy_x_path = os.path.join(self.sample_folder,
                                    'fixed_noisy_input.png')
        save_image(fixed_noisy_imgs, noisy_x_path, normalize=True)
        print("[*] Save fixed noisy input image!")

        # flatten data tensors
        fixed_imgs = fixed_imgs.view(fixed_imgs.size(0), -1)
        fixed_noisy_imgs = fixed_noisy_imgs.view(fixed_imgs.size(0), -1)

        for epoch in range(self.num_epochs):
            for i, (imgs, _) in enumerate(self.train_loader):
                self.net.train()

                imgs = imgs.view(imgs.size(0), -1)  # original images
                noisy_imgs = self.add_noise(imgs)  # add noise
                # BUG FIX: move BOTH input and target to the device; the loss
                # target previously stayed on CPU and crashed on CUDA.
                imgs = imgs.to(self.device)
                noisy_imgs = noisy_imgs.to(self.device)

                # forwarding
                outputs = self.net(noisy_imgs)  # use noisy image as input
                bce_loss = bce_criterion(outputs, imgs)
                mse_loss = mse_criterion(outputs, imgs)

                # backwarding
                optimizer.zero_grad()
                bce_loss.backward()  # backward BCE loss
                optimizer.step()

                # do logging
                if (step + 1) % self.log_interval == 0:
                    print("[{}/{}] [{}/{}] BCE loss: {:3f}, MSE loss:{:3f}".
                          format(epoch + 1, self.num_epochs, i + 1,
                                 len(self.train_loader),
                                 bce_loss.item() / len(imgs),
                                 mse_loss.item() / len(imgs)))
                    self.vis.plot("BCE Loss plot",
                                  bce_loss.item() / len(imgs))
                    self.vis.plot("MSE Loss plot",
                                  mse_loss.item() / len(imgs))

                # do sampling
                if (step + 1) % self.sample_interval == 0:
                    # no_grad: snapshotting must not build an autograd graph
                    with torch.no_grad():
                        outputs = self.net(fixed_noisy_imgs)
                    x_hat = outputs.cpu().data.view(outputs.size(0), -1,
                                                    self.image_size,
                                                    self.image_size)
                    x_hat_path = os.path.join(
                        self.sample_folder,
                        'output_epoch{}.png'.format(epoch + 1))
                    save_image(x_hat, x_hat_path, normalize=True)
                    print("[*] Save sample images!")

                step += 1

            if (epoch + 1) % self.ckpt_interval == 0:
                ckpt_path = os.path.join(
                    self.ckpt_folder, 'ckpt_epoch{}.pth'.format(epoch + 1))
                torch.save(self.net.state_dict(), ckpt_path)
                print("[*] Checkpoint saved!")

        print("[*] Learning finished!")
        ckpt_path = os.path.join(self.ckpt_folder, 'final_model.pth')
        torch.save(self.net.state_dict(), ckpt_path)
        print("[*] Final weight saved!")
loss.backward() optimizer.step() total_loss += loss.item() total += 1 pbar.update(1) train_loss = total_loss / total network.eval() total_loss = 0 total = 0 with torch.no_grad(): with tqdm(total=test_gen.get_total_samples() / batch_size) as pbar: for audios, labels in test_gen.generator(): if np.min(audios) != 0 and np.max(audios) != 0 and np.min(labels) != 0 and np.max(labels) != 0: audios = audios / 60 labels = labels / 60 audios = torch.from_numpy(audios).float().cuda() labels = torch.from_numpy(labels).float().cuda() outputs = network.forward(audios) loss = criterion(outputs, labels) total_loss += loss.item() total += 1 test_loss = total_loss / total torch.save(network.state_dict(), "model.pt") print("epoch: ", epoch, "train loss: ", train_loss, "test loss: ", test_loss)
d_loss = dis_loss(real_dis, validity) optimizer_b.zero_grad() d_loss.backward(retain_graph=True) optimizer_b.step() return _loss print('training for {} steps'.format(args.n_steps)) for epoch in range(args.n_steps): # for idx, (images, _) in enumerate(dataloader): a = next(itera) b = next(iterb) images_a = torch.tensor(a, device=device).float() images_a = images_a.to(device) images_b = torch.tensor(b, device=device).float() images_b = images_b.to(device) loss_a = train_step(images_a, version='a') loss_b = train_step(images_b, version='b') to_print = "Epoch[{}/{}] Loss A:{}, Loss B:{}".format(epoch+1, args.n_steps, loss_a.data, loss_b.data) if epoch % 1000 == 0: print(to_print) model_state_dict = model.state_dict() torch.save(model_state_dict, '{}/{}.pt'.format(args.saved_dir, args.model_name)) if save: model_state_dict = model.state_dict() torch.save(model_state_dict, '{}/model.pt'.format(args.saved_dir)) else: model.load_state_dict(torch.load('{}/model.pt'.format(args.saved_dir)))
def train(self, config):
    """Training routine for the image autoencoder.

    Builds ImageFolder datasets/loaders from ``config.data_dir``, trains
    with an MSE reconstruction loss, logs loss/accuracy to TensorBoard,
    checkpoints every ``config.rep_intv`` iterations, validates every
    ``config.val_intv`` iterations, and keeps the best-accuracy model in
    a separate file. Resumes from an existing checkpoint when
    ``config.resume`` is set.
    """
    # Initialize datasets for both training and validation
    train_data = torchvision.datasets.ImageFolder(
        root=os.path.join(config.data_dir, "train"),
        transform=torchvision.transforms.ToTensor())
    valid_data = torchvision.datasets.ImageFolder(
        root=os.path.join(config.data_dir, "valid"),
        transform=torchvision.transforms.ToTensor())

    # Create data loader for training and validation.
    tr_data_loader = torch.utils.data.DataLoader(
        dataset=train_data,
        batch_size=config.batch_size,
        num_workers=config.numWorker,
        shuffle=True)
    va_data_loader = torch.utils.data.DataLoader(
        dataset=valid_data,
        batch_size=config.batch_size,
        num_workers=config.numWorker,
        shuffle=False)

    # Create model instance.
    model = AutoEncoder()
    # Move model to gpu if cuda is available
    if torch.cuda.is_available():
        model = model.cuda()
    # Make sure that the model is set for training
    model.train()

    # Create loss objects
    data_loss = nn.MSELoss()

    # Create optimizier
    optimizer = optim.Adam(model.parameters(), lr=config.learn_rate)
    # No need to move the optimizer (as of PyTorch 1.0), it lies in the same
    # space as the model

    # Create summary writer
    tr_writer = SummaryWriter(
        log_dir=os.path.join(config.log_dir, "train"))
    va_writer = SummaryWriter(
        log_dir=os.path.join(config.log_dir, "valid"))

    # Create log directory and save directory if it does not exist
    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)
    if not os.path.exists(config.save_dir):
        os.makedirs(config.save_dir)

    # Initialize training
    iter_idx = -1  # make counter start at zero
    best_va_acc = 0  # to check if best validation accuracy

    # Prepare checkpoint file and model file to save and load from
    checkpoint_file = os.path.join(config.save_dir, "checkpoint.pth")
    bestmodel_file = os.path.join(config.save_dir, "best_model.pth")

    # Check for existing training results. If they exist, and the
    # configuration is set to resume (`config.resume==True`), resume from
    # previous training. If not, delete the existing checkpoint.
    if os.path.exists(checkpoint_file):
        if config.resume:
            # Use `torch.load` to load the checkpoint file and the load the
            # things that are required to continue training. For the model and
            # the optimizer, use `load_state_dict`. It's actually a good idea
            # to code the saving part first and then code this part.
            print("Checkpoint found! Resuming")  # TODO proper logging
            # Read checkpoint file.

            # Fix gpu -> cpu bug: map storage onto whichever device is
            # available now, not the device the checkpoint was saved from.
            compute_device = 'cuda' if torch.cuda.is_available() else 'cpu'
            load_res = torch.load(checkpoint_file,
                                  map_location=compute_device)

            # Resume iterations
            iter_idx = load_res["iter_idx"]
            # Resume best va result
            best_va_acc = load_res["best_va_acc"]
            # Resume model
            model.load_state_dict(load_res["model"])
            # Resume optimizer
            optimizer.load_state_dict(load_res["optimizer"])
            # Note that we do not resume the epoch, since we will never be able
            # to properly recover the shuffling, unless we remember the random
            # seed, for example. For simplicity, we will simply ignore this,
            # and run `config.num_epoch` epochs regardless of resuming.
        else:
            os.remove(checkpoint_file)

    # Training loop
    for epoch in range(config.num_epoch):
        # For each iteration
        prefix = "Training Epoch {:3d}: ".format(epoch)
        for data in tqdm(tr_data_loader, desc=prefix):
            # Counter
            iter_idx += 1

            # Split the data
            # x is img, y is label
            x, y = data

            # Send data to GPU if we have one
            if torch.cuda.is_available():
                x = x.cuda()
                y = y.cuda()

            # Apply the model to obtain scores (forward pass).
            # Reconstruction target is the input itself (autoencoder).
            logits = model.forward(x)
            # Compute the loss
            loss = data_loss(logits, x.float())
            # Compute gradients
            loss.backward()
            # Update parameters
            optimizer.step()
            # Zero the parameter gradients in the optimizer
            optimizer.zero_grad()

            # Monitor results every report interval
            if iter_idx % config.rep_intv == 0:
                # Compute accuracy (No gradients required). We'll wrapp this
                # part so that we prevent torch from computing gradients.
                # NOTE(review): argmax over dim=1 reduces that dimension, so
                # `pred.view(x.size())` only works if the reduced tensor still
                # has x.numel() elements — confirm this "accuracy" is
                # meaningful for reconstruction outputs.
                with torch.no_grad():
                    pred = torch.argmax(logits, dim=1)
                    acc = torch.mean(
                        torch.eq(pred.view(x.size()), x).float()) * 100.0
                # Write loss and accuracy to tensorboard, using keywords `loss`
                # and `accuracy`.
                tr_writer.add_scalar("loss", loss, global_step=iter_idx)
                tr_writer.add_scalar("accuracy", acc, global_step=iter_idx)
                # Save a rolling checkpoint so training can be resumed.
                torch.save(
                    {
                        "iter_idx": iter_idx,
                        "best_va_acc": best_va_acc,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "loss": loss,
                        "epoch": epoch,
                        "acc": acc
                    }, checkpoint_file)

            # Validate results every validation interval
            if iter_idx % config.val_intv == 0:
                # List to contain all losses and accuracies for all the
                # training batches
                va_loss = []
                va_acc = []
                # Set model for evaluation
                model = model.eval()
                for data in va_data_loader:
                    # Split the data
                    x, y = data
                    # Send data to GPU if we have one
                    if torch.cuda.is_available():
                        x = x.cuda()
                        y = y.cuda()
                    # Apply forward pass to compute the losses
                    # and accuracies for each of the validation batches
                    with torch.no_grad():
                        # Compute logits
                        logits = model.forward(x)
                        # Compute loss and store as numpy
                        loss = data_loss(logits, x.float())
                        va_loss += [loss.cpu().numpy()]
                        # Compute accuracy and store as numpy
                        pred = torch.argmax(logits, dim=1)
                        acc = torch.mean(
                            torch.eq(pred.view(x.size()), x).float()) * 100.0
                        va_acc += [acc.cpu().numpy()]
                # Set model back for training
                model = model.train()
                # Take average
                va_loss = np.mean(va_loss)
                va_acc = np.mean(va_acc)

                # Write to tensorboard using `va_writer`
                va_writer.add_scalar("loss", va_loss, global_step=iter_idx)
                va_writer.add_scalar("accuracy", va_acc,
                                     global_step=iter_idx)
                # Check if best accuracy
                if va_acc > best_va_acc:
                    best_va_acc = va_acc
                    # Save best model using torch.save. Similar to previous
                    # save but at location defined by `bestmodel_file`
                    torch.save(
                        {
                            "iter_idx": iter_idx,
                            "best_va_acc": best_va_acc,
                            "model": model.state_dict(),
                            "optimizer": optimizer.state_dict(),
                            "loss": loss,
                            "acc": acc
                        }, bestmodel_file)