# NOTE(review): this fragment begins mid-script -- `schedule`, `model`, `input_`,
# `x0`, `args`, `height` and `width` are all defined before this chunk, and the
# original line breaks/indentation were lost; the layout below is a
# token-identical reconstruction.

# Run the scheduled forward pass, handing it the model's live parameters.
x1 = schedule.forward(input_, *list(model.state_dict(keep_vars=True).values()))
# googlenet presumably returns a tuple (main output + aux heads), so only
# element 0 is compared against the scheduled output -- TODO confirm.
print(
    'Forward mean absolute difference',
    abs(x0[0] - x1).mean() if 'googlenet' in args.model else abs(x0 - x1).mean())
# Backpropagate a constant gradient of -1 through the schedule.
schedule.backward(-torch.ones_like(x1))
print('Gradient of normal model')
# One line per parameter: mean gradient, name, shape -- for eyeballing against
# the reference (non-scheduled) run.
gradient_diff = [
    "{:.5f} {} {}".format(float(v.grad.mean()), n, v.shape)
    for n, v in model.named_parameters() if v.grad is not None
]
for gd in gradient_diff:
    print(gd)
exit()

# NOTE(review): as reconstructed, everything below is unreachable because of
# the exit() above -- these statements presumably sat in a different branch in
# the original file; verify against the full source.
if args.run_bs:
    graph = Graph.create(model, input_shape=(3, height, width))
    model.cuda()
    solvert = -1
    bs = int(args.bs)
    print("Solver trying batch size %d" % bs)
    # Reuse a precomputed solver solution when a file is supplied on the CLI...
    if len(args.solution_file) > 0:
        solver_info, solution = load_solution(args.solution_file)
    else:
        # ...otherwise build a random input batch for the solver to profile.
        # NOTE(review): this else-branch appears to continue past this chunk.
        input_ = torch.randn((bs, 3, height, width)).cuda()
def train_eval_model(opts):
    """Train a UNet segmentation model and evaluate it once per epoch.

    Args:
        opts: dict of configuration values -- epoch count, batch sizes,
            optimizer/criterion settings, data and log directories,
            checkpoint/resume paths, display options.

    Side effects:
        Sets CUDA_VISIBLE_DEVICES; writes a log file, source-code backups and
        model checkpoints under opts["log_dir"]; prints progress to stdout.
    """
    # ---- parse model configuration ----
    num_epochs = opts["num_epochs"]
    train_batch_size = opts["train_batch_size"]
    val_batch_size = opts["eval_batch_size"]
    dataset_type = opts["dataset_type"]
    opti_mode = opts["optimizer"]
    loss_criterion = opts["loss_criterion"]
    lr = opts["lr"]
    lr_decay = opts["lr_decay"]
    wd = opts["weight_decay"]
    gpus = opts["gpu_list"].split(',')
    # BUG FIX: CUDA reads CUDA_VISIBLE_DEVICES (plural); the original set
    # CUDA_VISIBLE_DEVICE, which CUDA ignores, so GPU masking never happened.
    os.environ['CUDA_VISIBLE_DEVICES'] = opts["gpu_list"]
    train_dir = opts["log_dir"]
    train_data_dir = opts["train_data_dir"]
    eval_data_dir = opts["eval_data_dir"]
    pretrained = opts["pretrained_model"]
    resume = opts["resume"]
    display_iter = opts["display_iter"]
    save_epoch = opts["save_every_epoch"]
    show = opts["vis"]

    # ---- back up train configs and source files ----
    log_file = os.path.join(train_dir, "log_file.txt")
    os.makedirs(train_dir, exist_ok=True)
    model_dir = os.path.join(train_dir, "code_backup")
    os.makedirs(model_dir, exist_ok=True)
    # A fresh run (not a resume) starts with a clean log.
    if resume is None and os.path.exists(log_file):
        os.remove(log_file)
    shutil.copy("./models/unet.py", os.path.join(model_dir, "unet.py"))
    shutil.copy("./trainer_unet.py", os.path.join(model_dir, "trainer_unet.py"))
    shutil.copy("./datasets/dataset.py", os.path.join(model_dir, "dataset.py"))
    ckt_dir = os.path.join(train_dir, "checkpoints")
    os.makedirs(ckt_dir, exist_ok=True)

    # ---- format printing configs ----
    print("*" * 50)
    table_key = []
    table_value = []
    n = 0
    for key, value in opts.items():
        table_key.append(key)
        table_value.append(str(value))
        n += 1
    print_table([table_key, ["="] * n, table_value])

    # ---- format gpu list ----
    # (loop variable renamed: the original `id` shadowed the builtin)
    gpu_list = [int(str_id) for str_id in gpus]

    # ---- dataloader ----
    print("==> Create dataloader")
    dataloaders_dict = {
        "train": er_data_loader(train_data_dir, train_batch_size, dataset_type,
                                is_train=True),
        "eval": er_data_loader(eval_data_dir, val_batch_size, dataset_type,
                               is_train=False)
    }

    # ---- define network ----
    print("==> Create network")
    num_channels = 1
    num_classes = 1
    model = UNet(num_channels, num_classes)
    init_weights(model)

    # loss layer
    criterion = create_criterion(criterion=loss_criterion)

    best_acc = 0.0
    start_epoch = 0

    # ---- load pretrained model (frozen-weights start) ----
    if pretrained is not None and os.path.isfile(pretrained):
        print("==> Train from model '{}'".format(pretrained))
        checkpoint_gan = torch.load(pretrained)
        model.load_state_dict(checkpoint_gan['model_state_dict'])
        # (message fix: stray ')' removed from the original format string)
        print("==> Loaded checkpoint '{}'".format(pretrained))
        # Freeze everything loaded from the pretrained model.
        for param in model.parameters():
            param.requires_grad = False
    # ---- resume training ----
    elif resume is not None and os.path.isfile(resume):
        print("==> Resume from checkpoint '{}'".format(resume))
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch'] + 1
        best_acc = checkpoint['best_acc']
        model_dict = model.state_dict()
        # Keep only checkpoint entries whose name AND shape match the model.
        pretrained_dict = {
            k: v
            for k, v in checkpoint['model_state_dict'].items()
            if k in model_dict and v.size() == model_dict[k].size()
        }
        model_dict.update(pretrained_dict)
        # BUG FIX: load the merged full state dict, not the filtered subset --
        # load_state_dict is strict by default and raises on missing keys,
        # and merging into model_dict was clearly the intent.
        model.load_state_dict(model_dict)
        print("==> Loaded checkpoint '{}' (epoch {})".format(
            resume, checkpoint['epoch'] + 1))
    # ---- train from scratch ----
    else:
        print("==> Train from initial or random state.")

    # ---- multiple-gpu mode ----
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.cuda()
    model = nn.DataParallel(model)

    # ---- print learnable parameters ----
    print("==> List learnable parameters")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print("\t{}, size {}".format(name, param.size()))
    params_to_update = [{'params': model.parameters()}]

    # ---- define optimizer ----
    print("==> Create optimizer")
    optimizer = create_optimizer(params_to_update, opti_mode, lr=lr,
                                 momentum=0.9, wd=wd)
    # Restore optimizer state when resuming (`checkpoint` was loaded above
    # under the identical condition).
    if resume is not None and os.path.isfile(resume):
        optimizer.load_state_dict(checkpoint['optimizer'])

    # ---- start training: each epoch has a training and validation phase ----
    since = time.time()
    print("==> Start training")
    total_steps = 0
    for epoch in range(start_epoch, num_epochs):
        print('-' * 50)
        print("==> Epoch {}/{}".format(epoch + 1, num_epochs))

        total_steps = train_one_epoch(epoch, total_steps,
                                      dataloaders_dict['train'], model, device,
                                      criterion, optimizer, lr, lr_decay,
                                      display_iter, log_file, show)

        epoch_acc, epoch_iou, epoch_f1 = eval_one_epoch(
            epoch, dataloaders_dict['eval'], model, device, log_file)

        # Track the best model, ignoring the first 5 warm-up epochs.
        if best_acc < epoch_acc and epoch >= 5:
            best_acc = epoch_acc
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_acc': best_acc
                }, os.path.join(ckt_dir, "best.pth"))

        # Periodic snapshot every `save_epoch` epochs, starting at epoch 20.
        if (epoch + 1) % save_epoch == 0 and (epoch + 1) >= 20:
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_iou': epoch_iou
                },
                os.path.join(ckt_dir,
                             "checkpoints_" + str(epoch + 1) + ".pth"))

    time_elapsed = time.time() - since
    time_message = 'Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60)
    print(time_message)
    with open(log_file, "a+") as fid:
        fid.write('%s\n' % time_message)
    print('==> Best val Acc: {:4f}'.format(best_acc))