def main():
    global args, best_result, output_directory

    # set random seed
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)
    np.random.seed(args.manual_seed)
    random.seed(args.manual_seed)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use GPU ", torch.cuda.current_device())

    train_loader, val_loader = create_loader(args)

    if args.resume:
        # resume from a full checkpoint: model, optimizer and best result are restored
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        optimizer = checkpoint['optimizer']

        # solve 'out of memory'
        model = checkpoint['model']

        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

        # clear memory
        del checkpoint
        # del model_dict
        torch.cuda.empty_cache()
    else:
        print("=> creating Model")
        model = get_models(args.dataset, pretrained=False)
        print("=> model created.")
        start_epoch = 0

        # different modules have different learning rates
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        optimizer = torch.optim.SGD(train_params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        # You can use DataParallel() whether you use multiple GPUs or not
        model = nn.DataParallel(model).cuda()

    # when training, use ReduceLROnPlateau to reduce the learning rate
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=args.lr_patience)

    # loss function
    criterion = criteria.ordLoss()

    # create directory path
    output_directory = utils.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    best_txt = os.path.join(output_directory, 'best.txt')
    config_txt = os.path.join(output_directory, 'config.txt')

    # write training parameters to config file
    if not os.path.exists(config_txt):
        with open(config_txt, 'w') as txtfile:
            args_ = vars(args)
            args_str = ''
            for k, v in args_.items():
                args_str = args_str + str(k) + ':' + str(v) + ',\t\n'
            txtfile.write(args_str)

    # create log
    log_path = os.path.join(
        output_directory, 'logs',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    for epoch in range(start_epoch, args.epochs):
        # record the current learning rate of each parameter group
        for i, param_group in enumerate(optimizer.param_groups):
            old_lr = float(param_group['lr'])
            logger.add_scalar('Lr/lr_' + str(i), old_lr, epoch)

        train(train_loader, model, criterion, optimizer, epoch, logger)  # train for one epoch
        result, img_merge = validate(val_loader, model, epoch, logger)  # evaluate on validation set

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            with open(best_txt, 'w') as txtfile:
                txtfile.write(
                    "epoch={}, rmse={:.3f}, rml={:.3f}, log10={:.3f}, "
                    "d1={:.3f}, d2={:.3f}, d3={:.3f}, "
                    "t_gpu={:.4f}".format(epoch, result.rmse, result.absrel,
                                          result.lg10, result.delta1,
                                          result.delta2, result.delta3,
                                          result.gpu_time))
            if img_merge is not None:
                img_filename = output_directory + '/comparison_best.png'
                utils.save_image(img_merge, img_filename)

        # save checkpoint for each epoch
        utils.save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, epoch, output_directory)

        # when absrel stops falling, reduce the learning rate
        scheduler.step(result.absrel)

    logger.close()
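

# Both main() variants in this section persist training state through
# utils.save_checkpoint(state, is_best, epoch, output_directory), whose implementation
# is not shown here. Below is a minimal sketch of such a helper, assuming the common
# "save every epoch, copy the best one aside" convention; the function name and the
# checkpoint file names are illustrative, not taken from the original utils module.
import os
import shutil

import torch


def save_checkpoint_sketch(state, is_best, epoch, output_directory):
    """Save the training state for `epoch`; keep a copy of the best checkpoint."""
    checkpoint_filename = os.path.join(output_directory,
                                       'checkpoint-{}.pth.tar'.format(epoch))
    torch.save(state, checkpoint_filename)
    if is_best:
        # keep the best-performing checkpoint under a stable name
        shutil.copyfile(checkpoint_filename,
                        os.path.join(output_directory, 'model_best.pth.tar'))
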
def main():
    global args, best_result, output_directory

    # set random seed
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)
    np.random.seed(args.manual_seed)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use GPU ", torch.cuda.current_device())

    train_loader, val_loader = create_loader(args)

    if args.mode == 'test':
        if args.resume:
            assert os.path.isfile(args.resume), \
                "=> no checkpoint found at '{}'".format(args.resume)
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_result = checkpoint['best_result']

            # solve 'out of memory'
            model = checkpoint['model']

            print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

            # clear memory
            del checkpoint
            # del model_dict
            torch.cuda.empty_cache()
        else:
            # without a checkpoint there is no model to evaluate
            print("no trained model to test.")
            return

        result, img_merge = validate(args, val_loader, model, epoch, logger=None)
        print('Test Result: mean iou={result.mean_iou:.3f}, '
              'mean acc={result.mean_acc:.3f}.'.format(result=result))

    elif args.mode == 'train':
        if args.resume:
            assert os.path.isfile(args.resume), \
                "=> no checkpoint found at '{}'".format(args.resume)
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_iter = checkpoint['epoch'] + 1
            best_result = checkpoint['best_result']
            optimizer = checkpoint['optimizer']

            # solve 'out of memory'
            model = checkpoint['model']

            print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

            # clear memory
            del checkpoint
            # del model_dict
            torch.cuda.empty_cache()
        else:
            print("=> creating Model")
            model = get_models(args)
            print("=> model created.")
            start_iter = 1

            # different modules have different learning rates
            train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                            {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
            print(train_params)

            optimizer = torch.optim.SGD(train_params,
                                        lr=args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)

            # You can use DataParallel() whether you use multiple GPUs or not
            model = nn.DataParallel(model).cuda()

        # polynomial learning-rate decay over the training iterations
        scheduler = PolynomialLR(optimizer=optimizer,
                                 step_size=args.lr_decay,
                                 iter_max=args.max_iter,
                                 power=args.power)

        # loss function
        criterion = criteria._CrossEntropyLoss2d(size_average=True,
                                                 batch_average=True)

        # create directory path
        output_directory = utils.get_output_directory(args)
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        best_txt = os.path.join(output_directory, 'best.txt')
        config_txt = os.path.join(output_directory, 'config.txt')

        # write training parameters to config file
        if not os.path.exists(config_txt):
            with open(config_txt, 'w') as txtfile:
                args_ = vars(args)
                args_str = ''
                for k, v in args_.items():
                    args_str = args_str + str(k) + ':' + str(v) + ',\t\n'
                txtfile.write(args_str)

        # create log
        log_path = os.path.join(
            output_directory, 'logs',
            datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
        if os.path.isdir(log_path):
            shutil.rmtree(log_path)
        os.makedirs(log_path)
        logger = SummaryWriter(log_path)

        # train
        model.train()
        if args.freeze:
            model.module.freeze_backbone_bn()

        output_directory = utils.get_output_directory(args, check=True)
        average_meter = AverageMeter()

        # cycle over the training set across iterations
        loader_iter = iter(train_loader)

        for it in tqdm(range(start_iter, args.max_iter + 1),
                       total=args.max_iter,
                       leave=False,
                       dynamic_ncols=True):
            # for it in range(1, args.max_iter + 1):
            # Clear gradients (ready to accumulate)
            optimizer.zero_grad()

            loss = 0
            data_time = 0
            gpu_time = 0

            for _ in range(args.iter_size):
                end = time.time()
                try:
                    samples = next(loader_iter)
                except StopIteration:
                    # restart the iterator when the data loader is exhausted
                    loader_iter = iter(train_loader)
                    samples = next(loader_iter)

                input = samples['image'].cuda()
                target = samples['label'].cuda()

                torch.cuda.synchronize()
                data_time_ = time.time()
                data_time += data_time_ - end

                with torch.autograd.detect_anomaly():
                    preds = model(input)  # @wx: note the outputs
                    # print('#train preds size:', len(preds))
                    # print('#train preds[0] size:', preds[0].size())

                    iter_loss = 0
                    if args.msc:
                        for pred in preds:
                            # Resize labels for {100%, 75%, 50%, Max} logits
                            target_ = utils.resize_labels(
                                target, shape=(pred.size()[-2], pred.size()[-1]))
                            # print('#train pred size:', pred.size())
                            iter_loss += criterion(pred, target_)
                    else:
                        pred = preds
                        target_ = utils.resize_labels(
                            target, shape=(pred.size()[-2], pred.size()[-1]))
                        # print('#train pred size:', pred.size())
                        # print('#train target size:', target.size())
                        iter_loss += criterion(pred, target_)

                    # Backpropagate (just compute gradients wrt the loss)
                    iter_loss /= args.iter_size
                    iter_loss.backward()
                    loss += float(iter_loss)

                # synchronize before timing the GPU work
                torch.cuda.synchronize()
                gpu_time += time.time() - data_time_

            # Update weights with accumulated gradients
            optimizer.step()

            # Update learning rate
            scheduler.step(epoch=it)

            # measure accuracy and record loss
            result = Result()
            pred = F.softmax(pred, dim=1)
            result.evaluate(pred.data.cpu().numpy(),
                            target.data.cpu().numpy(),
                            n_class=21)
            average_meter.update(result, gpu_time, data_time, input.size(0))

            if it % args.print_freq == 0:
                print('=> output: {}'.format(output_directory))
                print('Train Iter: [{0}/{1}]\t'
                      't_Data={data_time:.3f}({average.data_time:.3f}) '
                      't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                      'Loss={Loss:.5f} '
                      'MeanAcc={result.mean_acc:.3f}({average.mean_acc:.3f}) '
                      'MIOU={result.mean_iou:.3f}({average.mean_iou:.3f}) '.format(
                          it, args.max_iter,
                          data_time=data_time,
                          gpu_time=gpu_time,
                          Loss=loss,
                          result=result,
                          average=average_meter.average()))
                logger.add_scalar('Train/Loss', loss, it)
                logger.add_scalar('Train/mean_acc', result.mean_acc, it)
                logger.add_scalar('Train/mean_iou', result.mean_iou, it)

                for i, param_group in enumerate(optimizer.param_groups):
                    old_lr = float(param_group['lr'])
                    logger.add_scalar('Lr/lr_' + str(i), old_lr, it)

            if it % args.iter_save == 0:
                result, img_merge = validate(args, val_loader, model,
                                             epoch=it, logger=logger)

                # remember best mean IoU and save checkpoint
                is_best = result.mean_iou > best_result.mean_iou
                if is_best:
                    best_result = result
                    with open(best_txt, 'w') as txtfile:
                        txtfile.write(
                            "Iter={}, mean_iou={:.3f}, mean_acc={:.3f}, "
                            "t_gpu={:.4f}".format(it, result.mean_iou,
                                                  result.mean_acc,
                                                  result.gpu_time))
                    if img_merge is not None:
                        img_filename = output_directory + '/comparison_best.png'
                        utils.save_image(img_merge, img_filename)

                # save checkpoint at each save interval
                utils.save_checkpoint(
                    {
                        'args': args,
                        'epoch': it,
                        'model': model,
                        'best_result': best_result,
                        'optimizer': optimizer,
                    }, is_best, it, output_directory)

                # change back to train mode after validation
                model.train()
                if args.freeze:
                    model.module.freeze_backbone_bn()

        logger.close()
    else:
        print('no mode named as ', args.mode)
        exit(-1)
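

# The segmentation main() above constructs a PolynomialLR scheduler with
# (optimizer, step_size, iter_max, power) and steps it once per iteration; the class
# itself is not shown in this section and is not a torch built-in. Below is a minimal
# sketch under that assumed interface, decaying the learning rate every `step_size`
# iterations as lr = base_lr * (1 - it / iter_max) ** power. The class name is
# illustrative, not the original implementation.
from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLRSketch(_LRScheduler):
    def __init__(self, optimizer, step_size, iter_max, power, last_epoch=-1):
        self.step_size = step_size
        self.iter_max = iter_max
        self.power = power
        super(PolynomialLRSketch, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # Only change the learning rate every `step_size` iterations and never past
        # `iter_max`; otherwise keep the current per-group values unchanged.
        if (self.last_epoch == 0 or self.last_epoch % self.step_size != 0
                or self.last_epoch > self.iter_max):
            return [group['lr'] for group in self.optimizer.param_groups]
        factor = (1 - float(self.last_epoch) / self.iter_max) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]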