def __init__(self, args):
    warnings.filterwarnings('ignore')
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        args.backbone, args.dataset, args.exp)
    if args.dataset == 'pascal':
        raise NotImplementedError
    elif args.dataset == 'cityscapes':
        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        dataset_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    elif args.dataset == 'marsh':
        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))
    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
        # Warm-start from the best checkpoint of the architecture search;
        # strict=False skips keys that are missing from or mismatched with
        # the retraining model.
        model.load_state_dict(
            torch.load('./run/marsh/deeplab-autodeeplab/model_best.pth.tar')['state_dict'],
            strict=False)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))
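# A minimal sketch (hypothetical helper, not part of this repo) of making the
# partial warm start above explicit instead of relying on strict=False: keep
# only entries whose name and shape both match the target model.
def filtered_state_dict(model, state_dict):
    model_state = model.state_dict()
    return {k: v for k, v in state_dict.items()
            if k in model_state and v.shape == model_state[k].shape}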
def evaluate():
    # setup
    warnings.filterwarnings('ignore')
    cfg = config_factory['resnet_cityscapes']
    args = obtain_retrain_autodeeplab_args()
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
            world_size=torch.cuda.device_count(),
            rank=args.local_rank)
        setup_logger(cfg.respth)
    else:
        FORMAT = '%(levelname)s %(filename)s(%(lineno)d): %(message)s'
        log_level = logging.INFO
        if dist.is_initialized() and dist.get_rank() != 0:
            log_level = logging.ERROR
        logging.basicConfig(level=log_level, format=FORMAT, stream=sys.stdout)
    logger = logging.getLogger()

    # model
    logger.info('setup and restore model')
    net = Retrain_Autodeeplab(args)
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    net.load_state_dict(torch.load(save_pth))
    net.cuda()
    net.eval()
    if args.local_rank != -1:
        net = nn.parallel.DistributedDataParallel(
            net, device_ids=[args.local_rank], output_device=args.local_rank)

    # evaluator
    logger.info('compute the mIOU')
    evaluator = MscEval(cfg, args)
    mIOU = evaluator(net)
    logger.info('mIOU is: {:.6f}'.format(mIOU))
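# MscEval is this repo's multi-scale evaluator; a minimal sketch of the mIoU
# reduction it is assumed to perform from a (C, C) confusion matrix
# (names here are illustrative, not the repo's API):
import numpy as np

def mean_iou(hist):
    # rows = ground-truth class, cols = predicted class
    inter = np.diag(hist)
    union = hist.sum(axis=0) + hist.sum(axis=1) - inter
    return np.nanmean(inter / np.maximum(union, 1))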
def main():
    warnings.filterwarnings('ignore')
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    args = obtain_retrain_autodeeplab_args()
    args.data_dict = {}
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        args.backbone, args.dataset, args.exp)
    if args.dataset == 'pascal':
        raise NotImplementedError
    elif args.dataset == 'cityscapes':
        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        dataset_loader, num_classes = make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    elif args.dataset == '2d':
        args.data_dict, args.num_classes = make_data_loader(args)
    elif args.dataset == '3d':
        args.data_dict, args.num_classes = make_data_loader_3d_patch(args)
    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if args.criterion == 'Ohem':
        # OHEM keeps at least 1/16 of the pixels of a per-GPU batch.
        args.thresh = 0.7
        args.crop_size = [args.crop_size, args.crop_size] if isinstance(args.crop_size, int) else args.crop_size
        args.n_min = int((args.batch_size / len(args.gpu) * args.crop_size[0] * args.crop_size[1]) // 16)
    criterion = build_criterion(args)

    model = nn.DataParallel(model).cuda()
    model.train()
    if args.freeze_bn:
        # Freeze BatchNorm statistics and affine parameters.
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False

    optimizer = optim.SGD(model.module.parameters(), lr=args.base_lr,
                          momentum=0.9, weight_decay=0.0001)
    max_iteration = args.data_dict['num_train'] * args.epochs
    scheduler = Iter_LR_Scheduler(args, max_iteration, args.data_dict['num_train'])

    start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {0}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('=> loaded checkpoint {0} (epoch {1})'.format(args.resume, checkpoint['epoch']))
        else:
            raise ValueError('=> no checkpoint found at {0}'.format(args.resume))

    for epoch in range(start_epoch, args.epochs):
        losses = AverageMeter()
        for i in range(args.data_dict['num_train']):
            cur_iter = epoch * args.data_dict['num_train'] + i
            scheduler(optimizer, cur_iter)
            inputs = torch.FloatTensor(args.data_dict['train_data'][i]).cuda()
            target = torch.FloatTensor(args.data_dict['train_mask'][i]).cuda()
            outputs = model(inputs)
            loss = criterion(outputs, target)
            if np.isnan(loss.item()) or np.isinf(loss.item()):
                # Drop into the debugger when the loss diverges.
                pdb.set_trace()
            losses.update(loss.item(), args.batch_size)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print('epoch: {0}\t'
                  'iter: {1}/{2}\t'
                  'lr: {3:.6f}\t'
                  'loss: {loss.val:.4f} ({loss.ema:.4f})'.format(
                      epoch + 1, i + 1, args.data_dict['num_train'],
                      scheduler.get_lr(optimizer), loss=losses))

        # Save every 50th epoch early on, then every epoch for the final 50.
        if epoch < args.epochs - 50:
            if epoch % 50 == 0:
                torch.save({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, model_fname % (epoch + 1))
        else:
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, model_fname % (epoch + 1))

        if epoch % 2 == 0:
            # Run validation and test every other epoch.
            validation(epoch, model, args, criterion, args.num_classes)
            validation(epoch, model, args, criterion, args.num_classes, test_tag=True)
            print('reset local total loss!')
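# The log line above prints loss.val and loss.ema, so AverageMeter is assumed
# to track the last value plus an exponential moving average; a minimal sketch
# consistent with that usage (the smoothing factor alpha is an assumption):
class AverageMeterSketch:
    def __init__(self, alpha=0.99):
        self.alpha = alpha
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0
        self.ema = None

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        self.ema = val if self.ema is None else \
            self.alpha * self.ema + (1 - self.alpha) * val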
def main(start_epoch, epochs):
    assert torch.cuda.is_available(), 'No CUDA device available'
    if not osp.exists('data/'):
        os.mkdir('data/')
    if not osp.exists('log/'):
        os.mkdir('log/')
    args = obtain_evaluate_args()
    torch.backends.cudnn.benchmark = True
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        'autodeeplab', 'cityscapes', 'bnlr7e-3')
    if args.dataset == 'cityscapes':
        dataset = CityscapesSegmentation(args=args,
                                         root=Path.db_root_dir(args.dataset),
                                         split='reval')
        print(dataset)
    else:
        raise NotImplementedError
    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if not args.train:
        val_dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
        model = torch.nn.DataParallel(model).cuda()
        print("======================start evaluate=======================")
        for epoch in range(0, args.epochs):
            print("evaluate epoch {:}".format(epoch + start_epoch))
            checkpoint_name = model_fname % (epoch + start_epoch)
            print(checkpoint_name)
            checkpoint = torch.load(checkpoint_name)
            # TODO: figure out why the two lines below are commented out
            # (as written, the checkpoint is loaded but never applied to the model).
            # state_dict = {k[1:]: v for k, v in checkpoint['state_dict'].items() if 'tracked' not in k}
            # model.module.load_state_dict(state_dict)
            inter_meter = AverageMeter()
            union_meter = AverageMeter()
            for i, sample in enumerate(val_dataloader):
                inputs, target = sample['image'], sample['label']
                # Crop the target to avoid CUDA out-of-memory errors.
                target = target[:, :200, :400]
                N, H, W = target.shape
                total_outputs = torch.zeros((N, dataset.NUM_CLASSES, H, W)).cuda()
                with torch.no_grad():
                    for j, scale in enumerate(args.eval_scales):
                        new_scale = [int(H * scale), int(W * scale)]
                        # Rescale from the original inputs at each scale; reusing
                        # the previously rescaled tensor would compound the scaling.
                        scaled_inputs = F.interpolate(inputs, new_scale,
                                                      mode='bilinear',
                                                      align_corners=True).cuda()
                        outputs = model(scaled_inputs)
                        outputs = F.interpolate(outputs, (H, W), mode='bilinear',
                                                align_corners=True)
                        total_outputs += outputs
                    _, pred = torch.max(total_outputs, 1)
                    pred = pred.detach().cpu().numpy().squeeze().astype(np.uint8)
                    mask = target.numpy().astype(np.uint8)
                    print('eval: {0}/{1}'.format(i + 1, len(val_dataloader)))
                    inter, union = inter_and_union(pred, mask, len(dataset.CLASSES))
                    inter_meter.update(inter)
                    union_meter.update(union)
            iou = inter_meter.sum / (union_meter.sum + 1e-10)
            miou = 'epoch: {0} Mean IoU: {1:.2f}'.format(epoch, iou.mean() * 100)
            with open('log/result.txt', 'a') as f:
                for i, val in enumerate(iou):
                    class_iou = 'IoU {0}: {1:.2f}\n'.format(dataset.CLASSES[i], val * 100)
                    f.write(class_iou)
                f.write('\n')
                f.write(miou)
                f.write('\n')
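# inter_and_union is assumed to histogram per-class intersection and union so
# the meters above can accumulate them across batches; a minimal NumPy sketch
# (treating labels >= num_classes as void pixels is an assumption):
import numpy as np

def inter_and_union_sketch(pred, mask, num_classes):
    valid = mask < num_classes            # drop void/ignore pixels
    pred, mask = pred[valid], mask[valid]
    inter = np.histogram(pred[pred == mask], bins=num_classes,
                         range=(0, num_classes))[0]
    area_pred = np.histogram(pred, bins=num_classes, range=(0, num_classes))[0]
    area_mask = np.histogram(mask, bins=num_classes, range=(0, num_classes))[0]
    union = area_pred + area_mask - inter
    return inter, union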
def main():
    args = obtain_retrain_autodeeplab_args()
    torch.cuda.set_device(args.local_rank)
    cfg = config_factory['resnet_cityscapes']
    if not os.path.exists(cfg.respth):
        os.makedirs(cfg.respth)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()
    rand_seed = random.randint(0, args.manualSeed)
    prepare_seed(rand_seed)
    if args.local_rank == 0:
        log_string = 'seed-{}-time-{}'.format(rand_seed, time_for_file())
        train_logger = Logger(args, log_string)
        train_logger.log('Arguments : -------------------------------')
        for name, value in args._get_kwargs():
            train_logger.log('{:16} : {:}'.format(name, value))
        train_logger.log("Python version : {}".format(sys.version.replace('\n', ' ')))
        train_logger.log("Pillow version : {}".format(PIL.__version__))
        train_logger.log("PyTorch version : {}".format(torch.__version__))
        train_logger.log("cuDNN version : {}".format(torch.backends.cudnn.version()))
        train_logger.log("random_seed : {}".format(rand_seed))
    if args.checkname is None:
        args.checkname = 'deeplab-' + str(args.backbone)

    # dataset
    kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
    train_loader, args.num_classes, sampler = make_data_loader(args=args, **kwargs)

    # model
    model = Retrain_Autodeeplab(args)
    model.train()
    model.cuda()
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[args.local_rank],
                                                output_device=args.local_rank,
                                                find_unused_parameters=True).cuda()
    # OHEM keeps at least 1/16 of the pixels of a per-GPU crop.
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    criterion = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    max_iteration = int(cfg.max_epoch * len(train_loader))
    it = 0

    # optimizer
    optimizer = Optimizer(model, cfg.lr_start, cfg.momentum, cfg.weight_decay,
                          cfg.warmup_steps, cfg.warmup_start_lr, max_iteration,
                          cfg.lr_power)
    if dist.get_rank() == 0:
        print('======optimizer launch successfully , max_iteration {:}!======='.format(max_iteration))

    # train loop
    loss_avg = []
    start_time = glob_start_time = time.time()
    if args.resume is not None:
        checkpoint = torch.load(args.resume, map_location='cpu')
        if checkpoint['iter'] is not None:
            args.train_mode = 'iter'
            start_iter = checkpoint['iter']
            n_epoch = checkpoint['epoch']
        elif checkpoint['epoch'] is not None:
            args.train_mode = 'epoch'
            # Resume point for epoch mode (was missing in the original).
            start_epoch = checkpoint['epoch']
        model.module.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'], checkpoint['iter'])
    else:
        if args.train_mode == 'iter':
            start_iter = 0
            n_epoch = 0
        elif args.train_mode == 'epoch':
            start_epoch = 0

    # String identity comparisons ('is') replaced with equality ('==').
    if args.train_mode == 'iter':
        diter = iter(train_loader)
        for it in range(start_iter, cfg.max_iter):
            try:
                sample = next(diter)
            except StopIteration:
                # Epoch boundary: reshuffle the distributed sampler and restart.
                n_epoch += 1
                sampler.set_epoch(n_epoch)
                diter = iter(train_loader)
                sample = next(diter)
            im, lb = sample['image'].cuda(), sample['label'].cuda()
            lb = torch.squeeze(lb, 1)
            optimizer.zero_grad()
            logits = model(im)
            loss = criterion(logits, lb)
            loss.backward()
            optimizer.step()
            loss_avg.append(loss.item())
            # print training log message
            if it % cfg.msg_iter == 0 and not it == 0 and dist.get_rank() == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                lr = optimizer.lr
                ed = time.time()
                t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                eta = int((max_iteration - it) * (glob_t_intv / it))
                eta = str(datetime.timedelta(seconds=eta))
                msg = ', '.join([
                    'iter: {it}/{max_iteration}',
                    'lr: {lr:4f}',
                    'loss: {loss:.4f}',
                    'eta: {eta}',
                    'time: {time:.4f}',
                ]).format(it=it, max_iteration=max_iteration, lr=lr,
                          loss=loss_avg, time=t_intv, eta=eta)
                # TODO: logger.info errors out beyond iter 350000, so fall back
                # to print there (the original branch was inverted).
                if max_iteration > 350000:
                    print(msg)
                else:
                    logger.info(msg)
                loss_avg = []
            it += 1
            if (cfg.msg_iter is not None) and (it % cfg.msg_iter == 0) and (it != 0):
                if args.verbose:
                    logger.info('evaluating the model of iter:{}'.format(it))
                model.eval()
                evaluator = MscEval(cfg, args)
                mIOU, loss = evaluator(model, criteria=criterion, multi_scale=False)
                logger.info('mIOU is: {}, loss_eval is {}'.format(mIOU, loss))
                model.cpu()
                save_name = 'iter_{}_naive_model.pth'.format(it)
                save_pth = osp.join(cfg.respth, save_name)
                state = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
                checkpoint = {
                    'state_dict': state,
                    'epoch': n_epoch,
                    'iter': it,
                    'optimizer': optimizer.optim.state_dict()
                }
                if dist.get_rank() == 0:
                    # Save the full checkpoint dict so training can resume; the
                    # original saved only the bare state dict, which the resume
                    # code above cannot read.
                    torch.save(checkpoint, save_pth)
                    logger.info('model of iter {} saved to: {}'.format(it, save_pth))
                model.cuda()
                model.train()
    elif args.train_mode == 'epoch':
        for epoch in range(start_epoch, cfg.max_epoch):
            for i, sample in enumerate(train_loader):
                # Track the global iteration so the lr/ETA log below is well
                # defined (it stayed 0 in the original, dividing by zero).
                it = epoch * len(train_loader) + i + 1
                im = sample['image'].cuda()
                lb = sample['label'].cuda()
                lb = torch.squeeze(lb, 1)
                optimizer.zero_grad()
                logits = model(im)
                loss = criterion(logits, lb)
                loss.backward()
                optimizer.step()
                loss_avg.append(loss.item())
                # print training log message
                if i % cfg.msg_iter == 0 and not (i == 0 and epoch == 0) and dist.get_rank() == 0:
                    loss_avg = sum(loss_avg) / len(loss_avg)
                    lr = optimizer.lr
                    ed = time.time()
                    t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                    eta = int((max_iteration - it) * (glob_t_intv / it))
                    eta = str(datetime.timedelta(seconds=eta))
                    msg = ', '.join([
                        'iter: {it}/{max_iteration}',
                        'lr: {lr:4f}',
                        'loss: {loss:.4f}',
                        'eta: {eta}',
                        'time: {time:.4f}',
                    ]).format(it=it, max_iteration=max_iteration, lr=lr,
                              loss=loss_avg, time=t_intv, eta=eta)
                    logger.info(msg)
                    loss_avg = []
            # save model and optimizer each epoch
            if args.verbose:
                logger.info('evaluating the model of iter:{}'.format(it))
            model.eval()
            evaluator = MscEval(cfg, args)
            mIOU, loss = evaluator(model, criteria=criterion, multi_scale=False)
            logger.info('mIOU is: {}, loss_eval is {}'.format(mIOU, loss))
            model.cpu()
            save_name = 'iter_{}_naive_model.pth'.format(it)
            save_pth = osp.join(cfg.respth, save_name)
            state = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
            checkpoint = {
                'state_dict': state,
                'epoch': epoch,  # was n_epoch, which is undefined in epoch mode
                'iter': it,
                'optimizer': optimizer.optim.state_dict()
            }
            if dist.get_rank() == 0:
                torch.save(checkpoint, save_pth)
                logger.info('model of iter {} saved to: {}'.format(it, save_pth))
            model.cuda()
            model.train()
    else:
        raise NotImplementedError
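# OhemCELoss above is assumed to be the usual online-hard-example-mining
# cross-entropy: keep every pixel whose loss exceeds the threshold, but never
# fewer than n_min pixels. A minimal sketch (class name and ignore_index=255
# default are assumptions):
import torch
import torch.nn as nn

class OhemCELossSketch(nn.Module):
    def __init__(self, thresh, n_min, ignore_index=255):
        super().__init__()
        # Store the threshold in -log(prob) space, as is conventional.
        self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float)).item()
        self.n_min = n_min
        self.ce = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='none')

    def forward(self, logits, labels):
        # Per-pixel CE, flattened, hardest pixels first.
        loss = self.ce(logits, labels).view(-1)
        loss, _ = torch.sort(loss, descending=True)
        if loss[self.n_min] > self.thresh:
            loss = loss[loss > self.thresh]   # every pixel harder than thresh
        else:
            loss = loss[:self.n_min]          # but never fewer than n_min
        return torch.mean(loss)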
def __init__(self, args):
    warnings.filterwarnings('ignore')
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        args.backbone, args.dataset, args.exp)
    if args.dataset == 'pascal':
        raise NotImplementedError
    elif args.dataset == 'cityscapes':
        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        dataset_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    elif args.dataset == 'marsh':
        kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
        dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if args.criterion == 'Ohem':
        # OHEM keeps at least 1/16 of the pixels of a per-GPU batch.
        args.thresh = 0.7
        args.crop_size = [args.crop_size, args.crop_size] if isinstance(args.crop_size, int) else args.crop_size
        args.n_min = int((args.batch_size / len(args.gpu) * args.crop_size[0] * args.crop_size[1]) // 16)
    criterion = build_criterion(args)

    model = nn.DataParallel(model).cuda()
    model.train()
    if args.freeze_bn:
        # Freeze BatchNorm statistics and affine parameters.
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False

    optimizer = optim.SGD(model.module.parameters(), lr=args.base_lr,
                          momentum=0.9, weight_decay=0.0001)
    max_iteration = len(dataset_loader) * args.epochs
    scheduler = Iter_LR_Scheduler(args, max_iteration, len(dataset_loader))
    start_epoch = 0

    # Resuming checkpoint
    self.best_pred = 0.0
    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {0}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('=> loaded checkpoint {0} (epoch {1})'.format(args.resume, checkpoint['epoch']))
            self.best_pred = checkpoint['best_pred']
        else:
            raise ValueError('=> no checkpoint found at {0}'.format(args.resume))

    self.args = args
    # Define Saver
    self.saver = Saver(args)
    self.saver.save_experiment_config()
    # Define Tensorboard Summary
    self.summary = TensorboardSummary(self.saver.experiment_dir)
    self.writer = self.summary.create_summary()
    # Define Dataloader. Note: val_loader and test_loader are only bound in
    # the 'marsh' branch above, so this trainer assumes that dataset.
    self.train_loader, self.val_loader, self.test_loader, self.nclass = \
        dataset_loader, val_loader, test_loader, num_classes
    self.criterion = criterion
    self.model, self.optimizer = model, optimizer
    # Define Evaluator
    self.evaluator = Evaluator(self.nclass)
    # Define lr scheduler (per-epoch poly schedule; a stray None that was
    # passed as the second argument has been removed).
    self.scheduler = LR_Scheduler("poly", args.lr, args.epochs, len(self.train_loader))
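# Both Iter_LR_Scheduler and LR_Scheduler("poly", ...) above are assumed to
# implement the polynomial decay commonly used with DeepLab; a minimal sketch
# of that rule (power=0.9 is the conventional default, an assumption here):
def poly_lr(base_lr, cur_iter, max_iter, power=0.9):
    return base_lr * (1 - cur_iter / max_iter) ** power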