def train(epoch, optim, net, criteria, lr_schdr):
    ## dataset
    dl = get_data_loader(
        cfg.im_root, cfg.train_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        mode='train')

    ## meters
    time_meter, loss_meter = set_meters(epoch)

    ## train loop
    for it, (im, lb) in enumerate(tqdm(dl)):
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        aux_loss, main_loss = net(im)
        aux_criteria = criteria(aux_loss, lb)
        main_criteria = criteria(main_loss, lb)
        loss = main_criteria + 0.1 * aux_criteria
        # if has_apex:
        #     with amp.scale_loss(loss, optim) as scaled_loss:
        #         scaled_loss.backward()
        # else:
        loss = loss.mean()
        loss.backward()
        optim.step()

        time_meter.update()
        loss_meter.update(loss.item())
        lr_schdr.step()

    return lr_schdr, time_meter, loss_meter
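# NOTE: a minimal, hypothetical driver for the epoch-based train() above; the
# helper names (set_model, set_optimizer) and cfg fields (cfg.epoch,
# cfg.max_iter, cfg.warmup_iters) are assumptions borrowed from the other
# variants in this file, not something this variant defines.
def main():
    net, criteria = set_model()  # assumed to return the model and the loss criterion
    net.cuda()
    optim = set_optimizer(net)
    lr_schdr = WarmupPolyLrScheduler(
        optim, power=0.9, max_iter=cfg.max_iter,
        warmup_iter=cfg.warmup_iters, warmup_ratio=0.1,
        warmup='exp', last_epoch=-1)
    for epoch in range(cfg.epoch):
        lr_schdr, time_meter, loss_meter = train(
            epoch, optim, net, criteria, lr_schdr)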
def eval_model(net, ims_per_gpu, im_root, im_anns, n_classes, cropsize):
    dl = get_data_loader(
        im_root, im_anns, ims_per_gpu, None, None,
        mode='test', distributed=False)
    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()
    '''
    single_scale = MscEvalV0((1., ), False)
    mIOU = single_scale(net, dl, n_classes)
    heads.append('single_scale')
    mious.append(mIOU)
    logger.info('single mIOU is: %s\n', mIOU)

    single_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=False,
        scales=(1., ),
        lb_ignore=255,
    )
    mIOU = single_crop(net, dl, n_classes)
    heads.append('single_scale_crop')
    mious.append(mIOU)
    logger.info('single scale crop mIOU is: %s\n', mIOU)

    ms_flip = MscEvalV0((0.5, 0.75, 1, 1.25, 1.5, 1.75), True)
    mIOU = ms_flip(net, dl, n_classes)
    heads.append('ms_flip')
    mious.append(mIOU)
    logger.info('ms flip mIOU is: %s\n', mIOU)

    ms_flip_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=True,
        scales=(0.5, 0.75, 1.0, 1.25, 1.5, 1.75),
        lb_ignore=255,
    )
    mIOU = ms_flip_crop(net, dl, n_classes)
    heads.append('ms_flip_crop')
    mious.append(mIOU)
    logger.info('ms crop mIOU is: %s\n', mIOU)
    '''
    windowEval = WindowEval(cropsize=cropsize, cropstride=2. / 3, lb_ignore=255)
    mIOU = windowEval(net, dl, n_classes)
    heads.append('window eval')
    mious.append(mIOU)
    logger.info('window eval mIOU is: %s\n', mIOU)

    return heads, mious
def eval_model(net, ims_per_gpu, im_root, im_anns, it=cfg.epoch):
    dl = get_data_loader(
        im_root, im_anns, ims_per_gpu, None, None, mode='val')
    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()

    if (it + 1) != cfg.epoch:
        single_scale = MscEvalV0((1., ), False)
        mIOU = single_scale(net, dl, 19)
        heads.append('single_scale')
        mious.append(mIOU)
        logger.info('single mIOU is: %s\n', mIOU)
        return heads, mious, mIOU

    single_scale = MscEvalV0((1., ), False)
    mIOUss = single_scale(net, dl, 19)
    heads.append('single_scale')
    mious.append(mIOUss)
    logger.info('single mIOU is: %s\n', mIOUss)

    single_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=False,
        scales=(1., ),
        lb_ignore=255,
    )
    '''
    mIOU = single_crop(net, dl, 19)
    heads.append('single_scale_crop')
    mious.append(mIOU)
    logger.info('single scale crop mIOU is: %s\n', mIOU)

    ms_flip = MscEvalV0((0.5, 0.75, 1, 1.25, 1.5, 1.75), True)
    mIOU = ms_flip(net, dl, 19)
    heads.append('ms_flip')
    mious.append(mIOU)
    logger.info('ms flip mIOU is: %s\n', mIOU)

    ms_flip_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=True,
        scales=(0.5, 0.75, 1.0, 1.25, 1.5, 1.75),
        lb_ignore=255,
    )
    mIOU = ms_flip_crop(net, dl, 19)
    heads.append('ms_flip_crop')
    mious.append(mIOU)
    logger.info('ms crop mIOU is: %s\n', mIOU)
    '''
    return heads, mious, mIOUss
def eval_model(net, ims_per_gpu, im_root, im_anns):
    is_dist = dist.is_initialized()
    dl = get_data_loader(
        im_root, im_anns, ims_per_gpu, None, None,
        mode='val', distributed=is_dist)
    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()

    single_scale = MscEvalV0((1., ), False)
    mIOU = single_scale(net, dl, 19)
    heads.append('single_scale')
    mious.append(mIOU)
    logger.info('single mIOU is: %s\n', mIOU)

    single_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=False,
        scales=(1., ),
        lb_ignore=255,
    )
    mIOU = single_crop(net, dl, 19)
    heads.append('single_scale_crop')
    mious.append(mIOU)
    logger.info('single scale crop mIOU is: %s\n', mIOU)

    ms_flip = MscEvalV0((0.5, 0.75, 1, 1.25, 1.5, 1.75), True)
    mIOU = ms_flip(net, dl, 19)
    heads.append('ms_flip')
    mious.append(mIOU)
    logger.info('ms flip mIOU is: %s\n', mIOU)

    ms_flip_crop = MscEvalCrop(
        cropsize=1024,
        cropstride=2. / 3,
        flip=True,
        scales=(0.5, 0.75, 1.0, 1.25, 1.5, 1.75),
        lb_ignore=255,
    )
    mIOU = ms_flip_crop(net, dl, 19)
    heads.append('ms_flip_crop')
    mious.append(mIOU)
    logger.info('ms crop mIOU is: %s\n', mIOU)

    return heads, mious
def eval_model(net, ims_per_gpu, im_root, im_anns, iteration):
    is_dist = dist.is_initialized()
    dl = get_data_loader(
        im_root, im_anns, ims_per_gpu, None, None,
        mode='val', distributed=is_dist)
    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()

    single_scale = MscEvalV0((1., ), False)
    mIOU = single_scale(net, dl, 19, iteration, dist.get_rank() == 0)
    heads.append('single_scale')
    mious.append(mIOU)
    logger.info('single mIOU is: %s\n', mIOU)

    return heads, mious
def train():
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = get_data_loader(
        cfg.im_root, cfg.train_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        cfg.max_iter, mode='train', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,
    )

    ## train loop
    for it, (im, lb) in enumerate(dl):
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        logits, *logits_aux = net(im)
        loss_pre = criteria_pre(logits, lb)
        loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
        loss = loss_pre + sum(loss_aux)
        if has_apex:
            with amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)]

        ## print training log message
        if (it + 1) % 100 == 0:
            lr = lr_schdr.get_lr()
            lr = sum(lr) / len(lr)
            print_log_msg(
                it, cfg.max_iter, lr, time_meter, loss_meter,
                loss_pre_meter, loss_aux_meters)
        if it % 1000 == 0:
            save_checkpoint(
                'bisenet_citys_{}.pth'.format(it), net.module.state_dict())

    ## dump the final model and evaluate the result
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(save_pth))
    state = net.module.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))

    return
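# NOTE: save_checkpoint is referenced above but not defined in this file; a
# minimal sketch of what it might look like, assuming cfg.respth is the output
# directory and only rank 0 should write to disk.
def save_checkpoint(name, state_dict):
    save_pth = osp.join(cfg.respth, name)
    if dist.get_rank() == 0:
        torch.save(state_dict, save_pth)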
def train():
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = get_data_loader(
        cfg.im_root, cfg.train_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        cfg.max_iter, mode='train', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,
    )

    ## load checkpoint if one exists, to resume training
    if args.loadCheckpointLocation is not None:
        net, optim, lr_schdr, start_iteration = load_ckp(
            args.loadCheckpointLocation, net, optim, lr_schdr)
    else:
        start_iteration = 0

    ## train loop
    for current_it, (im, lb) in enumerate(dl):
        # on resumed training 'it' continues from where training left off;
        # otherwise start_iteration is 0 and it == current_it
        it = current_it + start_iteration
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        logits, *logits_aux = net(im)
        loss_pre = criteria_pre(logits, lb)
        loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
        loss = loss_pre + sum(loss_aux)
        if has_apex:
            with amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)]

        ## print training log message
        if (it + 1) % 100 == 0:
            lr = lr_schdr.get_lr()
            lr = sum(lr) / len(lr)
            print_log_msg(
                it, cfg.max_iter, lr, time_meter, loss_meter,
                loss_pre_meter, loss_aux_meters)

        ## save a checkpoint every args.saveOnEveryIt iterations
        if (it + 1) % args.saveOnEveryIt == 0:
            if args.saveCheckpointDir is not None:
                checkpoint = {
                    'iteration': it + 1,
                    'state_dict': net.state_dict(),
                    'optimizer': optim.state_dict(),
                    'lr_schdr': lr_schdr.state_dict(),
                }
                iteration_no_str = str(it + 1).zfill(len(str(cfg.max_iter)))
                ckt_name = 'checkpoint_it_' + iteration_no_str + '.pt'
                save_pth = osp.join(args.saveCheckpointDir, ckt_name)
                logger.info(
                    '\nsaving intermediate checkpoint to {}'.format(save_pth))
                save_ckp(checkpoint, save_pth)

    ## dump the final model and evaluate the result
    checkpoint = {
        'iteration': cfg.max_iter,
        'state_dict': net.state_dict(),
        'optimizer': optim.state_dict(),
        'lr_schdr': lr_schdr.state_dict(),
    }
    save_pth = osp.join(args.saveCheckpointDir, 'model_final.pt')
    logger.info('\nsave final model to {}'.format(save_pth))
    save_ckp(checkpoint, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))

    return
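# NOTE: save_ckp / load_ckp are referenced above but not defined in this file;
# a minimal sketch, assuming a checkpoint is the dict built in train() with
# keys 'iteration', 'state_dict', 'optimizer' and 'lr_schdr'.
def save_ckp(checkpoint, save_pth):
    torch.save(checkpoint, save_pth)

def load_ckp(ckp_path, net, optim, lr_schdr):
    checkpoint = torch.load(ckp_path, map_location='cpu')
    net.load_state_dict(checkpoint['state_dict'])
    optim.load_state_dict(checkpoint['optimizer'])
    lr_schdr.load_state_dict(checkpoint['lr_schdr'])
    return net, optim, lr_schdr, checkpoint['iteration']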
def train():
    logger = logging.getLogger()
    is_dist = False

    ## dataset
    dl = get_data_loader(
        cfg.im_root, cfg.train_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        cfg.max_iter, mode='train', distributed=is_dist)
    valid = get_data_loader(
        cfg.im_root, cfg.val_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        cfg.max_iter, mode='val', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()
    print(net)
    print(f'n_parameters: {sum(p.numel() for p in net.parameters())}')

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,
    )

    best_validation = np.inf
    for i in range(cfg.n_epochs):
        ## train loop
        for it, (im, lb) in enumerate(Bar(dl)):
            net.train()
            im = im.cuda()
            lb = lb.cuda()
            lb = torch.squeeze(lb, 1)

            optim.zero_grad()
            logits, *logits_aux = net(im)
            loss_pre = criteria_pre(logits, lb)
            loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
            loss = loss_pre + sum(loss_aux)
            if has_apex:
                with amp.scale_loss(loss, optim) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optim.step()
            torch.cuda.synchronize()
            lr_schdr.step()

            time_meter.update()
            loss_meter.update(loss.item())
            loss_pre_meter.update(loss_pre.item())
            _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)]
            del im
            del lb

        ## print training log message
        lr = lr_schdr.get_lr()
        lr = sum(lr) / len(lr)
        print_log_msg(
            i, cfg.max_iter, lr, time_meter, loss_meter,
            loss_pre_meter, loss_aux_meters)

        ## validation loop
        validation_loss = []
        for it, (im, lb) in enumerate(Bar(valid)):
            net.eval()
            im = im.cuda()
            lb = lb.cuda()
            lb = torch.squeeze(lb, 1)
            with torch.no_grad():
                logits, *logits_aux = net(im)
                loss_pre = criteria_pre(logits, lb)
                loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
                loss = loss_pre + sum(loss_aux)
            validation_loss.append(loss.item())
            del im
            del lb

        ## print validation log message
        validation_loss = sum(validation_loss) / len(validation_loss)
        print(f'Validation loss: {validation_loss}')
        if best_validation > validation_loss:
            print('new best performance, storing model')
            best_validation = validation_loss
            state = net.state_dict()
            torch.save(state, osp.join(cfg.respth, 'best_validation.pth'))

    ## dump the final model and evaluate the result
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(save_pth))
    state = net.state_dict()
    torch.save(state, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.test_im_anns)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))

    return
def train():
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = get_data_loader(
        cfg.im_root, cfg.train_im_anns,
        cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
        cfg.max_iter, mode='train', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()
    if dist.get_rank() == 0:
        exp_name = "cityscapes_repl"
        wandb.init(project="bisenet", name="cityscapes_repl")
        wandb.watch(net)

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,
    )

    ## train loop
    for it, (im, lb) in enumerate(dl):
        net.train()
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        logits, *logits_aux = net(im)
        loss_pre = criteria_pre(logits, lb)
        loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
        loss = loss_pre + sum(loss_aux)
        if has_apex:
            with amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)]

        lr = lr_schdr.get_lr()
        lr = sum(lr) / len(lr)

        ## print training log message
        if dist.get_rank() == 0:
            loss_avg = loss_meter.get()[0]
            wandb.log(
                {
                    "lr": lr,
                    "time": time_meter.get()[0],
                    "loss": loss_avg,
                    "loss_pre": loss_pre_meter.get()[0],
                    **{f"loss_aux_{el.name}": el.get()[0] for el in loss_aux_meters}
                },
                commit=False)
            if (it + 1) % 100 == 0:
                print(it, ' - ', lr, ' - ', loss_avg)
            if (it + 1) % 2000 == 0:
                # dump the model and evaluate the result
                save_pth = osp.join(cfg.respth, f"{exp_name}_{it}.pth")
                state = net.module.state_dict()
                torch.save(state, save_pth)
                wandb.save(save_pth)

        if (it + 1) % 2000 == 0:
            logger.info('\nevaluating the model')
            heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns, it)
            logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))
            if dist.get_rank() == 0:
                wandb.log({k: v for k, v in zip(heads, mious)}, commit=False)

        if dist.get_rank() == 0:
            wandb.log({"t": it}, step=it)

    return