def train_net(args):
    cropsize = [cfgs.crop_height, cfgs.crop_width]
    # dataset_train = CityScapes(cfgs.data_dir, cropsize=cropsize, mode='train')
    dataset_train = ContextVoc(cfgs.train_file, cropsize=cropsize, mode='train')
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=True)
    # dataset_val = CityScapes(cfgs.data_dir, mode='val')
    dataset_val = ContextVoc(cfgs.val_file, mode='val')
    dataloader_val = DataLoader(dataset_val,
                                batch_size=1,
                                shuffle=True,
                                num_workers=args.num_workers,
                                drop_last=True)

    # build net
    os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda
    if torch.cuda.is_available() and args.use_gpu:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    net = BiSeNet(cfgs.num_classes, cfgs.netname).to(device)
    # net = BiSeNet(cfgs.num_classes).to(device)
    if args.mulgpu:
        net = torch.nn.DataParallel(net)
    if args.pretrained_model_path is not None:
        print('load model from %s ...' % args.pretrained_model_path)
        load_dict = torch.load(args.pretrained_model_path, map_location=device)
        # work both with and without the DataParallel wrapper
        target = net.module if hasattr(net, 'module') else net
        dict_new = renamedict(target.state_dict(), load_dict)
        target.load_state_dict(dict_new, strict=False)
        # net.load_state_dict(torch.load(args.pretrained_model_path))
        print('Done!')
    net.train()

    # build optimizer
    if args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(net.parameters(), args.learning_rate)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(net.parameters(), args.learning_rate,
                                    momentum=0.9, weight_decay=1e-4)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(net.parameters(), args.learning_rate)
    else:
        print('unsupported optimizer\n')
        optimizer = None

    # build loss
    if args.losstype == 'dice':
        criterion = DiceLoss()
    elif args.losstype == 'crossentropy':
        criterion = torch.nn.CrossEntropyLoss()
    elif args.losstype == 'ohem':
        score_thres = 0.7
        n_min = args.batch_size * cfgs.crop_height * cfgs.crop_width // 16
        criterion = OhemCELoss(thresh=score_thres, n_min=n_min)
    elif args.losstype == 'focal':
        criterion = SoftmaxFocalLoss()
    else:
        criterion = None  # unsupported loss type

    return net, optimizer, criterion, dataloader_train, dataloader_val
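# Hypothetical sketch of the `renamedict` helper called above; its real implementation
# is not shown in this file. The assumption here is that it keeps only those pretrained
# entries whose key (after stripping an optional 'module.' prefix) and tensor shape
# match the current model, so that load_state_dict(..., strict=False) succeeds.
def renamedict(model_state, pretrained_state):
    renamed = {}
    for k, v in pretrained_state.items():
        k = k[len('module.'):] if k.startswith('module.') else k
        if k in model_state and model_state[k].shape == v.shape:
            renamed[k] = v
    return renamed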
def train(verbose=True, **kwargs):
    args = kwargs['args']
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()

    ## dataset
    ds = CityScapes(cfg, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=cfg.ims_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=cfg.n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    net = Deeplab_v3plus(cfg)
    net.train()
    net.cuda()
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank, ],
                                              output_device=args.local_rank)
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    criteria = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()

    ## optimizer
    optim = Optimizer(net, cfg.lr_start, cfg.momentum, cfg.weight_decay,
                      cfg.warmup_steps, cfg.warmup_start_lr, cfg.max_iter,
                      cfg.lr_power)

    ## train loop
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    n_epoch = 0
    for it in range(cfg.max_iter):
        try:
            im, lb = next(diter)
            if im.size()[0] != cfg.ims_per_gpu:
                continue
        except StopIteration:
            n_epoch += 1
            sampler.set_epoch(n_epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        logits = net(im)
        loss = criteria(logits, lb)
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if it % cfg.msg_iter == 0 and not it == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((cfg.max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'iter: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it, max_it=cfg.max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model and evaluate the result
    if verbose:
        net.cpu()
        save_pth = osp.join(cfg.respth, 'model_final.pth')
        state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
        if dist.get_rank() == 0:
            torch.save(state, save_pth)
        logger.info('training done, model saved to: {}'.format(save_pth))
        logger.info('evaluating the final model')
        net.cuda()
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        logger.info('mIOU is: {}'.format(mIOU))
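# Minimal sketch of the OhemCELoss used throughout these scripts, assuming the common
# BiSeNet-style online hard example mining: per-pixel cross-entropy losses are sorted
# and only the hardest pixels (at least n_min of them, or all pixels whose loss exceeds
# -log(thresh)) contribute to the mean. The exact implementation in the repos may differ.
import math

import torch
import torch.nn as nn


class OhemCELoss(nn.Module):
    def __init__(self, thresh, n_min, ignore_lb=255):
        super(OhemCELoss, self).__init__()
        self.thresh = -math.log(thresh)   # probability threshold mapped into loss space
        self.n_min = n_min
        self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none')

    def forward(self, logits, labels):
        loss = self.criteria(logits, labels).view(-1)
        loss, _ = torch.sort(loss, descending=True)
        n_min = min(self.n_min, loss.numel() - 1)
        if loss[n_min] > self.thresh:
            loss = loss[loss > self.thresh]   # keep every pixel harder than the threshold
        else:
            loss = loss[:n_min]               # otherwise keep the n_min hardest pixels
        return torch.mean(loss)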
def train():
    args = parse_args()
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:33241',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(respth)

    # dataset
    n_classes = 19
    n_img_per_gpu = 16
    n_workers = 8
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/CelebAMask-HQ/'

    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    # model
    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    net.train()
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank, ],
                                              output_device=args.local_rank)
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss3 = Loss3(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())

        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

        if dist.get_rank() == 0 and (it + 1) % 5000 == 0:
            state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            torch.save(state, './res/cp/{}_iter.pth'.format(it))
            evaluate(dspth='/home/data2/DATASET/CelebAMask-HQ/test-img',
                     cp='{}_iter.pth'.format(it))

    # dump the final model
    save_pth = osp.join(respth, 'model_final_diss.pth')
    # net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
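# Minimal sketch of the project-specific Optimizer wrapper constructed above, assuming
# it is plain SGD with a linear warmup followed by polynomial ('poly') LR decay, and that
# it exposes .optim, .lr, .zero_grad() and .step() as the training loops expect. The real
# class may additionally split parameters into weight-decay / no-decay groups.
import torch


class Optimizer(object):
    def __init__(self, model, lr0, momentum, wd, warmup_steps, warmup_start_lr,
                 max_iter, power):
        self.lr0 = lr0
        self.warmup_steps = warmup_steps
        self.warmup_start_lr = warmup_start_lr
        self.max_iter = max_iter
        self.power = power
        self.it = 0
        self.lr = warmup_start_lr
        self.optim = torch.optim.SGD(model.parameters(), lr=self.lr,
                                     momentum=momentum, weight_decay=wd)

    def get_lr(self):
        if self.it <= self.warmup_steps:
            # linear warmup from warmup_start_lr to lr0
            ratio = self.it / max(1, self.warmup_steps)
            return self.warmup_start_lr + ratio * (self.lr0 - self.warmup_start_lr)
        # polynomial decay over the remaining iterations
        ratio = (1 - (self.it - self.warmup_steps) / (self.max_iter - self.warmup_steps)) ** self.power
        return self.lr0 * ratio

    def zero_grad(self):
        self.optim.zero_grad()

    def step(self):
        self.lr = self.get_lr()
        for pg in self.optim.param_groups:
            pg['lr'] = self.lr
        self.optim.step()
        self.it += 1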
def train():
    setup_logger(respth)

    # dataset
    gpu_number = torch.cuda.device_count()
    n_classes = 19
    n_img_all_gpu = 16 * gpu_number
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/CelebAMask-HQ/'

    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds,
                    batch_size=n_img_all_gpu,
                    shuffle=True)

    # model
    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    net.train()
    net = nn.DataParallel(net)
    net = net.cuda()

    score_thres = 0.7
    n_min = n_img_all_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    # optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    # train loop
    msg_iter = 2
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    for it in range(max_iter):
        try:
            im, lb = next(diter)
        except StopIteration:
            # restart the loader once an epoch is exhausted
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        # H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss3 = Loss3(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())

        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    # dump the final model (no process group is initialized in this single-node
    # DataParallel script, so save unconditionally instead of checking dist.get_rank())
    save_pth = osp.join(respth, 'model_final_diss.pth')
    # net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train(fintune_model, data_root, respth):
    # dataset
    n_classes = 19
    n_img_per_gpu = 16
    n_workers = 8
    cropsize = [448, 448]

    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    # sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=True,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    # model
    ignore_idx = -100
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    net = BiSeNet(n_classes=n_classes)
    net = net.to(device)

    if fintune_model is not None and os.access(fintune_model, os.F_OK):
        # checkpoint
        chkpt = torch.load(fintune_model, map_location=device)
        net.load_state_dict(chkpt)
        print('load finetune model : {}'.format(fintune_model))
    else:
        print('no finetune model')

    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_epoch = 1000
    optim = Optimizer.SGD(net.parameters(),
                          lr=lr_start,
                          momentum=momentum,
                          weight_decay=weight_decay)

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    # diter = iter(dl)
    epoch = 0
    flag_change_lr_cnt = 0  # counts epochs without improvement before lowering the LR
    init_lr = lr_start      # current learning rate
    best_loss = np.inf
    loss_mean = 0.          # running loss sum for the epoch
    loss_idx = 0.           # number of loss samples in the running sum

    print('start training ~')
    it = 0
    for epoch in range(max_epoch):
        net.train()

        # learning-rate schedule: decay by 10x after 30 epochs without improvement
        if loss_mean != 0.:
            if best_loss > (loss_mean / loss_idx):
                flag_change_lr_cnt = 0
                best_loss = (loss_mean / loss_idx)
            else:
                flag_change_lr_cnt += 1
                if flag_change_lr_cnt > 30:
                    init_lr = init_lr * 0.1
                    set_learning_rate(optim, init_lr)
                    flag_change_lr_cnt = 0

        loss_mean = 0.  # reset the running loss sum
        loss_idx = 0.   # reset the loss sample counter

        for i, (im, lb) in enumerate(dl):
            im = im.to(device)
            lb = lb.to(device)
            H, W = im.size()[2:]
            lb = torch.squeeze(lb, 1)

            optim.zero_grad()
            out, out16, out32 = net(im)
            lossp = LossP(out, lb)
            loss2 = Loss2(out16, lb)
            loss3 = Loss3(out32, lb)
            loss = lossp + loss2 + loss3
            loss_mean += loss.item()
            loss_idx += 1.
            loss.backward()
            optim.step()

            if it % msg_iter == 0:
                print('epoch <{}/{}> -->> <{}/{}> -> iter {} : loss {:.5f}, loss_mean :{:.5f}, '
                      'best_loss :{:.5f}, lr :{:.6f}, batch_size : {}'.format(
                          epoch, max_epoch, i, int(len(ds) / n_img_per_gpu), it,
                          loss.item(), loss_mean / loss_idx, best_loss, init_lr, n_img_per_gpu))

            if it % 500 == 0:
                state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
                torch.save(state, respth + '/model/face_parse_latest.pth')
                # evaluate(dspth='./images', cp='{}_iter.pth'.format(it))
            it += 1

        torch.save(state, respth + '/model/face_parse_epoch_{}.pth'.format(epoch))
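# Minimal sketch of the `set_learning_rate` helper used in the LR-decay policy above;
# the assumption is that it simply overwrites the learning rate of every parameter group.
def set_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr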
def train(args):
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:34850',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank
                            # rank=0
                            )

    dataset = CityScapes(mode='train')
    # dataset = CityScapes_trainval(mode='train')
    # dataset = ADE20K(mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    dataloader = DataLoader(dataset,
                            batch_size=config.imgs_per_gpu,
                            shuffle=False,
                            sampler=sampler,
                            num_workers=4,
                            pin_memory=True,
                            drop_last=True)
    print(len(dataloader))

    # net = Origin_Res()
    net = PANet(config.classes)
    # net = HighOrder(config.classes)
    # for i in net.named_modules():
    #     print(i)
    # net = Deeplab_v3plus()
    net.train()
    net.cuda()
    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank],
                                              output_device=args.local_rank)

    n_min = config.imgs_per_gpu * config.crop_size[0] * config.crop_size[1] // 16
    criteria = OhemCELoss(thresh=config.ohem_thresh,
                          n_min=n_min,
                          ignore_lb=config.ignore_label).cuda()
    optimizer = Optimizer(net, config.lr_start, config.momentum,
                          config.weight_decay, config.warmup_steps,
                          config.warmup_start_lr, config.max_iter,
                          config.lr_power)

    total_loss = 0
    n_epoch = 0
    data = iter(dataloader)
    for i in range(config.max_iter):
        start = time.time()
        try:
            image, label, name = next(data)
        except StopIteration:
            n_epoch += 1
            sampler.set_epoch(n_epoch)
            data = iter(dataloader)
            image, label, name = next(data)
        image = image.cuda()
        label = label.cuda()
        label = torch.squeeze(label, 1)

        # output = net(image)
        output, guidence = net(image)
        # loss = criteria(output, label)
        loss = 0.9 * criteria(output, label) + 0.1 * criteria(guidence, label)
        loss = loss.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        if (i + 1) % 100 == 0 and dist.get_rank() == 0:
            end = time.time()
            once_time = end - start
            remaining_step = config.max_iter - i
            remaining_time = once_time * remaining_step
            m, s = divmod(remaining_time, 60)
            h, m = divmod(m, 60)
            print('iter: {}, loss: {}, time: {}h:{}m'.format(
                i + 1, total_loss / 100.0, int(h), int(m)))
            total_loss = 0

        if (i + 1) % 100 == 0 and (i + 1) >= (int(config.max_iter) - 200) and dist.get_rank() == 0:
            torch.save(net.state_dict(), './ablation/15x15global{}.pth'.format(i + 1))
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = '6, 7'
    args = parse_args()

    # 1. initialize horovod
    hvd.init()
    # 2. pin this process to its GPU; hvd.local_rank() is the index of the local process
    torch.cuda.set_device(hvd.local_rank())
    # torch.cuda.set_device(args.local_rank)
    # dist.init_process_group(
    #     backend = 'nccl',
    #     init_method = 'tcp://127.0.0.1:33271',
    #     world_size = 2,
    #     world_size = torch.cuda.device_count(),
    #     rank=args.local_rank
    # )
    setup_logger(respth)

    ## dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('/dataset/cityscapes/leftImg8bit_trainvaltest',
                    cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(
        ds, num_replicas=hvd.size(), rank=hvd.rank())
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    # 5. broadcast the parameters at start-up so every worker begins from the same weights
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    net.train()
    # net = nn.parallel.DistributedDataParallel(net,
    #         device_ids = [args.local_rank, ],
    #         output_device = args.local_rank
    #         )
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    criteria_p = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_16 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_32 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    hvd.broadcast_optimizer_state(optim.optim, root_rank=0)
    optim = hvd.DistributedOptimizer(optim.optim, named_parameters=net.named_parameters())

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss2 = criteria_16(out16, lb)
        loss3 = criteria_32(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            # optim is now a horovod-wrapped torch optimizer, so read the LR from its
            # param groups instead of the custom wrapper's .lr attribute
            lr = optim.param_groups[0]['lr']
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    # torch.distributed is not initialized in the horovod path, so gate on hvd.rank()
    if hvd.rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train(verbose=True, **kwargs):
    args = kwargs['args']
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()

    ## dataset
    ds = CityScapes(cfg, mode='train', num_copys=2)
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=cfg.ims_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=cfg.n_workers,
                    collate_fn=collate_fn2,
                    pin_memory=True,
                    drop_last=True)

    ## model
    net = Deeplab_v3plus(cfg)
    net.train()
    net.cuda()
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank, ],
                                              output_device=args.local_rank)
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    criteria = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    Criterion = pgc_loss(use_pgc=[0, 1, 2], criteria=criteria)

    ## optimizer
    optim = Optimizer(net, cfg.lr_start, cfg.momentum, cfg.weight_decay,
                      cfg.warmup_steps, cfg.warmup_start_lr, cfg.max_iter,
                      cfg.lr_power)
    alpha, beta = cfg.alpha, cfg.beta

    ## train loop
    loss_avg = []
    pgc_avg = []
    ce_avg = []
    ssp_avg = []
    ohem_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    n_epoch = 0
    for it in range(cfg.max_iter):
        try:
            im, lb, overlap, flip = next(diter)
            if im.size()[0] == cfg.ims_per_gpu // 2:
                continue
        except StopIteration:
            n_epoch += 1
            sampler.set_epoch(n_epoch)
            diter = iter(dl)
            im, lb, overlap, flip = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        im1, im2 = im[::2], im[1::2]
        lb1, lb2 = lb[::2], lb[1::2]
        logits1 = net(im1)
        logits2 = net(im2)
        # logits = torch.cat([logits1[-1], logits2[-1]], dim=0)
        outputs = []
        for f1, f2 in zip(logits1, logits2):
            outputs.append([f1, f2])
        logits = torch.cat([logits1[-1], logits2[-1]], dim=0)
        mse, sym_ce, mid_mse, mid_ce, mid_l1, ce = Criterion(outputs, overlap, flip, lb)
        # loss = criteria(logits, lb)
        loss = beta * sym_ce + ce
        gc_loss = sum(mid_mse)
        loss += alpha * gc_loss
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ohem_avg.append(ce.item())
        pgc_avg.append(gc_loss.item())
        ssp_avg.append(sym_ce.item())

        ## print training log message
        if it % cfg.msg_iter == 0 and not it == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            ohem = sum(ohem_avg) / len(ohem_avg)
            pgc = sum(pgc_avg) / len(pgc_avg)
            ssp = sum(ssp_avg) / len(ssp_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((cfg.max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'iter: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'ohem: {ohem:.4f}',
                'pgc: {pgc:.4f}',
                'ssp: {ssp:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it, max_it=cfg.max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta, ohem=ohem, pgc=pgc, ssp=ssp)
            logger.info(msg)
            loss_avg = []
            pgc_avg = []
            ssp_avg = []
            ohem_avg = []
            st = ed

    ## dump the final model and evaluate the result
    if verbose:
        net.cpu()
        save_pth = osp.join(cfg.respth, 'model_final.pth')
        state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
        if dist.get_rank() == 0:
            torch.save(state, save_pth)
        logger.info('training done, model saved to: {}'.format(save_pth))
        logger.info('evaluating the final model')
        net.cuda()
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        logger.info('mIOU is: {}'.format(mIOU))
def train():
    args = parse_args()
    dist.init_process_group(backend='nccl',
                            world_size=torch.cuda.device_count())
    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    setup_logger(respth)

    # dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('../data/cityscapes', cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    sampler=sampler,
                    shuffle=False,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)
    logger.info('successfully loaded data')

    ignore_idx = 255
    net = AttaNet(n_classes=n_classes)
    if args.ckpt is not None:
        net.load_state_dict(torch.load(args.ckpt, map_location='cpu'))
        logger.info('successfully loaded weights')
    net.cuda(device)
    net.train()
    net = torch.nn.parallel.DistributedDataParallel(net,
                                                    find_unused_parameters=True,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank)
    logger.info('successfully wrapped the model for distributed training')

    score_thres = 0.7
    n_min = cropsize[0] * cropsize[1] // 2
    criteria_p = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_aux1 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_aux2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    # optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 200000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    # train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss1 = criteria_aux1(out16, lb)
        loss2 = criteria_aux2(out32, lb)
        loss = lossp + loss1 + loss2
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    save_pth = osp.join(args.snapshot_dir, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train():
    args = parse_args()
    cudnn.benchmark = True

    # dataset
    # n_classes = 19
    n_classes = 11
    n_img_per_gpu = 32
    n_workers = 8
    # cropsize = [448, 448]
    cropsize = [224, 224]
    data_root = 'dataset/parsing/'

    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    num_workers=n_workers,
                    shuffle=True,
                    drop_last=True)

    # model
    ignore_idx = -100
    # net = BiSeNet(n_classes=n_classes)
    net = PSP(11, 'resnet50')
    # net.load_state_dict(torch.load('model_best.pth.tar')["state_dict"])
    net = net.cuda()

    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 4
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx).cuda()
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx).cuda()

    ## optimizer
    # momentum = 0.9
    weight_decay = 1e-4
    lr_start = 2e-3
    args.lr = lr_start
    max_iter = 120000
    train_optimizer = torch.optim.Adam(net.parameters(),
                                       lr=lr_start,
                                       weight_decay=weight_decay)

    ## train loop
    msg_iter = 10
    loss_avg = []
    diter = iter(dl)
    epoch = 0
    net.train()
    for it in range(max_iter):
        adjust_learning_rate(train_optimizer, epoch, it, max_iter, args)
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        train_optimizer.zero_grad()
        out, out16 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss = lossp + loss2
        loss.backward()
        train_optimizer.step()

        loss_avg.append(loss.item())
        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            # lr = train_optimizer.lr
            msg = ', '.join([
                'it: {it}/{max_it}',
                'loss: {loss:.4f}',
            ]).format(it=it + 1, max_it=max_iter, loss=loss_avg)
            print(msg)
            loss_avg = []

        if (it + 1) % 10000 == 0:
            state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            torch.save(state, './res/cp/{}_iter.pth'.format(it))

    # dump the final model
    save_pth = osp.join(respth, 'model_final_diss.pth')
    # net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def evaluate(respth='./res/test_res', dspth='./data', cp='model_final_diss.pth'):
    if not os.path.exists(respth):
        os.makedirs(respth)

    n_classes = 19
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    save_pth = osp.join(respth, 'cp', cp)
    net.load_state_dict(torch.load(save_pth))
    net.eval()

    no_iter = str(int(cp.split('_')[0]))
    org_respth = respth[:]
    respth = os.path.join(respth, no_iter)
    if not os.path.exists(respth):
        os.makedirs(respth)

    to_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    ''' added '''
    cropsize = [448, 448]
    n_img_per_gpu = 16
    data_root = '/home/jihyun/workspace/face_parsing/dataset/CelebAMask-HQ/'
    ds = FaceMask(data_root, cropsize=cropsize, mode='val')
    dl = DataLoader(ds, batch_size=16, shuffle=False, drop_last=True)

    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    score_thres = 0.7
    ignore_idx = -100
    loss_avg = []
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    with torch.no_grad():
        for i, sample in enumerate(dl):
            im, lb = sample
            im = im.cuda()
            lb = lb.cuda()
            lb = torch.squeeze(lb, 1)
            out, out16, out32 = net(im)
            lossp = LossP(out, lb)
            loss2 = Loss2(out16, lb)
            loss3 = Loss3(out32, lb)
            loss = lossp + loss2 + loss3
            loss_avg.append(loss.item())

        loss_avg = sum(loss_avg) / len(loss_avg)
        with open(osp.join(org_respth, 'loss.log'), 'a') as f:
            f.write(' eval_loss: ' + str(loss_avg) + '\n')

        for image_path in os.listdir(dspth):
            img = Image.open(osp.join(dspth, image_path))
            image = img.resize((512, 512), Image.BILINEAR)
            img = to_tensor(image)
            img = torch.unsqueeze(img, 0)
            img = img.cuda()
            out, out16, out32 = net(img)
            parsing = out.squeeze(0).cpu().numpy().argmax(0)
            vis_parsing_maps(image, parsing, stride=1, save_im=True,
                             save_path=osp.join(respth, image_path))
def train():
    args = parse_args()
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:33241',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(respth)

    ## dataset
    n_classes = 19  # 19
    n_img_per_gpu = 5
    n_workers = 10  # 4
    cropsize = [1024, 1024]
    ds = CityScapes('./data/cityscapes', cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    device = torch.device("cuda")
    net = ShelfNet(n_classes=n_classes)
    net.load_state_dict(torch.load('./res/model_final_idd.pth'))
    net.to(device)
    # net.load_state_dict(checkpoint['model'].module.state_dict())
    # net.cuda()
    net.train()
    # net = nn.parallel.DistributedDataParallel(net,
    #         device_ids = [args.local_rank, ],
    #         output_device = args.local_rank,
    #         find_unused_parameters=True
    #         )
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss3 = Loss3(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

        if it % 1000 == 0:
            ## dump an intermediate checkpoint
            save_pth = osp.join(respth, 'shelfnet_model_it_%d.pth' % it)
            # net.cpu()
            # state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            # if dist.get_rank() == 0:
            #     torch.save(state, save_pth)
            torch.save(net.state_dict(), save_pth)
        if it % 1000 == 0 and it > 0:
            evaluate(checkpoint=save_pth)

    ## dump the final model
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train(opt):
    # saving setting
    opt.saved_path = opt.saved_path + opt.project
    opt.log_path = os.path.join(opt.saved_path, 'tensorboard')
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # gpu setting
    os.environ["CUDA_VISIBLE_DEVICES"] = '2, 3, 4, 5, 6'
    gpu_number = torch.cuda.device_count()

    # dataset setting
    n_classes = 17
    n_img_all_gpu = opt.batch_size * gpu_number
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/vschallenge'
    num_workers = opt.num_workers

    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds,
                    batch_size=n_img_all_gpu,
                    shuffle=True,
                    num_workers=num_workers,
                    drop_last=True)
    ds_eval = FaceMask(data_root, cropsize=cropsize, mode='val')
    dl_eval = DataLoader(ds_eval,
                         batch_size=n_img_all_gpu,
                         shuffle=True,
                         num_workers=num_workers,
                         drop_last=True)

    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except ValueError:
            last_step = 0
        try:
            ret = net.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print('[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights '
                  'with different number of classes. The rest of the weights should be loaded already.')
        print(f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}')
    else:
        last_step = 0
        print('[Info] initializing weights...')

    writer = SummaryWriter(opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    net = net.cuda()
    net = nn.DataParallel(net)

    score_thres = 0.7
    n_min = n_img_all_gpu * cropsize[0] * cropsize[1] // opt.batch_size
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    # optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = opt.lr
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim.optim, patience=3, verbose=True)

    # train loop
    loss_avg = []
    step = max(0, last_step)
    max_iter = len(dl)
    best_epoch = 0
    epoch = 0
    best_loss = 1e5
    net.train()

    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // max_iter
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(dl)
            for iter, data in enumerate(progress_bar):
                if iter < step - last_epoch * max_iter:
                    progress_bar.update()
                    continue
                try:
                    im = data['img']
                    lb = data['label']
                    lb = torch.squeeze(lb, 1)
                    im = im.cuda()
                    lb = lb.cuda()

                    optim.zero_grad()
                    out, out16, out32 = net(im)
                    lossp = LossP(out, lb)
                    loss2 = Loss2(out16, lb)
                    loss3 = Loss3(out32, lb)
                    loss = lossp + loss2 + loss3
                    if loss == 0 or not torch.isfinite(loss):
                        continue
                    loss.backward()
                    optim.step()

                    loss_avg.append(loss.item())
                    epoch_loss.append(loss.item())  # collected for the plateau scheduler below

                    # print training log message
                    # progress_bar.set_description(
                    #     'Epoch: {}/{}. Iteration: {}/{}. p_loss: {:.5f}. 2_loss: {:.5f}. 3_loss: {:.5f}. loss_avg: {:.5f}'.format(
                    #         epoch, opt.num_epochs, iter + 1, max_iter, lossp.item(),
                    #         loss2.item(), loss3.item(), loss.item()))
                    print('p_loss: {:.5f}. 2_loss: {:.5f}. 3_loss: {:.5f}. loss_avg: {:.5f}'.format(
                        lossp.item(), loss2.item(), loss3.item(), loss.item()))
                    writer.add_scalars('Lossp', {'train': lossp}, step)
                    writer.add_scalars('loss2', {'train': loss2}, step)
                    writer.add_scalars('loss3', {'train': loss3}, step)
                    writer.add_scalars('loss_avg', {'train': loss}, step)

                    # log learning_rate
                    lr = optim.lr
                    writer.add_scalar('learning_rate', lr, step)
                    step += 1

                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')
                        print('checkpoint...')
                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                net.eval()
                loss_p = []
                loss_2 = []
                loss_3 = []
                for iter, data in enumerate(dl_eval):
                    with torch.no_grad():
                        im = data['img']
                        lb = data['label']
                        lb = torch.squeeze(lb, 1)
                        im = im.cuda()
                        lb = lb.cuda()
                        out, out16, out32 = net(im)
                        lossp = LossP(out, lb)
                        loss2 = Loss2(out16, lb)
                        loss3 = Loss3(out32, lb)
                        loss = lossp + loss2 + loss3
                        if loss == 0 or not torch.isfinite(loss):
                            continue
                        loss_p.append(lossp.item())
                        loss_2.append(loss2.item())
                        loss_3.append(loss3.item())

                lossp = np.mean(loss_p)
                loss2 = np.mean(loss_2)
                loss3 = np.mean(loss_3)
                loss = lossp + loss2 + loss3
                print('Val. Epoch: {}/{}. p_loss: {:1.5f}. 2_loss: {:1.5f}. 3_loss: {:1.5f}. Total_loss: {:1.5f}'.format(
                    epoch, opt.num_epochs, lossp, loss2, loss3, loss))
                writer.add_scalars('Total_loss', {'val': loss}, step)
                writer.add_scalars('p_loss', {'val': lossp}, step)
                writer.add_scalars('2_loss', {'val': loss2}, step)
                writer.add_scalars('3_loss', {'val': loss3}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')

                net.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')
        writer.close()
    writer.close()
def train():
    args = parse_args()
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:33271',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(respth)

    ## dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('./data', cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    net = BiSeNet(n_classes=n_classes)
    net = nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net.cuda()
    net.train()
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    criteria_p = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_16 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_32 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(
        # model = net.module,
        model=net,
        lr0=lr_start,
        momentum=momentum,
        wd=weight_decay,
        warmup_steps=warmup_steps,
        warmup_start_lr=warmup_start_lr,
        max_iter=max_iter,
        power=power)

    ## fp16
    net, opt = amp.initialize(net, optim.optim, opt_level='O1')
    optim.optim = opt

    ## set dist
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank, ],
                                              output_device=args.local_rank)

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss2 = criteria_16(out16, lb)
        loss3 = criteria_32(out32, lb)
        loss = lossp + loss2 + loss3
        # loss.backward()
        with amp.scale_loss(loss, opt) as scaled_loss:
            scaled_loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr, loss=loss_avg,
                      time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))