def train():
    """One-process-per-GPU (DistributedDataParallel) training loop for
    BiSeNet on CelebAMask-HQ, with periodic checkpointing and evaluation
    on the main rank."""
    args = parse_args()
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(
        backend='nccl',
        init_method='tcp://127.0.0.1:33241',
        world_size=torch.cuda.device_count(),
        rank=args.local_rank,
    )
    setup_logger(respth)

    # dataset
    n_classes = 19
    n_img_per_gpu = 16
    n_workers = 8
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/CelebAMask-HQ/'
    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    # shuffle must stay False: the sampler owns the shuffling in DDP
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    # model
    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    net.train()
    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[args.local_rank],
                                              output_device=args.local_rank)
    score_thres = 0.7
    # OHEM keeps at least 1/16 of the pixels of a per-GPU batch
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer (warmup + poly decay handled inside the project Optimizer)
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            # a short (non-full) batch means the epoch is over
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)  # reshuffle shards for the new epoch
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)  # (N, 1, H, W) label -> (N, H, W)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss3 = Loss3(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:.4f}',  # fixed: original '{lr:4f}' lacked the precision dot
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr,
                      loss=loss_avg, time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed
        # periodic checkpoint + eval on the main rank only
        # (fixed: the original nested the same dist.get_rank() == 0 test twice)
        if dist.get_rank() == 0 and (it + 1) % 5000 == 0:
            state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            torch.save(state, './res/cp/{}_iter.pth'.format(it))
            evaluate(dspth='/home/data2/DATASET/CelebAMask-HQ/test-img',
                     cp='{}_iter.pth'.format(it))

    # dump the final model
    save_pth = osp.join(respth, 'model_final_diss.pth')
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train():
    """Single-process multi-GPU (nn.DataParallel) training loop for BiSeNet
    on CelebAMask-HQ."""
    setup_logger(respth)

    # dataset
    gpu_number = torch.cuda.device_count()
    n_classes = 19
    n_img_all_gpu = 16 * gpu_number  # global batch size: 16 images per GPU
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/CelebAMask-HQ/'
    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds, batch_size=n_img_all_gpu, shuffle=True)

    # model
    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    net.train()
    net = nn.DataParallel(net)  # module is already on GPU; the original's second .cuda() was redundant

    score_thres = 0.7
    # OHEM keeps at least 1/16 of the pixels of the global batch
    n_min = n_img_all_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    # optimizer (warmup + poly decay handled inside the project Optimizer)
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)

    # train loop
    msg_iter = 2  # NOTE(review): logs every 2 iters — looks like a debug value, confirm
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    for it in range(max_iter):
        try:
            im, lb = next(diter)
        except StopIteration:
            # fixed: the original let StopIteration escape once the loader was
            # exhausted, crashing long before max_iter; restart the iterator.
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)  # (N, 1, H, W) label -> (N, H, W)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = LossP(out, lb)
        loss2 = Loss2(out16, lb)
        loss3 = Loss3(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()
        loss_avg.append(loss.item())

        # print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:.4f}',  # fixed: original '{lr:4f}' lacked the precision dot
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr,
                      loss=loss_avg, time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    # dump the final model — single process, so save unconditionally
    # (fixed: the original called dist.get_rank() without an initialized
    # process group, which raises RuntimeError)
    save_pth = osp.join(respth, 'model_final_diss.pth')
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train():
    """Horovod data-parallel training loop for BiSeNet on Cityscapes."""
    os.environ["CUDA_VISIBLE_DEVICES"] = '6, 7'
    args = parse_args()
    # Initialize Horovod, then pin this process to its local GPU
    # (local_rank() is this process's index on the node).
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    setup_logger(respth)

    ## dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('/dataset/cityscapes/leftImg8bit_trainvaltest',
                    cropsize=cropsize, mode='train')
    # shard the dataset so each worker sees a distinct slice per epoch
    sampler = torch.utils.data.distributed.DistributedSampler(
        ds, num_replicas=hvd.size(), rank=hvd.rank())
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    # broadcast the initial parameters so every rank starts identical
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    net.train()

    score_thres = 0.7
    # OHEM keeps at least 1/16 of the pixels of a per-GPU batch
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    criteria_p = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_16 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_32 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    hvd.broadcast_optimizer_state(optim.optim, root_rank=0)
    # NOTE(review): rebinding `optim` to the wrapped inner torch optimizer
    # discards the project Optimizer's warmup/poly LR schedule (its own
    # .step() is never called again) — confirm this is intended.
    optim = hvd.DistributedOptimizer(optim.optim,
                                     named_parameters=net.named_parameters())

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            # a short (non-full) batch means the epoch is over
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)  # reshuffle shards for the new epoch
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)  # (N, 1, H, W) label -> (N, H, W)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss2 = criteria_16(out16, lb)
        loss3 = criteria_32(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()
        loss_avg.append(loss.item())

        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            # fixed: after hvd.DistributedOptimizer, `optim` is a plain torch
            # optimizer with no .lr attribute; read the param group instead.
            lr = optim.param_groups[0]['lr']
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:.4f}',  # fixed: original '{lr:4f}' lacked the precision dot
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr,
                      loss=loss_avg, time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model on the root rank only
    # (fixed: the original tested dist.get_rank(), but torch.distributed is
    # never initialized here — Horovod owns the ranks)
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    if hvd.rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def train(fintune_model, data_root, respth):
    """Fine-tune BiSeNet for face parsing with a loss-plateau LR decay.

    Args:
        fintune_model: path to a checkpoint to warm-start from, or None.
        data_root: root directory of the FaceMask dataset.
        respth: output directory; checkpoints are written under respth/model/.
    """
    # dataset
    n_classes = 19
    n_img_per_gpu = 16
    n_workers = 8
    cropsize = [448, 448]
    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=True,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    # model
    ignore_idx = -100
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    net = BiSeNet(n_classes=n_classes)
    net = net.to(device)

    # fixed: test for None *before* os.access — os.access(None, ...) raises
    if fintune_model is not None and os.access(fintune_model, os.F_OK):
        chkpt = torch.load(fintune_model, map_location=device)
        net.load_state_dict(chkpt)
        print('load fintune model : {}'.format(fintune_model))
    else:
        print('no fintune model')

    score_thres = 0.7
    # OHEM keeps at least 1/16 of the pixels of a batch
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_epoch = 1000
    # NOTE(review): `Optimizer` appears to be torch.optim here (SGD signature
    # matches, unlike the project Optimizer used elsewhere) — confirm import.
    optim = Optimizer.SGD(net.parameters(),
                          lr=lr_start,
                          momentum=momentum,
                          weight_decay=weight_decay)

    ## train loop
    msg_iter = 50
    flag_change_lr_cnt = 0  # epochs since the mean loss last improved
    init_lr = lr_start      # current learning rate
    best_loss = np.inf
    loss_mean = 0.          # running loss sum over the current epoch
    loss_idx = 0.           # number of batches folded into loss_mean
    print('start training ~')
    it = 0
    for epoch in range(max_epoch):
        net.train()
        # LR schedule: if the epoch-mean loss has not improved for more than
        # 30 consecutive epochs, decay the learning rate by 10x.
        if loss_mean != 0.:
            if best_loss > (loss_mean / loss_idx):
                flag_change_lr_cnt = 0
                best_loss = loss_mean / loss_idx
            else:
                flag_change_lr_cnt += 1
                if flag_change_lr_cnt > 30:
                    init_lr = init_lr * 0.1
                    # fixed: the original referenced the undefined name
                    # `optimizer` (NameError as soon as the decay fired)
                    set_learning_rate(optim, init_lr)
                    flag_change_lr_cnt = 0
        loss_mean = 0.
        loss_idx = 0.

        for i, (im, lb) in enumerate(dl):
            # fixed: .to(device) instead of .cuda() so the CPU fallback
            # selected above actually works
            im = im.to(device)
            lb = lb.to(device)
            lb = torch.squeeze(lb, 1)  # (N, 1, H, W) label -> (N, H, W)

            optim.zero_grad()
            out, out16, out32 = net(im)
            lossp = LossP(out, lb)
            loss2 = Loss2(out16, lb)
            loss3 = Loss3(out32, lb)
            loss = lossp + loss2 + loss3
            loss_mean += loss.item()
            loss_idx += 1.
            loss.backward()
            optim.step()

            if it % msg_iter == 0:
                print('epoch <{}/{}> -->> <{}/{}> -> iter {} : loss {:.5f}, loss_mean :{:.5f}, best_loss :{:.5f},lr :{:.6f},batch_size : {}'.\
                    format(epoch,max_epoch,i,int(ds.__len__()/n_img_per_gpu),it,loss.item(),loss_mean/loss_idx,best_loss,init_lr,n_img_per_gpu))
            if it % 500 == 0:
                state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
                torch.save(state, respth + '/model/face_parse_latest.pth')
            it += 1

        # epoch checkpoint
        # (fixed: the original reused a possibly stale `state` from the last
        # %500 save — and NameError'd if no such save had happened yet)
        state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
        torch.save(state, respth + '/model/face_parse_epoch_{}.pth'.format(epoch))
def train(opt):
    """Train BiSeNet on the vschallenge dataset with TensorBoard logging,
    periodic validation, checkpoint resume and early stopping.

    Args:
        opt: parsed options; reads project, saved_path, log_path, batch_size,
            num_workers, load_weights, lr, num_epochs, save_interval,
            val_interval, es_min_delta and es_patience.
    """
    # saving setting
    opt.saved_path = opt.saved_path + opt.project
    opt.log_path = os.path.join(opt.saved_path, 'tensorboard')
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # gpu setting
    os.environ["CUDA_VISIBLE_DEVICES"] = '2, 3, 4, 5, 6'
    gpu_number = torch.cuda.device_count()

    # dataset setting
    n_classes = 17
    n_img_all_gpu = opt.batch_size * gpu_number  # global batch for DataParallel
    cropsize = [448, 448]
    data_root = '/home/data2/DATASET/vschallenge'
    num_workers = opt.num_workers
    ds = FaceMask(data_root, cropsize=cropsize, mode='train')
    dl = DataLoader(ds,
                    batch_size=n_img_all_gpu,
                    shuffle=True,
                    num_workers=num_workers,
                    drop_last=True)
    ds_eval = FaceMask(data_root, cropsize=cropsize, mode='val')
    dl_eval = DataLoader(ds_eval,
                         batch_size=n_img_all_gpu,
                         shuffle=True,
                         num_workers=num_workers,
                         drop_last=True)

    ignore_idx = -100
    net = BiSeNet(n_classes=n_classes)

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            # checkpoint filenames end in ..._<step>.pth
            last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (ValueError, IndexError):
            # fixed: was a bare except; only a malformed filename is expected
            last_step = 0
        try:
            net.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights '
                'with different number of classes. The rest of the weights should be loaded already.'
            )
        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')

    writer = SummaryWriter(
        opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/')

    net = net.cuda()
    net = nn.DataParallel(net)

    score_thres = 0.7
    # NOTE(review): the sibling scripts use `// 16` here; dividing by
    # opt.batch_size ties the OHEM pixel floor to the batch size — confirm.
    n_min = n_img_all_gpu * cropsize[0] * cropsize[1] // opt.batch_size
    LossP = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss2 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    Loss3 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    # optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = opt.lr
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net.module,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim.optim,
                                                           patience=3,
                                                           verbose=True)

    # train loop
    loss_avg = []
    step = max(0, last_step)
    max_iter = len(dl)  # iterations per epoch from here on
    best_epoch = 0
    epoch = 0
    best_loss = 1e5
    net.train()
    try:
        for epoch in range(opt.num_epochs):
            last_epoch = step // max_iter
            if epoch < last_epoch:  # epoch fully covered by the checkpoint
                continue
            epoch_loss = []
            progress_bar = tqdm(dl)
            # fixed: loop variable was named `iter`, shadowing the builtin
            for batch_idx, data in enumerate(progress_bar):
                if batch_idx < step - last_epoch * max_iter:  # batches already done
                    progress_bar.update()
                    continue
                try:
                    im = data['img']
                    lb = data['label']
                    lb = torch.squeeze(lb, 1)  # (N, 1, H, W) label -> (N, H, W)
                    im = im.cuda()
                    lb = lb.cuda()

                    optim.zero_grad()
                    out, out16, out32 = net(im)
                    lossp = LossP(out, lb)
                    loss2 = Loss2(out16, lb)
                    loss3 = Loss3(out32, lb)
                    loss = lossp + loss2 + loss3
                    if loss == 0 or not torch.isfinite(loss):
                        continue
                    loss.backward()
                    optim.step()
                    loss_avg.append(loss.item())
                    # fixed: epoch_loss was never filled, so the scheduler
                    # stepped on np.mean([]) == nan every epoch
                    epoch_loss.append(loss.item())

                    print(
                        'p_loss: {:.5f}. 2_loss: {:.5f}. 3_loss: {:.5f}. loss_avg: {:.5f}'
                        .format(lossp.item(), loss2.item(), loss3.item(),
                                loss.item()))
                    writer.add_scalars('Lossp', {'train': lossp}, step)
                    writer.add_scalars('loss2', {'train': loss2}, step)
                    writer.add_scalars('loss3', {'train': loss3}, step)
                    writer.add_scalars('loss_avg', {'train': loss}, step)

                    # log learning_rate
                    lr = optim.lr
                    writer.add_scalar('learning_rate', lr, step)

                    step += 1
                    if step % opt.save_interval == 0 and step > 0:
                        save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')
                        print('checkpoint...')
                except Exception as e:
                    # best-effort: log the failing batch and keep training
                    print('[Erro]', traceback.format_exc())
                    print(e)
                    continue
            if epoch_loss:  # guard: an epoch where every batch failed
                scheduler.step(np.mean(epoch_loss))

            if epoch % opt.val_interval == 0:
                net.eval()
                loss_p = []
                loss_2 = []
                loss_3 = []
                for data in dl_eval:
                    with torch.no_grad():
                        im = data['img']
                        lb = data['label']
                        lb = torch.squeeze(lb, 1)
                        im = im.cuda()
                        lb = lb.cuda()
                        out, out16, out32 = net(im)
                        lossp = LossP(out, lb)
                        loss2 = Loss2(out16, lb)
                        loss3 = Loss3(out32, lb)
                        loss = lossp + loss2 + loss3
                        if loss == 0 or not torch.isfinite(loss):
                            continue
                        loss_p.append(lossp.item())
                        loss_2.append(loss2.item())
                        loss_3.append(loss3.item())
                lossp = np.mean(loss_p)
                loss2 = np.mean(loss_2)
                loss3 = np.mean(loss_3)
                loss = lossp + loss2 + loss3
                print(
                    'Val. Epoch: {}/{}. p_loss: {:1.5f}. 2_loss: {:1.5f}. 3_loss: {:1.5f}. Total_loss: {:1.5f}'
                    .format(epoch, opt.num_epochs, lossp, loss2, loss3, loss))
                writer.add_scalars('Total_loss', {'val': loss}, step)
                writer.add_scalars('p_loss', {'val': lossp}, step)
                writer.add_scalars('2_loss', {'val': loss2}, step)
                writer.add_scalars('3_loss', {'val': loss3}, step)

                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')
                net.train()

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print(
                        '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                        .format(epoch, loss))
                    break
    except KeyboardInterrupt:
        save_checkpoint(net, f'Bisenet_{epoch}_{step}.pth')
    finally:
        # fixed: writer.close() was called twice on the interrupt path;
        # close exactly once on every exit path
        writer.close()