def main(cfgs):
    Logger.init(**cfgs['logger'])

    local_rank = cfgs['local_rank']
    world_size = int(os.environ['WORLD_SIZE'])
    Log.info('rank: {}, world_size: {}'.format(local_rank, world_size))

    log_dir = cfgs['log_dir']
    pth_dir = cfgs['pth_dir']
    if local_rank == 0:
        assure_dir(log_dir)
        assure_dir(pth_dir)

    aux_config = cfgs.get('auxiliary', None)
    network = ModuleBuilder(cfgs['network'], aux_config).cuda()
    criterion = build_criterion(cfgs['criterion'], aux_config).cuda()
    optimizer = optim.SGD(network.parameters(), **cfgs['optimizer'])
    scheduler = PolyLRScheduler(optimizer, **cfgs['scheduler'])

    dataset = build_dataset(**cfgs['dataset'], **cfgs['transforms'])
    sampler = DistributedSampler4Iter(dataset, world_size=world_size,
                                      rank=local_rank, **cfgs['sampler'])
    train_loader = DataLoader(dataset, sampler=sampler, **cfgs['loader'])

    cudnn.benchmark = True
    torch.manual_seed(666)
    torch.cuda.manual_seed(666)
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')

    # Convert BatchNorm layers to SyncBN before wrapping with DDP, so the
    # converted modules are the ones that get replicated across ranks.
    network = apex.parallel.convert_syncbn_model(network)
    model = DistributedDataParallel(network)

    torch.cuda.empty_cache()
    train(local_rank, world_size, pth_dir, cfgs['frequency'], criterion,
          train_loader, model, optimizer, scheduler)
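# PolyLRScheduler is imported from elsewhere in this repo. The sketch below is
# an assumption rather than the repo's implementation: a minimal poly-decay
# scheduler with a hypothetical (optimizer, power, max_iter) signature,
# illustrating the schedule lr = base_lr * (1 - iter / max_iter) ** power.
from torch.optim.lr_scheduler import _LRScheduler

class PolyLRSchedulerSketch(_LRScheduler):
    def __init__(self, optimizer, power=0.9, max_iter=40000, last_epoch=-1):
        self.power = power
        self.max_iter = max_iter
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Scale every base LR by the poly factor of the current step count.
        factor = (1.0 - min(self.last_epoch, self.max_iter) / self.max_iter) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]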
def main():
    # make save dir
    if args.local_rank == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

    # launch the logger
    Log.init(
        log_level=args.log_level,
        log_file=osp.join(args.save_dir, args.log_file),
        log_format=args.log_format,
        rewrite=args.rewrite,
        stdout_level=args.stdout_level
    )

    # RGB or BGR input (RGB for ImageNet-pretrained models, BGR for Caffe-pretrained models)
    if args.rgb:
        IMG_MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
        IMG_VARS = np.array((0.229, 0.224, 0.225), dtype=np.float32)
    else:
        IMG_MEAN = np.array((104.00698793, 116.66876762, 122.67891434), dtype=np.float32)
        IMG_VARS = np.array((1, 1, 1), dtype=np.float32)

    # set models
    import libs.models as models
    deeplab = models.__dict__[args.arch](num_classes=args.num_classes, data_set=args.data_set)
    if args.restore_from is not None:
        saved_state_dict = torch.load(args.restore_from, map_location=torch.device('cpu'))
        new_params = deeplab.state_dict().copy()
        # Copy every pretrained parameter except the classifier ('fc') weights.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
        Log.info("load pretrained models")
        if deeplab.backbone is not None:
            deeplab.backbone.load_state_dict(new_params, strict=False)
        else:
            deeplab.load_state_dict(new_params, strict=False)
    else:
        Log.info("train from scratch")

    args.world_size = 1
    if 'WORLD_SIZE' in os.environ and args.apex:
        args.apex = int(os.environ['WORLD_SIZE']) > 1
        args.world_size = int(os.environ['WORLD_SIZE'])
        print("Total world size: ", int(os.environ['WORLD_SIZE']))
    if args.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    h, w = args.input_size, args.input_size
    input_size = (h, w)

    # Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)
    Log.info("Local Rank: {}".format(args.local_rank))
    torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # set optimizer
    optimizer = optim.SGD(
        [{'params': filter(lambda p: p.requires_grad, deeplab.parameters()),
          'lr': args.learning_rate}],
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)
    optimizer.zero_grad()

    # set on cuda
    deeplab.cuda()

    # models transformation: convert BN to SyncBN before wrapping with DDP,
    # so the converted modules are the ones that get replicated across ranks.
    deeplab = apex.parallel.convert_syncbn_model(deeplab)
    model = DistributedDataParallel(deeplab)
    model.train()
    model.float()
    model.cuda()

    # set loss function
    if args.ohem:
        criterion = CriterionOhemDSN(thresh=args.ohem_thres, min_kept=args.ohem_keep)  # OHEM CrossEntropy
        if "ic" in args.arch:
            criterion = CriterionICNet(thresh=args.ohem_thres, min_kept=args.ohem_keep)
        if "dfa" in args.arch:
            criterion = CriterionDFANet(thresh=args.ohem_thres, min_kept=args.ohem_keep)
    else:
        criterion = CriterionDSN()  # CrossEntropy
    criterion.cuda()

    cudnn.benchmark = True
    if args.world_size == 1:
        print(model)
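    # What CriterionDSN computes (a sketch under assumptions, not this repo's
    # code): with F = torch.nn.functional, the model is assumed to return
    # [main_pred, aux_pred]; both are upsampled to the label size and combined
    # as a deeply-supervised loss with ignore label 255 and an assumed 0.4
    # auxiliary weight:
    #
    #     h, w = target.size(1), target.size(2)
    #     main = F.interpolate(preds[0], size=(h, w), mode='bilinear', align_corners=True)
    #     aux = F.interpolate(preds[1], size=(h, w), mode='bilinear', align_corners=True)
    #     loss = F.cross_entropy(main, target, ignore_index=255) \
    #            + 0.4 * F.cross_entropy(aux, target, ignore_index=255)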
    # This is a little different from the usual multi-GPU training setting:
    # in distributed training each trainloader is a process that samples from
    # the dataset class.
    batch_size = args.gpu_num * args.batch_size_per_gpu
    max_iters = args.num_steps * batch_size / args.gpu_num

    # set data loader
    data_set = Cityscapes(args.data_dir, args.data_list, max_iters=max_iters,
                          crop_size=input_size, scale=args.random_scale,
                          mirror=args.random_mirror, mean=IMG_MEAN,
                          vars=IMG_VARS, RGB=args.rgb)
    trainloader = data.DataLoader(data_set,
                                  batch_size=args.batch_size_per_gpu,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  pin_memory=True)
    print("trainloader", len(trainloader))

    torch.cuda.empty_cache()

    # start training:
    for i_iter, batch in enumerate(trainloader):
        images, labels = batch
        images = images.cuda()
        labels = labels.long().cuda()
        optimizer.zero_grad()
        lr = adjust_learning_rate(optimizer, args, i_iter, len(trainloader))
        preds = model(images)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        reduce_loss = all_reduce_tensor(loss, world_size=args.gpu_num)
        if args.local_rank == 0:
            Log.info('iter = {} of {} completed, lr={}, loss = {}'.format(
                i_iter, len(trainloader), lr, reduce_loss.data.cpu().numpy()))
            if i_iter % args.save_pred_every == 0 and i_iter > args.save_start:
                print('save models ...')
                torch.save(deeplab.state_dict(),
                           osp.join(args.save_dir, str(args.arch) + str(i_iter) + '.pth'))

    end = timeit.default_timer()
    if args.local_rank == 0:
        Log.info("Training cost: " + str(end - start) + ' seconds')
        Log.info("Save final models")
        torch.save(deeplab.state_dict(),
                   osp.join(args.save_dir, str(args.arch) + '_final' + '.pth'))
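# adjust_learning_rate and all_reduce_tensor are defined elsewhere in this
# repo; the sketches below are assumptions, not the repo's code. The first is
# the standard poly LR policy (matching the PolyLRScheduler sketch above),
# assuming args carries learning_rate; the second averages a loss tensor
# across ranks so every process logs the same value.
import torch.distributed as dist

def adjust_learning_rate_sketch(optimizer, args, i_iter, max_iter, power=0.9):
    # Poly decay of the base LR, written into every parameter group.
    lr = args.learning_rate * ((1.0 - float(i_iter) / max_iter) ** power)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr

def all_reduce_tensor_sketch(tensor, world_size=1):
    # Sum across all ranks, then divide by world_size to get the mean loss.
    reduced = tensor.clone()
    dist.all_reduce(reduced, op=dist.ReduceOp.SUM)
    return reduced / world_size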
def main():
    # make save dir
    if args.local_rank == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

    # for tensorboard logs
    tb_path = osp.join(args.save_dir, "runs")
    writer = SummaryWriter(tb_path)

    # launch the logger
    Log.init(log_level=args.log_level,
             log_file=osp.join(args.save_dir, args.log_file),
             log_format=args.log_format,
             rewrite=args.rewrite,
             stdout_level=args.stdout_level)

    # RGB or BGR input (RGB for ImageNet-pretrained models, BGR for Caffe-pretrained models)
    if args.rgb:
        IMG_MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
        IMG_VARS = np.array((0.229, 0.224, 0.225), dtype=np.float32)
    else:
        IMG_MEAN = np.array((104.00698793, 116.66876762, 122.67891434), dtype=np.float32)
        IMG_VARS = np.array((1, 1, 1), dtype=np.float32)

    # set models
    import libs.models as models
    deeplab = models.__dict__[args.arch](num_classes=args.num_classes)
    # print(deeplab)
    if args.restore_from is not None:
        print("LOADING FROM PRETRAINED MODEL")
        saved_state_dict = torch.load(args.restore_from, map_location=torch.device('cpu'))
        new_params = deeplab.state_dict().copy()
        # Copy every pretrained parameter except the classifier ('fc') weights.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
        Log.info("load pretrained models")
        deeplab.load_state_dict(new_params, strict=False)
    else:
        Log.info("train from scratch")

    args.world_size = 1
    if 'WORLD_SIZE' in os.environ and args.apex:
        args.apex = int(os.environ['WORLD_SIZE']) > 1
        args.world_size = int(os.environ['WORLD_SIZE'])
        print("Total world size: ", int(os.environ['WORLD_SIZE']))
    if args.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    h, w = args.input_size, args.input_size
    input_size = (h, w)

    # Set the device according to local_rank (disabled: this script trains in
    # a single process).
    # torch.cuda.set_device(args.local_rank)
    # Log.info("Local Rank: {}".format(args.local_rank))
    # torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # set optimizer
    optimizer = optim.SGD(
        [{'params': filter(lambda p: p.requires_grad, deeplab.parameters()),
          'lr': args.learning_rate}],
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)
    optimizer.zero_grad()

    deeplab.cuda()

    # models transformation (DDP and SyncBN disabled: single-process training)
    # model = DistributedDataParallel(deeplab)
    # model = apex.parallel.convert_syncbn_model(model)
    model = deeplab
    model.train()
    model.float()
    model.cuda()

    # set loss function
    if args.ohem:
        criterion = CriterionOhemDSN(thresh=args.ohem_thres, min_kept=args.ohem_keep)  # OHEM CrossEntropy
    else:
        criterion = CriterionDSN()  # CrossEntropy
    criterion.cuda()

    cudnn.benchmark = True
    # if args.world_size == 1:
    #     print(model)
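    # Usage note: the SummaryWriter created above writes event files under
    # <save_dir>/runs, so the per-iteration and per-epoch loss curves logged
    # in the training loop below can be viewed with:
    #     tensorboard --logdir <save_dir>/runs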
    # This is a little different from the usual multi-GPU training setting:
    # in distributed training each trainloader is a process that samples from
    # the dataset class.
    batch_size = args.batch_size_per_gpu
    max_iters = args.num_steps * batch_size

    # set data loader
    # PASCAL-VOC -----------------
    from torchvision import transforms
    augs = transforms.Compose([
        transforms.RandomResizedCrop(300),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize([0.4589, 0.4355, 0.4032],
                             [0.2239, 0.2186, 0.2206])
    ])
    if args.data_set == 'pascalvoc':
        data_set = VOCSegmentation(args.data_dir,
                                   image_set='val',
                                   scale=args.random_scale,
                                   mean=IMG_MEAN,
                                   vars=IMG_VARS,
                                   transforms=augs)
    elif args.data_set == 'cityscapes':
        data_set = Cityscapes(args.data_dir, args.data_list,
                              crop_size=input_size,
                              scale=args.random_scale,
                              mirror=args.random_mirror,
                              mean=IMG_MEAN,
                              vars=IMG_VARS,
                              RGB=args.rgb)

    # One-off per-class pixel count over the dataset (disabled):
    # instance_count = [0] * 22
    # for _, label in data_set:
    #     for pixel in label.flatten():
    #         if int(pixel) == 255:
    #             pixel = 21
    #         instance_count[int(pixel)] += 1
    # print(instance_count)
    # sys.exit()

    trainloader = data.DataLoader(data_set,
                                  batch_size=args.batch_size_per_gpu,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  pin_memory=True)
    print("trainloader", len(trainloader))

    torch.cuda.empty_cache()

    # start training:
    iter_no = 0
    for epoch in range(args.num_steps):
        print("epoch " + str(epoch + 1))
        total_loss = 0
        total_correct = 0
        for i_iter, batch in enumerate(trainloader):
            if i_iter % 100 == 0:
                print("iteration " + str(i_iter + 1))
            images, labels = batch
            images = images.cuda()
            labels = labels.long().cuda()
            optimizer.zero_grad()
            # Note: i_iter resets every epoch, so this poly schedule restarts
            # at the start of each epoch.
            lr = adjust_learning_rate(optimizer, args, i_iter, len(trainloader))
            preds = model(images)
            loss = criterion(preds, labels)
            total_loss += loss.item()
            writer.add_scalar("Loss_vs_Iteration", loss.item(), iter_no)
            iter_no += 1
            loss.backward()
            optimizer.step()
        writer.add_scalar("Loss_vs_Epoch", total_loss / len(trainloader), epoch)
        # writer.add_scalar("Correct", total_correct, epoch)
        # writer.add_scalar("Accuracy", total_correct / len(data_set), epoch)

        # Per-iteration distributed logging/saving (disabled):
        # reduce_loss = all_reduce_tensor(loss, world_size=args.gpu_num)
        # if args.local_rank == 0:
        #     Log.info('iter = {} of {} completed, lr={}, loss = {}'.format(
        #         i_iter, len(trainloader), lr, reduce_loss.data.cpu().numpy()))
        #     if i_iter % args.save_pred_every == 0 and i_iter > args.save_start:
        #         print('save models ...')
        #         torch.save(deeplab.state_dict(),
        #                    osp.join(args.save_dir, str(args.arch) + str(i_iter) + '.pth'))

        if args.local_rank == 0:
            if epoch % 9 == 0:
                print('save models ...')
                torch.save(deeplab.state_dict(),
                           osp.join(args.save_dir, str(args.arch) + str(i_iter) + '.pth'))

    writer.close()
    end = timeit.default_timer()
    if args.local_rank == 0:
        Log.info("Training cost: " + str(end - start) + ' seconds')
        Log.info("Save final models")
        torch.save(deeplab.state_dict(),
                   osp.join(args.save_dir,
                            str(args.arch) + '_' + str(args.num_steps) + 'epoch_' +
                            str(args.batch_size_per_gpu) + '.pth'))
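# total_correct in the epoch loop above is initialized but never updated, so
# the commented Correct/Accuracy scalars would always log zero. Below is a
# sketch of a per-batch helper (an assumption, not the original author's code)
# that counts correctly classified pixels from the main prediction head,
# skipping the ignore label 255; inside the batch loop it would be used as
# total_correct += count_correct_pixels_sketch(preds, labels).
import torch
import torch.nn.functional as F

def count_correct_pixels_sketch(preds, labels, ignore_index=255):
    # Use the main head if the model returns [main, aux] predictions.
    main_pred = preds[0] if isinstance(preds, (list, tuple)) else preds
    # Upsample logits to label resolution, then take the argmax class map.
    main_pred = F.interpolate(main_pred, size=labels.shape[-2:],
                              mode='bilinear', align_corners=True)
    pred_classes = main_pred.argmax(dim=1)
    valid = labels != ignore_index
    return (pred_classes[valid] == labels[valid]).sum().item()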