def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.is_distributed:
        print("INFO:PyTorch: Initialize process group for distributed training")
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        distributed.init_process_group(backend=args.dist_backend,
                                       init_method=args.dist_url,
                                       world_size=args.world_size,
                                       rank=args.rank)

    if args.gpu is not None:
        if not args.evaluate:
            print("INFO:PyTorch: Use GPU: {} for training, the rank of this GPU is {}".format(
                args.gpu, args.rank))
        else:
            print("INFO:PyTorch: Use GPU: {} for evaluating, the rank of this GPU is {}".format(
                args.gpu, args.rank))

    # set the name of the process
    setproctitle.setproctitle(args.proc_name + '_rank{}'.format(args.rank))
    if not args.multiprocessing_distributed or \
            (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
        # define tensorboard summary
        val_writer = SummaryWriter(log_dir=os.path.join(args.model_dir, 'val'))

    # define loss function (criterion) and optimizer
    if args.is_label_smoothing:
        criterion = label_smoothing.label_smoothing_CE(reduction='mean')
    else:
        criterion = nn.CrossEntropyLoss()

    # create model
    if args.pretrained:
        model_info = "INFO:PyTorch: using pre-trained model '{}'".format(args.arch)
    else:
        model_info = "INFO:PyTorch: creating model '{}'".format(args.arch)
    print(model_info)
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=criterion)

    # print the number of parameters in the model
    print("INFO:PyTorch: The number of parameters in the model is {}".format(
        metric.get_the_number_of_params(model)))

    if args.is_summary:
        summary_choice = 0
        if summary_choice == 0:
            summary.summary(model,
                            torch.rand((1, 3, args.crop_size, args.crop_size)),
                            target=torch.ones(1, dtype=torch.long))
        else:
            flops, params = profile(model,
                                    inputs=(torch.rand((1, 3, args.crop_size, args.crop_size)),
                                            torch.ones(1, dtype=torch.long),
                                            'summary'))
            print(clever_format([flops, params], "%.4f"))
        return None

    if args.is_distributed:
        if args.world_size > 1 and args.is_syncbn:
            print("INFO:PyTorch: convert torch.nn.BatchNormND layer in the model to torch.nn.SyncBatchNorm layer")
            # only single gpu per process is currently supported
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)

    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # optimizer
    param_groups = model.parameters() if args.is_wd_all else lr_scheduler.get_parameter_groups(model)
    if args.is_wd_all:
        print("INFO:PyTorch: Applying weight decay to all learnable parameters in the model.")

    if args.optimizer == 'SGD':
        print("INFO:PyTorch: using SGD optimizer.")
        optimizer = torch.optim.SGD(param_groups,
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True if args.is_nesterov else False)
    elif args.optimizer == "AdamW":
        print("INFO:PyTorch: using AdamW optimizer.")
        optimizer = torch.optim.AdamW(param_groups,
                                      lr=args.lr,
                                      betas=(0.9, 0.999),
                                      eps=1e-4,
                                      weight_decay=args.weight_decay)
    elif args.optimizer == "RMSprop":
        # See efficientNet at https://github.com/tensorflow/tpu/
        print("INFO:PyTorch: using RMSprop optimizer.")
        optimizer = torch.optim.RMSprop(param_groups,
                                        lr=args.lr,
                                        alpha=0.9,
                                        weight_decay=args.weight_decay,
                                        momentum=0.9)
    elif args.optimizer == "RMSpropTF":
        # https://github.com/rwightman/pytorch-image-models/blob/fcb6258877/timm/optim/rmsprop_tf.py
        print("INFO:PyTorch: using RMSpropTF optimizer.")
        optimizer = rmsprop_tf.RMSpropTF(param_groups,
                                         lr=args.lr,
                                         alpha=0.9,
                                         eps=0.001,
                                         weight_decay=args.weight_decay,
                                         momentum=0.9,
                                         decoupled_decay=False)
    else:
        raise NotImplementedError

    # PyTorch AMP loss scaler
    scaler = None if not args.is_amp else amp.GradScaler()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            """
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            """
            model.load_state_dict(checkpoint['state_dict'])
            print("INFO:PyTorch: Loading state_dict of optimizer")
            optimizer.load_state_dict(checkpoint['optimizer'])
            if "scaler" in checkpoint:
                print("INFO:PyTorch: Loading state_dict of AMP loss scaler")
                scaler.load_state_dict(checkpoint['scaler'])
            print("INFO:PyTorch: => loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    # Data loading code
    data_split_factor = args.loop_factor if args.is_diff_data_train else 1
    print("INFO:PyTorch: => The number of views of train data is '{}'".format(data_split_factor))

    train_loader, train_sampler = factory.get_data_loader(args.data,
                                                          split_factor=data_split_factor,
                                                          batch_size=args.batch_size,
                                                          crop_size=args.crop_size,
                                                          dataset=args.dataset,
                                                          split="train",
                                                          is_distributed=args.is_distributed,
                                                          is_autoaugment=args.is_autoaugment,
                                                          randaa=args.randaa,
                                                          is_cutout=args.is_cutout,
                                                          erase_p=args.erase_p,
                                                          num_workers=args.workers)
    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # learning rate scheduler
    scheduler = lr_scheduler.lr_scheduler(mode=args.lr_mode,
                                          init_lr=args.lr,
                                          num_epochs=args.epochs,
                                          iters_per_epoch=len(train_loader),
                                          lr_milestones=args.lr_milestones,
                                          lr_step_multiplier=args.lr_step_multiplier,
                                          slow_start_epochs=args.slow_start_epochs,
                                          slow_start_lr=args.slow_start_lr,
                                          end_lr=args.end_lr,
                                          multiplier=args.lr_multiplier,
                                          decay_factor=args.decay_factor,
                                          decay_epochs=args.decay_epochs,
                                          staircase=True)

    if args.evaluate:
        validate(val_loader, model, args)
        return None

    saved_ckpt_filenames = []

    streams = None
    # streams = [torch.cuda.Stream() for i in range(args.loop_factor)]

    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.is_distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, optimizer, scheduler, epoch, args, streams, scaler=scaler)

        if (epoch + 1) % args.eval_per_epoch == 0:
            # evaluate on validation set
            acc_all = validate(val_loader, model, args)

            # remember best acc@1 and save checkpoint
            is_best = acc_all[0] > best_acc1
            best_acc1 = max(acc_all[0], best_acc1)

            # save checkpoint
            if not args.multiprocessing_distributed or \
                    (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                # summary per epoch
                val_writer.add_scalar('avg_acc1', acc_all[0], global_step=epoch)
                if args.dataset == 'imagenet':
                    val_writer.add_scalar('avg_acc5', acc_all[1], global_step=epoch)
                for i in range(2, args.loop_factor + 2):
                    val_writer.add_scalar('{}_acc1'.format(i - 1), acc_all[i], global_step=epoch)
                val_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch)
                val_writer.add_scalar('best_acc1', best_acc1, global_step=epoch)

                # save checkpoints
                filename = "checkpoint_{0}.pth.tar".format(epoch)
                saved_ckpt_filenames.append(filename)
                # remove the oldest file if the number of saved ckpts is greater than args.max_ckpt_nums
                if len(saved_ckpt_filenames) > args.max_ckpt_nums:
                    os.remove(os.path.join(args.model_dir, saved_ckpt_filenames.pop(0)))

                ckpt_dict = {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }

                if args.is_amp:
                    ckpt_dict['scaler'] = scaler.state_dict()

                metric.save_checkpoint(ckpt_dict, is_best, args.model_dir, filename=filename)

    # clean GPU cache
    torch.cuda.empty_cache()
    sys.exit(0)
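
# NOTE (illustrative sketch, not part of the original file): main_worker() above is
# written to be called once per GPU. In the standard PyTorch multiprocessing-distributed
# pattern this is done with torch.multiprocessing.spawn from the launcher (typically
# main()); the hypothetical helper below shows that call and assumes args already
# carries world_size, dist_url, gpu, and the other flags used above.
def _example_spawn_main_worker(args):
    """Hypothetical launcher showing how main_worker is usually driven."""
    import torch.multiprocessing as mp

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # one process per GPU; the total world size covers all nodes
        args.world_size = ngpus_per_node * args.world_size
        # spawn passes the process index as the first argument (the gpu id)
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # single-process training (or evaluation) on args.gpu
        main_worker(args.gpu, ngpus_per_node, args)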
def multistreams_test(args):
    """
    A simple program for validating the idea of running multiple models in
    parallel on a single GPU via multiple CUDA streams.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {k.replace("module.models", "models"): v for k, v in old_dict.items()}

            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the evaluation
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1, avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    # move the small models to the GPU, one CUDA stream per model
    cuda_models = []
    cuda_streams = []
    for idx in range(args.split_factor):
        cuda_streams.append(torch.cuda.Stream())
        cuda_models.append(model.models[idx].cuda(0))
    torch.cuda.synchronize()

    # record time and number of samples
    n_count = 0.0
    start_time = time.time()

    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(0, non_blocking=True)
            target = target.cuda(0, non_blocking=True)

            collect_outputs = []
            if args.is_amp:
                with torch.cuda.stream(cuda_streams[0]):
                    with amp.autocast():
                        output_0 = cuda_models[0](images)
                with torch.cuda.stream(cuda_streams[1]):
                    with amp.autocast():
                        output_1 = cuda_models[1](images)
            else:
                for idx in range(args.split_factor):
                    with torch.cuda.stream(cuda_streams[idx]):
                        collect_outputs.append(cuda_models[idx](images))
            # wait for all streams to finish before gathering their outputs
            torch.cuda.synchronize()

            if args.is_amp:
                # the AMP branch above only handles split_factor == 2; its outputs are fp16
                collect_outputs.extend([output_0, output_1])
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

            # measure accuracy and record loss
            batch_size_now = images.size(0)
            n_count += batch_size_now
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)

            # simply average the outputs of the small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            # if i >= 200:
            #     break
            if i % args.print_freq == 0:
                progress.print(i)

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])

    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)

    print(acc_info)
    print("The tested architecture is {} with split_factor {}".format(
        args.arch,
        args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} seconds".format(time_cnt))
    print("The average test time is {} ms per image".format(1000 * time_cnt / n_count))
    torch.cuda.empty_cache()
    sys.exit(0)
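
# NOTE (illustrative sketch, not part of the original file): the AMP branch in
# multistreams_test() above is hard-coded for two sub-networks. The hypothetical
# helper below generalizes that pattern: it dispatches every sub-network on its own
# CUDA stream and synchronizes once before the outputs are gathered.
def _example_multistream_forward(cuda_models, cuda_streams, images, use_amp=False):
    """Run each sub-network on its own CUDA stream and return the list of outputs."""
    outputs = [None] * len(cuda_models)
    for idx, (sub_model, stream) in enumerate(zip(cuda_models, cuda_streams)):
        with torch.cuda.stream(stream):
            if use_amp:
                with amp.autocast():
                    outputs[idx] = sub_model(images)
            else:
                outputs[idx] = sub_model(images)
    # wait for all streams before the host stream consumes the results
    torch.cuda.synchronize()
    return outputs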
def multigpu_test_2gpus(args):
    """
    A simple program for validating the idea of running multiple models in
    parallel on multiple GPUs.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {k.replace("module.models", "models"): v for k, v in old_dict.items()}

            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the evaluation
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1, avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    # move the model(s) to the gpu(s)
    if args.is_test_on_multigpus:
        print("INFO:PyTorch: multi GPUs test")
        cuda_models = []
        for idx in range(args.split_factor):
            cuda_models.append(model.models[idx].cuda(idx))
    else:
        print("INFO:PyTorch: single GPU test")
        model = model.cuda(0)

    with torch.no_grad():
        # record time and number of samples
        prefetcher = data_prefetcher_2gpus(val_loader, ngpus=args.split_factor)
        images_gpu0, target, images_gpu1 = prefetcher.next()
        i = 0
        n_count = 0.0
        start_time = time.time()

        while images_gpu0 is not None:
            i += 1
            # for i, (images, target) in enumerate(val_loader):
            # compute outputs and losses
            if args.is_test_on_multigpus:
                if args.is_amp:
                    with amp.autocast():
                        output_gpu0 = cuda_models[0](images_gpu0)
                    with amp.autocast():
                        output_gpu1 = cuda_models[1](images_gpu1)
                else:
                    output_gpu0 = cuda_models[0](images_gpu0)
                    output_gpu1 = cuda_models[1](images_gpu1)

                if _GEO_TEST:
                    if i == 1:
                        print("using geometric mean")
                    output_gpu0 = F.softmax(output_gpu0, dim=-1)
                    output_gpu1 = F.softmax(output_gpu1, dim=-1)
                    ensemble_output = torch.sqrt(output_gpu0 * output_gpu1.cuda(0))
                else:
                    outputs = torch.stack([output_gpu0, output_gpu1.cuda(0)])
                    ensemble_output = torch.mean(outputs, dim=0)
            else:
                # compute outputs and losses
                if args.is_amp:
                    with amp.autocast():
                        ensemble_output, outputs, ce_loss = model(images_gpu0, target=target, mode='val')
                else:
                    ensemble_output, outputs, ce_loss = model(images_gpu0, target=target, mode='val')

            # measure accuracy and record loss
            """
            target = target.cpu()
            ensemble_output = ensemble_output.cpu().float()
            outputs = outputs.cpu().float()
            """
            batch_size_now = images_gpu0.size(0)
            """
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)
            """
            # simply average the outputs of the small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            images_gpu0, target, images_gpu1 = prefetcher.next()
            n_count += batch_size_now
            """
            if i % args.print_freq == 0:
                progress.print(i)
            """

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    """
    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
    """
    print(acc_info)
    print("multiple GPUs ({})".format(args.is_test_on_multigpus))
    print("The tested architecture is {} with split_factor {}".format(args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} seconds".format(time_cnt))
    print("The average test time is {} ms per image".format(1000 * time_cnt / n_count))
    torch.cuda.empty_cache()
    sys.exit(0)
def multigpu_test(args):
    """
    A simple program for validating the idea of running multiple models in
    parallel on multiple GPUs.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {k.replace("module.models", "models"): v for k, v in old_dict.items()}

            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the evaluation
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top-1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1, avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    n_count = 0.0

    # move each small model to its own GPU
    cuda_models = []
    for idx in range(args.split_factor):
        cuda_models.append(model.models[idx].cuda(idx))

    start_time = time.time()
    for i, (images, target) in enumerate(val_loader):
        cuda_images = []
        cuda_outputs = []
        collect_outputs = []
        target = target.cuda(0, non_blocking=True)

        for idx in range(args.split_factor):
            cuda_images.append(images.cuda(idx, non_blocking=True))

        if args.is_amp:
            with amp.autocast():
                for idx in range(args.split_factor):
                    cuda_outputs.append(cuda_models[idx](cuda_images[idx]))
        else:
            for idx in range(args.split_factor):
                cuda_outputs.append(cuda_models[idx](cuda_images[idx]))

        for idx in range(args.split_factor):
            # use the first gpu as the host gpu
            collect_outputs.append(cuda_outputs[idx].cuda(0))

        if _GEO_TEST:
            if i == 1:
                print("using geometric mean")
            cmul = 1.0
            for j in range(args.split_factor):
                cmul = cmul * F.softmax(cuda_outputs[j].cuda(0), dim=-1)
            # ensemble_output = torch.pow(cmul, 1.0 / args.split_factor)
            ensemble_output = torch.sqrt(cmul)
        else:
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

        batch_size_now = images.size(0)
        """
        for j in range(args.loop_factor):
            acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
            top1_all[j].update(acc1[0].item(), batch_size_now)
        """
        # simply average the outputs of the small networks
        avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
        avg_top1.update(avg_acc1[0].item(), batch_size_now)
        avg_top5.update(avg_acc5[0].item(), batch_size_now)
        n_count += batch_size_now
        """
        if i % args.print_freq == 0:
            progress.print(i)
        """

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    """
    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
    """
    print(acc_info)
({})".format(args.is_test_on_multigpus)) print("The tested architecture is {} with split_factor {}".format( args.arch, args.split_factor)) print("The number of the samples is {}".format(n_count)) print("The total testing time is {} second".format(time_cnt)) print("The average test time is {}ms per images".format(1000 * time_cnt / n_count)) torch.cuda.empty_cache() sys.exit(0)