def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            # SM Distributed: Moves input tensors to the GPU ID used by the current process
            # based on the set_device call.
            data, target = data.to(device), target.to(device)
            # Since test_step returns scalars instead of tensors,
            # test_step decorated with smp.step will return lists instead of StepOutput objects.
            loss_batch, correct_batch = test_step(model, data, target)
            test_loss += sum(loss_batch)
            correct += sum(correct_batch)
            if args.num_batches and batch_idx + 1 == args.num_batches:
                break

    test_loss /= len(test_loader.dataset)

    if smp.mp_rank() == 0:
        print(
            "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
                test_loss,
                correct,
                len(test_loader.dataset),
                100.0 * correct / len(test_loader.dataset),
            )
        )
    return test_loss
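# The evaluation loop above relies on an smp.step-decorated test_step helper.
# Below is a minimal sketch of what such a helper could look like; the model
# head and loss (MNIST-style classification with nll_loss) are assumptions for
# illustration, not taken from the original source. Because the function
# returns Python scalars rather than tensors, smp collects the per-microbatch
# values into plain lists, which is why test() sums loss_batch and
# correct_batch directly.
import torch.nn.functional as F

@smp.step
def test_step(model, data, target):
    output = model(data)
    loss = F.nll_loss(output, target, reduction="sum").item()  # summed batch loss as a scalar
    pred = output.argmax(dim=1, keepdim=True)                  # index of the max log-probability
    correct = pred.eq(target.view_as(pred)).sum().item()       # number of correct predictions
    return loss, correct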
def setup_training(args):
    assert torch.cuda.is_available()

    if args.smp > 0:
        # Initialize SMP. The configuration is obtained from the parameters passed to
        # the SageMaker PyTorch estimator.
        smp.init()

    # SMP: Set the device to the GPU ID used by the current process.
    # Input tensors should be transferred to this device.
    torch.cuda.set_device(smp.local_rank())
    device = torch.device("cuda", smp.local_rank())
    args.n_gpu = 1

    # if args.local_rank == -1:
    #     device = torch.device("cuda")
    #     args.n_gpu = torch.cuda.device_count()
    #     args.allreduce_post_accumulation = False
    #     args.allreduce_post_accumulation_fp16 = False
    # else:
    #     torch.cuda.set_device(args.local_rank)
    #     device = torch.device("cuda", args.local_rank)
    #     # Initializes the distributed backend which takes care of synchronizing nodes/GPUs
    #     torch.distributed.init_process_group(backend='nccl', init_method='env://')
    #     args.n_gpu = 1

    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    print(
        "device: {} n_gpu: {}, mp_rank: {}, rank: {}, distributed training: {}, 16-bits training: {}".format(
            device, args.n_gpu, smp.mp_rank(), smp.rank(),
            bool(args.local_rank != -1), args.fp16,
        )
    )

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps
            )
        )
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
                args.gradient_accumulation_steps, args.train_batch_size
            )
        )

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if (
        not args.resume_from_checkpoint
        and os.path.exists(args.output_dir)
        and (
            os.listdir(args.output_dir)
            and any([i.startswith("ckpt") for i in os.listdir(args.output_dir)])
        )
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir)
        )

    if (not args.resume_from_checkpoint or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
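# setup_training() expects the SMP configuration to come from the SageMaker
# PyTorch estimator that launches the job. A hedged sketch of such an estimator
# follows; the entry point, instance type, framework version, and parameter
# values are illustrative assumptions, not the original job configuration.
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="train.py",          # hypothetical training script
    role=role,                        # execution role, assumed to be defined elsewhere
    instance_count=1,
    instance_type="ml.p3.16xlarge",
    framework_version="1.8.1",
    py_version="py36",
    distribution={
        "smdistributed": {
            "modelparallel": {
                "enabled": True,
                # Example SMP parameters; values are assumptions.
                "parameters": {"partitions": 2, "microbatches": 4, "ddp": True},
            }
        },
        "mpi": {"enabled": True, "processes_per_host": 8},
    },
)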
def is_world_process_zero(self) -> bool:
    """
    Whether or not this process is the global main process (when training in a
    distributed fashion on several machines, this is only going to be
    :obj:`True` for one process).
    """
    if self.is_model_parallel_enabled:
        return (
            smp.rank() == 0
            and smp.local_rank() == 0
            and smp.mp_rank() == 0
            and smp.dp_rank() == 0
        )
    else:
        return super().is_world_process_zero()
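# Hypothetical usage of the override above: gate work that should run exactly
# once per job (e.g. persisting the final artifacts) on the global main
# process. `trainer` is assumed to be an instance of the Trainer subclass that
# defines is_world_process_zero(); `tokenizer` is assumed to be in scope.
if trainer.is_world_process_zero():
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)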
def dist_setting(args):
    # args.data_parallel = False
    print("args.data_parallel : {}".format(args.data_parallel))
    print("args.model_parallel : {}".format(args.model_parallel))
    print("args.apex : {}".format(args.apex))

    args.world_size = 1
    args.host_num = args.hosts.index(args.current_host)

    if args.data_parallel:
        args.world_size = sdp.get_world_size()
        args.rank = sdp.get_rank()              # global rank across all hosts
        args.local_rank = sdp.get_local_rank()  # rank within the host
    elif args.model_parallel:
        args.world_size = smp.size()
        args.local_rank = smp.local_rank()      # rank within the host
        args.rank = smp.rank()
        args.dp_size = smp.dp_size()
        args.dp_rank = smp.dp_rank()
        print(
            "smp.rank() : {}, smp.size() : {}, smp.mp_rank() : {}, smp.local_size() : {}, "
            "smp.get_mp_group() : {}, smp.get_dp_group() : {}, smp.local_rank() : {}, "
            "smp.dp_size() : {}, smp.dp_rank() : {}".format(
                smp.rank(), smp.size(), smp.mp_rank(), smp.local_size(),
                smp.get_mp_group(), smp.get_dp_group(), smp.local_rank(),
                smp.dp_size(), smp.dp_rank(),
            )
        )
    else:
        args.world_size = len(args.hosts) * args.num_gpus
        if args.local_rank is not None:
            # Global rank across all hosts.
            args.rank = args.num_gpus * args.host_num + args.local_rank

        dist.init_process_group(backend=args.backend,
                                rank=args.rank,
                                world_size=args.world_size)
        logger.info(
            "Initialized the distributed environment: '{}' backend on {} nodes. ".format(
                args.backend, dist.get_world_size()
            ) + "Current host rank is {}. Number of gpus: {}".format(
                dist.get_rank(), args.num_gpus
            )
        )

    print("**** [dist_setting] args.rank : {}".format(args.rank))
    print("args.world_size : {}".format(args.world_size))
    print("Use GPU: {} for training".format(args.local_rank))

    # Scale the learning rate by the number of workers and divide the batch size
    # by the number of hosts (world_size // num_gpus).
    args.lr = args.lr * float(args.world_size)
    args.batch_size //= args.world_size // args.num_gpus
    args.batch_size = max(args.batch_size, 1)

    return args
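# The helpers in this section rely on module aliases that are assumed to be
# imported elsewhere in the training scripts; a sketch of those imports is
# given below, inferred from how the names are used above.
import os
import logging
import torch
import torch.distributed as dist
import smdistributed.modelparallel.torch as smp               # SageMaker model parallelism
import smdistributed.dataparallel.torch.distributed as sdp    # SageMaker data parallelism (v1 API)

logger = logging.getLogger(__name__)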