def test_ddp_dist_autograd_sparse_grads(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    model = nn.EmbeddingBag(10, 3, sparse=True)
    ddp_model = DistributedDataParallel(model)

    # Different inputs for each rank.
    input = torch.LongTensor(10).random_(0, 10)
    offsets = torch.LongTensor([0, 4])

    # Run local.
    loss = ddp_model(input, offsets).sum()
    loss.backward()

    with dist_autograd.context() as context_id:
        loss = ddp_model(input, offsets).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        self.assertEqual(1, len(grads_dict))
        self.assertEqual(model.weight.grad, grads_dict[model.weight])
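# Note: the tests in this section assume the suite's usual module-level setup.
# A minimal sketch of the assumed imports and of the file-based init-method
# template (the exact constants are defined elsewhere in the test harness):
#
#     import torch
#     import torch.distributed as dist
#     import torch.distributed.autograd as dist_autograd
#     import torch.distributed.rpc as rpc
#     import torch.nn as nn
#     from torch.nn.parallel import DistributedDataParallel
#
#     INIT_METHOD_TEMPLATE = "file://{file_name}"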
def _run_basic_test(self, backend, checkpoint):
    dist.init_process_group(
        backend=backend,
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
    # spans GPUs 2 and 3. The two replicas are synchronized via DDP.
    fc1 = nn.Linear(16, 8).cuda(2 * self.rank)
    fc2 = nn.Linear(8, 4).cuda(2 * self.rank + 1)
    model = nn.Sequential(fc1, fc2)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)
    model = DistributedDataParallel(model)
    out = model(torch.rand(16, 16).cuda(2 * self.rank)).local_value()
    out.sum().backward()

    # Check grads
    output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
    dist.all_gather(output, fc1.weight.grad)
    self.assertEqual(output[0], output[1])

    output = [torch.empty_like(fc2.weight.grad), torch.empty_like(fc2.weight.grad)]
    dist.all_gather(output, fc2.weight.grad)
    self.assertEqual(output[0], output[1])
def _run_test_ddp_comparision(self, simulate_uneven_inputs=False):
    gLogger.info(f"Running trainer rank: {self.rank}")
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )
    net = nn.Linear(2, 3)
    ddp_net = DistributedDataParallel(net)

    # Odd ranks join early if simulate_uneven_inputs.
    num_inputs = 1
    if simulate_uneven_inputs:
        if self.rank % 2 == 0:
            num_inputs += 2
    inputs_list = [torch.rand((3, 2)) for _ in range(num_inputs)]

    if simulate_uneven_inputs:
        gLogger.info(f"Rank {self.rank} training with {len(inputs_list)} inputs.")

    # Use distributed autograd. The gradients will be in the RPC context map.
    grads_dict = {}
    with ddp_net.join(simulate_uneven_inputs):
        for i, inputs in enumerate(inputs_list):
            with dist_autograd.context() as context_id:
                loss = ddp_net(inputs).norm()
                dist_autograd.backward(context_id, [loss])
                grads_dict = dist_autograd.get_gradients(context_id)
            gLogger.info(f"Trainer #{self.rank} got grad dict: {grads_dict}")

            # Use local autograd. The gradients will be in each variable's '.grad'.
            ddp_net.zero_grad()
            loss = ddp_net(inputs).norm()
            loss.backward()

            # The gradients should be the same.
            for param in net.parameters():
                self.assertTrue(
                    param in grads_dict,
                    msg=f"Param {param} is not in dist_autograd grads dict {grads_dict} for iteration {i}",
                )
                self.assertEqual(
                    grads_dict[param],
                    param.grad,
                    msg=f"The grads for param {param} are different under local "
                    f"and dist autograd: {param.grad} \n---\n {grads_dict[param]} for iteration {i}",
                )
    dist.destroy_process_group()
def _run_basic_test(
    self, backend, checkpoint, find_unused_parameters=False, static_graph=False
):
    dist.init_process_group(
        backend=backend,
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
    # spans GPUs 2 and 3. The two replicas are synchronized via DDP.
    fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

    class MyModule(nn.Module):
        def __init__(self, device):
            super(MyModule, self).__init__()
            self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
            self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

        def forward(self, inp):
            if find_unused_parameters:
                return self.fc2(inp)
            else:
                return self.fc3(self.fc2(inp))

    layer2 = MyModule(2 * self.rank + 1)
    model = nn.Sequential(fc1, layer2)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)
    model = DistributedDataParallel(model, find_unused_parameters=find_unused_parameters)
    if static_graph:
        model._set_static_graph()
    out = model(torch.rand(16, 16).cuda(2 * self.rank)).local_value()
    out.sum().backward()

    # Run forward again for find_unused_parameters to trigger any potential errors.
    if find_unused_parameters:
        model(torch.rand(16, 16).cuda(2 * self.rank))

    # Check grads
    output = [torch.empty_like(fc1.weight.grad), torch.empty_like(fc1.weight.grad)]
    dist.all_gather(output, fc1.weight.grad)
    self.assertEqual(output[0], output[1])

    output = [torch.empty_like(layer2.fc2.weight.grad), torch.empty_like(layer2.fc2.weight.grad)]
    dist.all_gather(output, layer2.fc2.weight.grad)
    self.assertEqual(output[0], output[1])

    if not find_unused_parameters:
        output = [torch.empty_like(layer2.fc3.weight.grad), torch.empty_like(layer2.fc3.weight.grad)]
        dist.all_gather(output, layer2.fc3.weight.grad)
        self.assertEqual(output[0], output[1])
def _remote_worker_process(self):
    gLogger.info("The remote worker is running.")
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    global shutdown_signal
    with shutdown_signal:
        shutdown_signal.wait()
    gLogger.info("Exiting remote worker.")
    dist.destroy_process_group()
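# The worker and trainer helpers block on a global `shutdown_signal` until the
# master tells them to exit. A minimal sketch of the assumed setup, using a
# module-level threading.Condition (the name `set_shutdown_signal` is
# illustrative; the harness may release the workers differently):
#
#     import threading
#
#     shutdown_signal = threading.Condition()
#
#     def set_shutdown_signal():
#         global shutdown_signal
#         with shutdown_signal:
#             shutdown_signal.notify_all()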
def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool):
    gLogger.info("Running the master process...")
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )
    remote_em_rref = rpc.remote(
        self.remote_worker_name(), RemoteEM, args=(NUM_EM_ROW, D_SPARSE)
    )
    remote_net_rref = rpc.remote(
        self.remote_worker_name(), RemoteNet, args=(D_DENSE + D_SPARSE, D_HID)
    )
    gLogger.info("Created remote rrefs on master")

    self.do_test_on_master(
        ddp_mode, simulate_uneven_inputs, remote_em_rref, remote_net_rref
    )
def test_ddp_dist_autograd_local_vs_remote(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use two different remote device input strings, with and without the
    # default device string "cpu", respectively.
    for remote_device in ["worker0/cpu", "worker0"]:
        remote_layer1 = RemoteModule(
            remote_device=remote_device, module_cls=nn.Linear, args=(10, 5, False)
        )
        layer1 = nn.Linear(10, 5, False)
        # Start with the same parameters for remote and local.
        layer1.weight = remote_layer1.module_rref.to_here().weight

        # Run local case.
        layer2 = nn.Linear(5, 1)
        inputs = torch.rand((10, 10))
        ddp_model = DistributedDataParallel(layer2)
        loss = ddp_model(layer1(inputs)).sum()
        loss.backward()

        # Run remote case.
        with dist_autograd.context() as context_id:
            loss = ddp_model(remote_layer1(inputs)).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            dist.barrier()
            self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
            self.assertEqual(
                layer1.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    CommonDdpComparisonTest.get_remote_grads,
                    args=(remote_layer1.module_rref, context_id),
                ),
            )
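# The remote-side assertion above RPCs into "worker0" and calls
# CommonDdpComparisonTest.get_remote_grads, which is defined elsewhere in the
# suite. A minimal sketch of what that helper is assumed to look like: a
# staticmethod on the test class that looks up the remote module's weight
# gradient in the given dist_autograd context.
#
#     @staticmethod
#     def get_remote_grads(rref, context_id):
#         return dist_autograd.get_gradients(context_id)[rref.local_value().weight]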
def _remote_worker_process(self, ddp_mode):
    gLogger.info("The remote worker is running.")
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    if ddp_mode in (DdpMode.INSIDE, DdpMode.OUTSIDE):
        # new_group needs to be called on all ranks.
        dist.new_group(TRAINER_RANKS)

    global shutdown_signal
    with shutdown_signal:
        shutdown_signal.wait()
    gLogger.info("Exiting remote worker.")
    dist.destroy_process_group()
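# DdpMode and TRAINER_RANKS come from the surrounding harness. A sketch of the
# assumed definitions: an enum selecting where (if anywhere) DDP wraps the
# trainer's model, and the list of trainer ranks used to build the trainer
# process group (NUM_TRAINERS is a harness constant, shown here only for
# illustration):
#
#     import enum
#
#     DdpMode = enum.Enum("DdpMode", "NONE OUTSIDE INSIDE")
#     TRAINER_RANKS = list(range(NUM_TRAINERS))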
def _trainer_process(self, rank: int):
    gLogger.info(f"Running the trainer #{rank}...")
    gLogger.info(
        f"Initializing process group on trainer #{rank} with ranks {TRAINER_RANKS}"
    )
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    gLogger.info(f"Waiting for shutdown signal on trainer #{rank}...")
    global shutdown_signal
    with shutdown_signal:
        shutdown_signal.wait()
    gLogger.info(f"Exiting the trainer #{rank}...")
    dist.destroy_process_group()
def test_ddp_dist_autograd_local_vs_remote_gpu(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    remote_layer1 = RemoteModule(
        remote_device="worker0/cpu", module_cls=nn.Linear, args=(10, 7, False)
    )
    layer1 = nn.Linear(10, 7, False)
    # Start with the same parameters for remote and local
    layer1.weight = remote_layer1.module_rref.to_here().weight

    layer2 = nn.Linear(7, 5).cuda(self.rank)
    ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

    remote_layer3 = RemoteModule(
        remote_device="worker0/cpu", module_cls=nn.Linear, args=(5, 3, False)
    )
    layer3 = nn.Linear(5, 3, False)
    # Start with the same parameters for remote and local
    layer3.weight = remote_layer3.module_rref.to_here().weight

    layer4 = nn.Linear(3, 1).cuda(self.rank)
    ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

    # Run local case.
    inputs = torch.rand((10, 10))
    loss = ddp_layer4(
        layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(self.rank)
    ).sum()
    loss.backward()

    # Run remote case.
    with dist_autograd.context() as context_id:
        loss = ddp_layer4(
            remote_layer3(
                ddp_layer2(remote_layer1(inputs).cuda(self.rank)).cpu()
            ).cuda(self.rank)
        ).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        dist.barrier()
        self.assertEqual(
            layer1.weight.grad,
            rpc.rpc_sync(
                "worker0",
                DdpComparisonTest.get_remote_grads,
                args=(remote_layer1.module_rref, context_id),
            ),
        )
        self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
        self.assertEqual(
            layer3.weight.grad,
            rpc.rpc_sync(
                "worker0",
                DdpComparisonTest.get_remote_grads,
                args=(remote_layer3.module_rref, context_id),
            ),
        )
        self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
def _run_basic_test(
    self, backend, checkpoint, find_unused_parameters=False, static_graph=False
):
    dist.init_process_group(
        backend=backend,
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    # Use 4 GPUs: one replica of the pipe spans GPUs 0 and 1, the other
    # spans GPUs 2 and 3. The two replicas are synchronized via DDP.
    fc1 = nn.Linear(16, 8, bias=False).cuda(2 * self.rank)

    class MyModule(nn.Module):
        def __init__(self, device):
            super(MyModule, self).__init__()
            self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
            self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)

        def forward(self, inp):
            if find_unused_parameters:
                return self.fc2(inp)
            else:
                return self.fc3(self.fc2(inp))

    layer2 = MyModule(2 * self.rank + 1)
    model = nn.Sequential(fc1, layer2)
    model = Pipe(model, chunks=2, checkpoint=checkpoint)
    model = DistributedDataParallel(
        model,
        find_unused_parameters=find_unused_parameters,
        static_graph=static_graph,
    )

    # Ensure inputs are different across ranks to verify that gradient
    # sync indeed occurs.
    model_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
    out = model(model_input).local_value()
    out.sum().backward()

    # Run forward again for find_unused_parameters to trigger any potential errors.
    if find_unused_parameters:
        # Ensure inputs are different across ranks to verify that gradient
        # sync indeed occurs.
        unused_param_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
        model(unused_param_input).local_value().sum().backward()

    # Run a few more iterations of fwd + bwd to ensure gradient synchronization
    # occurs properly across iterations via delay_all_reduce/bucketized allreduce.
    for _ in range(3):
        model_input = torch.rand(16, 16).cuda(2 * self.rank) * (self.rank + 1)
        out = model(model_input).local_value()
        out.sum().backward()

    # Check grads
    output = [
        torch.empty_like(fc1.weight.grad),
        torch.empty_like(fc1.weight.grad),
    ]
    dist.all_gather(output, fc1.weight.grad)
    self.assertEqual(output[0], output[1])

    output = [
        torch.empty_like(layer2.fc2.weight.grad),
        torch.empty_like(layer2.fc2.weight.grad),
    ]
    dist.all_gather(output, layer2.fc2.weight.grad)
    self.assertEqual(output[0], output[1])

    if not find_unused_parameters:
        output = [
            torch.empty_like(layer2.fc3.weight.grad),
            torch.empty_like(layer2.fc3.weight.grad),
        ]
        dist.all_gather(output, layer2.fc3.weight.grad)
        self.assertEqual(output[0], output[1])
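# A usage sketch: concrete test methods are assumed to drive _run_basic_test
# with a backend and one of Pipe's checkpoint modes ("never", "always", or
# "except_last"); the test names below are illustrative, not the suite's
# actual ones:
#
#     def test_basic_nccl_ckpt_never(self):
#         self._run_basic_test("nccl", "never")
#
#     def test_basic_nccl_ckpt_never_find_unused(self):
#         self._run_basic_test("nccl", "never", find_unused_parameters=True)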