def _ddp_init_helper(self):
    """
    Initialization helper function that does the following:
    (1) replicating the module from device[0] to the other devices
    (2) bucketing the parameters for reductions
    (3) resetting the bucketing states
    (4) registering the grad hooks
    (5) passing a handle of DDP to SyncBatchNorm Layer
    """
    if len(self.device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        # Replica 0 is the source module itself, not a detached copy.
        self._module_copies[0] = self.module

        # Detached replicas lose requires_grad; restore it from the source params.
        for replica in self._module_copies[1:]:
            for src_param, replica_param in zip(self.module.parameters(),
                                               replica.parameters()):
                replica_param.requires_grad = src_param.requires_grad
    else:
        self._module_copies = [self.module]

    # Per-device views of parameter / buffer storage.
    num_devices = len(self.device_ids)
    self.modules_params_data = [[] for _ in range(num_devices)]
    self.modules_buffers_data = [[] for _ in range(num_devices)]
    for dev_idx, replica in enumerate(self._module_copies):
        self.modules_params_data[dev_idx] = [p.data for p in replica.parameters()]
        self.modules_buffers_data[dev_idx] = [b.data for b in replica.buffers()]

    # Only parameters that require grad participate in reduction.
    param_list = [
        [p for p in replica.parameters() if p.requires_grad]
        for replica in self._module_copies
    ]

    # The bucket size limit is specified in the constructor.
    # Additionally, we allow for a single small bucket for parameters
    # that are defined first, such that their gradients don't spill into
    # a much larger bucket, adding unnecessary latency after gradient
    # computation finishes. Experiments showed 1MB is a reasonable value.
    bucket_indices = dist._compute_bucket_assignment_by_size(
        param_list[0], [1024 * 1024, self.bucket_bytes_cap])

    # Note: reverse list of buckets because we want to approximate the
    # order in which their gradients are produced, and assume they
    # are used in the forward pass in the order they are defined.
    self.reducer = dist.Reducer(param_list, list(reversed(bucket_indices)),
                                self.process_group)

    # passing a handle to torch.nn.SyncBatchNorm layer
    self._passing_sync_batchnorm_handle(self._module_copies)
def test_multi_limit_single_dtype(self):
    """Two bucket limits, one dtype: the first (small) limit applies only to
    the first bucket, the second to all later buckets."""
    # Four identical fp32 tensors of 10 elements (40 bytes each).
    tensors = [torch.empty([10], dtype=torch.float) for _ in range(4)]
    result = dist._compute_bucket_assignment_by_size(tensors, [40, 80])
    self.assertEqual([[0], [1, 2], [3]], result)
def test_single_limit_multi_dtype(self):
    """One bucket limit, two dtypes: tensors of different dtypes must land in
    different buckets even when they would fit together by size."""
    tensors = []
    for _ in range(3):
        tensors.append(torch.empty([50], dtype=torch.float))
        tensors.append(torch.empty([25], dtype=torch.double))
    result = dist._compute_bucket_assignment_by_size(tensors, [400])
    self.assertEqual([[0, 2], [1, 3], [4], [5]], result)
def test_single_limit_single_dtype(self):
    """One bucket limit, one dtype: each tensor reaches the 400-byte cap on
    its own, so every tensor gets its own bucket."""
    element_counts = [100, 200, 100, 50]
    tensors = [torch.empty([n], dtype=torch.float) for n in element_counts]
    result, per_bucket_size_limits = dist._compute_bucket_assignment_by_size(
        tensors, [400])
    # Every bucket should report the single configured limit.
    for size_limit in per_bucket_size_limits:
        self.assertEqual(400, size_limit)
    self.assertEqual([[0], [1], [2], [3]], result)
def test_multi_limit_multi_dtype(self):
    """Two bucket limits, two dtypes: the small first limit yields one
    singleton bucket per dtype, then the larger limit groups the rest."""
    tensors = []
    for _ in range(3):
        tensors.extend([
            torch.empty([50], dtype=torch.float),
            torch.empty([25], dtype=torch.double),
        ])
    result, per_bucket_size_limits = dist._compute_bucket_assignment_by_size(
        tensors, [200, 400])
    self.assertEqual([[0], [1], [2, 4], [3, 5]], result)
    self.assertEqual(per_bucket_size_limits, [200, 200, 400, 400])
def _ddp_init_helper(self):
    """
    Initialization helper function that does the following:
    (1) replicating the module from device[0] to the other devices
    (2) bucketing the parameters for reductions
    (3) resetting the bucketing states
    (4) registering the grad hooks
    (5) passing a handle of DDP to SyncBatchNorm Layer
    """
    # NOTE(review): the original code named this helper `parameters` and later
    # rebound that same name to a plain list; it only worked because every call
    # to the helper happened before the rebinding. Renamed to avoid the
    # shadowing (the helper's inner loop also shadowed its own argument `m`).
    def module_parameters(module, recurse=True):
        """Yield parameters of ``module``, preferring ``_former_parameters``
        (populated by ``replicate``) over ``parameters()`` so that replicas are
        traversed in the same order as the source module."""
        def params_of(m):
            if hasattr(m, "_former_parameters"):
                return m._former_parameters.values()
            return m.parameters(recurse=False)

        submodules = module.modules() if recurse else [module]
        for submodule in submodules:
            yield from params_of(submodule)

    if self.device_ids and len(self.device_ids) > 1:
        import warnings
        warnings.warn(
            "Single-Process Multi-GPU is not the recommended mode for "
            "DDP. In this mode, each DDP instance operates on multiple "
            "devices and creates multiple module replicas within one "
            "process. The overhead of scatter/gather and GIL contention "
            "in every forward pass can slow down training. "
            "Please consider using one DDP instance per device or per "
            "module replica by explicitly setting device_ids or "
            "CUDA_VISIBLE_DEVICES. ")

        # only create replicas for single-device CUDA modules
        #
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        # Replica 0 is the source module itself, not a detached copy.
        self._module_copies[0] = self.module
        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(),
                                         module_parameters(module_copy)):
                # Reducer requires param copies have the same strides across replicas.
                # Fixes up copy_param strides in case replicate didn't match param strides.
                if param.layout is torch.strided and param.stride() != copy_param.stride():
                    with torch.no_grad():
                        copy_param.set_(copy_param.clone().as_strided(
                            param.size(), param.stride()).copy_(copy_param))
                # Detached replicas lose requires_grad; restore it from the source.
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    self.modules_params = [list(module_parameters(m)) for m in self._module_copies]
    self.modules_buffers = [list(m.buffers()) for m in self._module_copies]

    # Build tuple of (module, parameter) for all parameters that require grads.
    modules_and_parameters = [
        [(submodule, parameter)
         for submodule in replica.modules()
         for parameter in module_parameters(submodule, recurse=False)
         if parameter.requires_grad]
        for replica in self._module_copies
    ]

    # Build list of parameters (one flat list per replica).
    parameters = [
        [parameter for _, parameter in replica]
        for replica in modules_and_parameters
    ]

    def produces_sparse_gradient(module):
        """Whether ``module`` will produce a sparse gradient (sparse embeddings)."""
        if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
            return module.sparse
        return False

    # Build list of booleans indicating whether or not to expect sparse
    # gradients for the corresponding parameters.
    expect_sparse_gradient = [
        [produces_sparse_gradient(module) for module, _ in replica]
        for replica in modules_and_parameters
    ]

    # The bucket size limit is specified in the constructor.
    # Additionally, we allow for a single small bucket for parameters
    # that are defined first, such that their gradients don't spill into
    # a much larger bucket, adding unnecessary latency after gradient
    # computation finishes. Experiments showed 1MB is a reasonable value.
    bucket_indices = dist._compute_bucket_assignment_by_size(
        parameters[0],
        [dist._DEFAULT_FIRST_BUCKET_BYTES, self.bucket_bytes_cap],
        expect_sparse_gradient[0])

    # Note: reverse list of buckets because we want to approximate the
    # order in which their gradients are produced, and assume they
    # are used in the forward pass in the order they are defined.
    self.reducer = dist.Reducer(
        parameters,
        list(reversed(bucket_indices)),
        self.process_group,
        expect_sparse_gradient,
        self.bucket_bytes_cap,
        self.find_unused_parameters)

    # passing a handle to torch.nn.SyncBatchNorm layer
    self._passing_sync_batchnorm_handle(self._module_copies)
def _ddp_init_helper(self):
    """
    Initialization helper function that does the following:
    (1) replicating the module from device[0] to the other devices
    (2) bucketing the parameters for reductions
    (3) resetting the bucketing states
    (4) registering the grad hooks
    (5) passing a handle of DDP to SyncBatchNorm Layer
    """
    if self.device_ids and len(self.device_ids) > 1:
        # only create replicas for single-device CUDA modules
        #
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        # Replica 0 is the source module itself, not a detached copy.
        self._module_copies[0] = self.module
        # Detached replicas lose requires_grad; restore it from the source params.
        for replica in self._module_copies[1:]:
            for src_param, replica_param in zip(self.module.parameters(),
                                               replica.parameters()):
                replica_param.requires_grad = src_param.requires_grad
    else:
        self._module_copies = [self.module]

    self.modules_params = [list(m.parameters()) for m in self._module_copies]
    self.modules_buffers = [list(m.buffers()) for m in self._module_copies]

    # Build tuple of (module, parameter) for all parameters that require grads.
    modules_and_parameters = [
        [(submodule, parameter)
         for submodule in replica.modules()
         for parameter in submodule.parameters(recurse=False)
         if parameter.requires_grad]
        for replica in self._module_copies
    ]

    # Build list of parameters (one flat list per replica).
    parameters = [
        [parameter for _, parameter in replica]
        for replica in modules_and_parameters
    ]

    def produces_sparse_gradient(module):
        """Whether ``module`` will produce a sparse gradient (sparse embeddings)."""
        if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
            return module.sparse
        return False

    # Build list of booleans indicating whether or not to expect sparse
    # gradients for the corresponding parameters.
    expect_sparse_gradient = [
        [produces_sparse_gradient(module) for module, _ in replica]
        for replica in modules_and_parameters
    ]

    # The bucket size limit is specified in the constructor.
    # Additionally, we allow for a single small bucket for parameters
    # that are defined first, such that their gradients don't spill into
    # a much larger bucket, adding unnecessary latency after gradient
    # computation finishes. Experiments showed 1MB is a reasonable value.
    bucket_indices = dist._compute_bucket_assignment_by_size(
        parameters[0],
        [1024 * 1024, self.bucket_bytes_cap],
        expect_sparse_gradient[0])

    # Note: reverse list of buckets because we want to approximate the
    # order in which their gradients are produced, and assume they
    # are used in the forward pass in the order they are defined.
    self.reducer = dist.Reducer(
        parameters,
        list(reversed(bucket_indices)),
        self.process_group,
        expect_sparse_gradient)

    # passing a handle to torch.nn.SyncBatchNorm layer
    self._passing_sync_batchnorm_handle(self._module_copies)