def step(self, autograd_ctx_id):
    all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
    with _LocalOptimizer.global_lock:
        for param, grad in all_local_grads.items():
            param.grad = grad
        self.optim.step()
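# For context: the step() above belongs to a per-worker local optimizer object.
# Below is a minimal sketch of the surrounding class, assuming it wraps a
# regular torch.optim optimizer over parameters held in local RRefs; the class
# layout and constructor signature are assumptions, not verbatim library source.
import threading


class _LocalOptimizer:
    # Serialize step() calls on this worker: param.grad is shared state and
    # concurrent RPC-invoked steps must not interleave their writes.
    global_lock = threading.Lock()

    def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
        # Materialize the locally owned parameters from their RRefs and hand
        # them to a standard optimizer such as torch.optim.SGD.
        self._local_params = [rref.local_value() for rref in local_params_rref]
        self.optim = optim_cls(self._local_params, *args, **kwargs)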
def test_restore_context_after_switch_to_jit_thread(self):
    if self.rank != 0:
        return

    @torch.jit.script
    def forward_script(
        context_id: int, dst_worker_name: str, t1: Tensor, t2: Tensor
    ) -> Tuple[Tensor, Tensor]:
        res1_fut = rpc.rpc_async(dst_worker_name, local_add, (t1, t1))
        res1 = res1_fut.wait()  # After this, the script runs in a new JIT thread.
        loss1 = res1.sum()

        # SendRpcBackward is not attached, since DistAutogradContext is lost here.
        res2_fut = rpc.rpc_async(dst_worker_name, local_add, (t2, t2))
        res2 = res2_fut.wait()
        loss2 = res2.sum()

        return loss1, loss2

    with dist_autograd.context() as context_id:
        t1 = torch.ones((2, 3), requires_grad=True)
        t2 = torch.ones((2, 3), requires_grad=True)
        dst_worker_name = worker_name((self.rank + 1) % self.world_size)
        loss0, loss1 = forward_script(context_id, dst_worker_name, t1, t2)
        dist_autograd.backward(context_id, [loss0, loss1])
        grad0, grad1 = dist_autograd.get_gradients(context_id)
        self.assertEqual(grad0, grad1)
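# The scripted function above calls a local_add helper on the remote worker;
# it is defined elsewhere in the test module. A plausible TorchScript version,
# reconstructed only from the call sites (t1, t1) and (t2, t2) and therefore
# an assumption rather than the exact fixture:
from torch import Tensor


@torch.jit.script
def local_add(t1: Tensor, t2: Tensor) -> Tensor:
    return torch.add(t1, t2)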
def test_ddp_dist_autograd_local_vs_remote(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method="file://{}".format(self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    remote_layer1 = RemoteModule("worker0", nn.Linear, args=(10, 5, False))
    layer1 = nn.Linear(10, 5, False)
    # Start with the same parameters for remote and local
    layer1.weight = remote_layer1.module_rref.to_here().weight

    # Run local case.
    layer2 = nn.Linear(5, 1)
    inputs = torch.rand((10, 10))
    ddp_model = DistributedDataParallel(layer2)
    loss = ddp_model(layer1(inputs)).sum()
    loss.backward()

    # Run remote case.
    with dist_autograd.context() as context_id:
        loss = ddp_model(remote_layer1(inputs)).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        dist.barrier()
        self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
        self.assertEqual(
            layer1.weight.grad,
            rpc.rpc_sync(
                "worker0",
                DdpComparisonTest.get_remote_grads,
                args=(remote_layer1.module_rref, context_id),
            ),
        )
def test_ddp_dist_autograd_sparse_grads(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    model = nn.EmbeddingBag(10, 3, sparse=True)
    ddp_model = DistributedDataParallel(model)

    # Different inputs for each
    input = torch.LongTensor(10).random_(0, 10)
    offsets = torch.LongTensor([0, 4])

    # Run local.
    loss = ddp_model(input, offsets).sum()
    loss.backward()

    with dist_autograd.context() as context_id:
        loss = ddp_model(input, offsets).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        self.assertEqual(1, len(grads_dict))
        self.assertEqual(model.weight.grad, grads_dict[model.weight])
def _test_backward_rref(self, callee, rref_owner):
    local_grads = None
    t1 = torch.ones((3, 3), requires_grad=True)
    t2 = torch.zeros((3, 3), requires_grad=True)

    local_ret = torch.add(t1, t2)
    local_ret.sum().backward()
    with dist_autograd.context() as context_id:
        rref_t1 = rpc.remote(
            rref_owner, _torch_ones, args=((3, 3),), kwargs={"requires_grad": True}
        )
        if callee == rref_owner:
            rref = rpc.remote(callee, my_rref_add, args=(rref_t1, t2))
        else:
            rref = rpc.remote(
                callee, my_nested_rref_add, args=(rref_owner, rref_t1, t2)
            )
        ret = rref.to_here().wait()
        dist_autograd.backward(context_id, [ret.sum()])

        # verify grads on caller
        grads = dist_autograd.get_gradients(context_id)
        self.assertIn(t2, grads)
        self.assertEqual(grads[t2], t2.grad)

        # verify grads on rref owner
        self.assertTrue(
            rpc.rpc_sync(
                rref_owner,
                _compare_owner_value,
                args=(context_id, rref_t1, t1.grad),
            )
        )
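# _torch_ones and my_rref_add are defined elsewhere in the test module
# (_compare_owner_value appears later in this listing). Sketches of the two
# helpers, reconstructed only from how they are called above and therefore
# assumptions rather than the exact fixtures:
def _torch_ones(sizes, requires_grad=False):
    # Runs on the RRef owner; creates the tensor whose gradient the caller
    # later verifies via _compare_owner_value.
    return torch.ones(sizes, requires_grad=requires_grad)


def my_rref_add(rref_t1, t2):
    # Runs on the callee, which is also the RRef owner in this branch, so the
    # RRef's value is available locally.
    return torch.add(rref_t1.local_value(), t2)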
def _run_test_ddp_comparison(self, simulate_uneven_inputs=False):
    gLogger.info(f"Running trainer rank: {self.rank}")

    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method="file://{}".format(self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )
    net = nn.Linear(2, 3)
    ddp_net = DistributedDataParallel(net)

    # Odd ranks join early if simulate_uneven_inputs.
    num_inputs = 1
    if simulate_uneven_inputs:
        if self.rank % 2 == 0:
            num_inputs += 2
    inputs_list = [torch.rand((3, 2)) for _ in range(num_inputs)]

    if simulate_uneven_inputs:
        gLogger.info(f"Rank {self.rank} training with {len(inputs_list)} inputs.")

    # Use distributed autograd. The gradients will be in the RPC context map.
    grads_dict = {}
    with ddp_net.join(simulate_uneven_inputs):
        for i, inputs in enumerate(inputs_list):
            with dist_autograd.context() as context_id:
                loss = ddp_net(inputs).norm()
                dist_autograd.backward(context_id, [loss])
                grads_dict = dist_autograd.get_gradients(context_id)
            gLogger.info(f"Trainer #{self.rank} got grad dict: {grads_dict}")

            # Use local autograd. The gradients will be in each variable's '.grad'.
            ddp_net.zero_grad()
            loss = ddp_net(inputs).norm()
            loss.backward()

            # The gradients should be the same.
            for param in net.parameters():
                self.assertTrue(
                    param in grads_dict,
                    msg=f"Param {param} is not in dist_autograd dict {grads_dict} "
                    f"for iteration {i}",
                )
                self.assertEqual(
                    grads_dict[param],
                    param.grad,
                    msg=f"The grads for param {param} are different under local "
                    f"and dist autograd: {param.grad} \n---\n {grads_dict[param]} "
                    f"for iteration {i}",
                )
    dist.destroy_process_group()
def step(self, autograd_ctx_id: int):
    all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
    # apply functional optimizer step with a list of gradients
    grads: List[Optional[Tensor]] = [
        all_local_grads[p] if p in all_local_grads else None
        for p in self._local_params
    ]

    self.optim.step(grads)
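# This variant hands gradients to a functional (TorchScript-friendly)
# optimizer instead of writing them into param.grad. A rough sketch of how
# self._local_params and self.optim could be wired up; the class name and
# constructor shape are assumptions, not the library's actual API.
class _ScriptLocalOptimizer(nn.Module):
    def __init__(self, functional_optim_cls, local_params_rref, *args, **kwargs):
        super().__init__()
        # Locally owned parameters in a fixed order; the gradient list built
        # in step() above follows this same order.
        self._local_params = [rref.local_value() for rref in local_params_rref]
        # A functional optimizer (e.g. a functional SGD) that takes gradients
        # explicitly rather than reading them from param.grad.
        self.optim = functional_optim_cls(self._local_params, *args, **kwargs)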
def get_dist_gradients(self, cid):
    grads = dist_autograd.get_gradients(cid)
    # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors.
    # Tensors must be moved in and out of GPU memory due to this.
    cpu_grads = {}
    for k, v in grads.items():
        k_cpu, v_cpu = k.to("cpu"), v.to("cpu")
        cpu_grads[k_cpu] = v_cpu
    return cpu_grads
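# Assumed usage from another worker (ps_rref is a hypothetical RRef to the
# object defining get_dist_gradients): once the gradients have been copied to
# CPU they can be returned across the RPC boundary.
cpu_grads = ps_rref.rpc_sync().get_dist_gradients(cid)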
def test_jit_fork_within_context(self):
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        dst_worker_name = worker_name((self.rank + 1) % self.world_size)
        res = fork_add(t1, t2, dst_worker_name)
        loss = res.sum()
        dist_autograd.backward(context_id, [loss])

        grads = dist_autograd.get_gradients(context_id)
        self.assertEqual(2, len(grads))
        self.assertIn(t1, grads)
        self.assertIn(t2, grads)
def backward(devices):
    device = devices[0].split("/")[1]
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4).to(device)

    model = [
        RemoteModuleParams(nn.Linear, (4, 4), {}),
        RemoteModuleParams(nn.ReLU, (), {}),
    ]
    pipe = create_sequence_pipeline(
        model, balance=[1, 1], chunks=4, devices=devices[:2]
    )
    with dist_autograd.context() as context_id:
        y = pipe(x)
        loss = criterion(y, rpc.RRef(x))
        loss.backward(context_id)
        grads = dist_autograd.get_gradients(context_id)
        assert len(grads) == 2
def train_batch(
    self,
    mini_batch: FeatureSet,
    trainer_has_less_inputs: bool,
    simulate_uneven_inputs: bool,
):
    grads_dict = None

    if not simulate_uneven_inputs:
        input_batches = [mini_batch]
    else:
        # Split into microbatches, and trim to simulate uneven inputs.
        dense_features = mini_batch.dense_features
        sparse_features = mini_batch.sparse_features
        values = mini_batch.values

        dense_microbatch = torch.split(dense_features, 2)
        sparse_microbatch = torch.split(sparse_features, 2)
        values_microbatch = torch.split(values, 2)
        batches = []
        for d, s, v in zip(dense_microbatch, sparse_microbatch, values_microbatch):
            feature_set = FeatureSet(dense_features=d, sparse_features=s, values=v)
            batches.append(feature_set)

        if trainer_has_less_inputs:
            input_batches = batches[: len(batches) // 2]
            gLogger.info(
                f"Trainer reduced input batches from {len(batches)} "
                f"to {len(input_batches)} to simulate uneven inputs."
            )
        else:
            input_batches = batches

    with self.hybrid_module.join() if simulate_uneven_inputs else contextlib.suppress():
        for b in input_batches:
            with dist_autograd.context() as context_id:
                output = self.hybrid_module.forward(b)
                loss = (output * mini_batch.values).sum()
                dist_autograd.backward(context_id, [loss])
                grads_dict = dist_autograd.get_gradients(context_id)
                gLogger.info(
                    f"Loss is {loss} for mini batch: {mini_batch}. "
                    f"Grads dict has {len(grads_dict)} entries: {grads_dict}"
                )
    return (
        tuple(grads_dict[param] for param in self.ddp_params),
        tuple(grads_dict[param] for param in self.non_ddp_params),
    )
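# FeatureSet is the mini-batch container consumed by the train_batch methods
# in this listing. A sketch based on the attribute accesses above; the exact
# field types are assumptions.
from typing import NamedTuple


class FeatureSet(NamedTuple):
    # Dense float features; split along dim 0 when simulating uneven inputs.
    dense_features: torch.Tensor
    # Sparse (id) features, e.g. indices fed to an embedding bag.
    sparse_features: torch.LongTensor
    # Per-example target values multiplied into the loss.
    values: torch.Tensor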
def test_backward_without_rpc(self):
    dst_rank = self.rank
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        t3 = torch.add(t1, t2)

        dist_autograd.backward(context_id, [t3.sum()])
        grads = dist_autograd.get_gradients(context_id)
        self.assertEqual(2, len(grads))
        self.assertIn(t1, grads)
        self.assertIn(t2, grads)
        self.assertEqual(torch.ones(3, 3), grads[t1])
        self.assertEqual(torch.ones(3, 3), grads[t2])
def train_batch(self, mini_batch: FeatureSet):
    grads_dict = None
    with dist_autograd.context() as context_id:
        output = self.hybrid_module.forward(mini_batch)
        loss = (output * mini_batch.values).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        gLogger.info(
            f"Loss is {loss} for mini batch: {mini_batch}. "
            f"Grads dict has {len(grads_dict)} entries: {grads_dict}"
        )
    return (
        tuple(grads_dict[param] for param in self.ddp_params),
        tuple(grads_dict[param] for param in self.non_ddp_params),
    )
def backward(devices):
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4)

    model = [("linear1", nn.Linear, (4, 4), {}), ("relu", nn.ReLU, (), {})]
    pipe = MultiProcessPipe(model, balance=[1, 1], chunks=4, devices=devices[:2])
    with dist_autograd.context() as context_id:
        y = pipe(x)
        loss = criterion(y, rpc.RRef(x))
        loss.backward(context_id)
        grads = dist_autograd.get_gradients(context_id)
        assert len(grads) == 2
def _verify_backwards_remote(self, tensors, context_id, local_grads, *args):
    dist_autograd.backward(context_id, tensors)

    # Verify grads were accumulated appropriately.
    grads = dist_autograd.get_gradients(context_id)
    nargs = len(args)
    ngrads = 0
    for i in range(0, nargs):
        if local_grads[i] is not None:
            self.assertIn(args[i], grads)
            self.assertEqual(local_grads[i], grads[args[i]])
            ngrads += 1
        else:
            self.assertNotIn(args[i], grads)

    self.assertEqual(ngrads, len(grads))
def test_trainer_ps(self):
    local_grads = None
    t1 = torch.ones((3, 3), requires_grad=True)
    t2 = torch.zeros((3, 3), requires_grad=True)

    local_ret = torch.add(t1, t2)
    local_ret.sum().backward()

    # create rref on self
    # TODO: simplify this once we support rpc to self
    self_name = "worker{}".format(self.rank)
    rref_t1 = rpc.rpc_sync(
        "worker{}".format(self._next_rank()),
        _create_ones_rref_on,
        args=(self_name, (3, 3)),
    )

    # kick off forward and backward pass on three other workers (trainers)
    rank_diffs = [1, 2, 3]
    futures = []
    for rank_diff in rank_diffs:
        futures.append(
            rpc.rpc_async(
                "worker{}".format((self.rank + rank_diff) % self.world_size),
                _run_trainer,
                args=(rref_t1, t2, self_name, rank_diff),
            )
        )

    # check that the trainers are done with their backward pass
    for rank_diff in rank_diffs:
        self._check_rpc_done(rank_diff)

    # trainers are done and holding the context for verification
    accumulate_grad_func = None
    for rank_diff in rank_diffs:
        # make sure grads are accumulated for the same tensors and values
        # are all correct
        ctx_id = ctx_ids[rank_diff]
        grads = dist_autograd.get_gradients(ctx_id)
        local_t1 = rref_t1.local_value().wait()
        self.assertIn(local_t1, grads)
        self.assertEqual(grads[local_t1], t1.grad)

    # unblock trainers
    _set_rpc_done(None, 0)

    # wait until all trainers are done
    for fut in futures:
        fut.wait()
def test_ddp_comparison(self):
    gLogger.info(f"Running trainer rank: {self.rank}")

    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method="file://{}".format(self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )
    net = nn.Linear(2, 3)
    ddp_net = DistributedDataParallel(net)
    inputs = torch.rand((3, 2))

    # Use distributed autograd. The gradients will be in the RPC context map.
    grads_dict = {}
    with dist_autograd.context() as context_id:
        loss = ddp_net(inputs).norm()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
    gLogger.info(f"Trainer #{self.rank} got grad dict: {grads_dict}")

    # Use local autograd. The gradients will be in each variable's '.grad'.
    loss = ddp_net(inputs).norm()
    loss.backward()

    # The gradients should be the same.
    for param in net.parameters():
        self.assertTrue(
            param in grads_dict,
            msg=f"Param {param} is not in dist_autograd dict {grads_dict}",
        )
        self.assertEqual(
            grads_dict[param],
            param.grad,
            msg=f"The grads for param {param} are different under local "
            f"and dist autograd: {param.grad} \n---\n {grads_dict[param]}",
        )

    dist.destroy_process_group()
def get_remote_grads(rref, context_id):
    return dist_autograd.get_gradients(context_id)[rref.local_value().weight]
def dist_get_gradients(context_id: int) -> Dict[Tensor, Tensor]:
    return dist_autograd.get_gradients(context_id)
def _compare_owner_value(context_id, rref, grad):
    grads = dist_autograd.get_gradients(context_id)
    return torch.equal(grads[rref.local_value().wait()], grad)
def dist_get_gradients(context_id):
    # type: (int) -> Dict[Tensor, Tensor]
    return dist_autograd.get_gradients(context_id)
def test_ddp_dist_autograd_local_vs_remote_gpu(self):
    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(self.rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=self.rank,
    )

    remote_layer1 = RemoteModule(
        remote_device="worker0/cpu", module_cls=nn.Linear, args=(10, 7, False)
    )
    layer1 = nn.Linear(10, 7, False)
    # Start with the same parameters for remote and local
    layer1.weight = remote_layer1.module_rref.to_here().weight

    layer2 = nn.Linear(7, 5).cuda(self.rank)
    ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

    remote_layer3 = RemoteModule(
        remote_device="worker0/cpu", module_cls=nn.Linear, args=(5, 3, False)
    )
    layer3 = nn.Linear(5, 3, False)
    # Start with the same parameters for remote and local
    layer3.weight = remote_layer3.module_rref.to_here().weight

    layer4 = nn.Linear(3, 1).cuda(self.rank)
    ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

    # Run local case.
    inputs = torch.rand((10, 10))
    loss = ddp_layer4(
        layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(self.rank)
    ).sum()
    loss.backward()

    # Run remote case.
    with dist_autograd.context() as context_id:
        loss = ddp_layer4(
            remote_layer3(
                ddp_layer2(remote_layer1(inputs).cuda(self.rank)).cpu()
            ).cuda(self.rank)
        ).sum()
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        dist.barrier()
        self.assertEqual(
            layer1.weight.grad,
            rpc.rpc_sync(
                "worker0",
                CommonDdpComparisonTest.get_remote_grads,
                args=(remote_layer1.module_rref, context_id),
            ),
        )
        self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
        self.assertEqual(
            layer3.weight.grad,
            rpc.rpc_sync(
                "worker0",
                CommonDdpComparisonTest.get_remote_grads,
                args=(remote_layer3.module_rref, context_id),
            ),
        )
        self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
def get_dist_gradients(self, cid):
    return dist_autograd.get_gradients(cid)