Example #1
    def step(self, autograd_ctx_id):
        all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)

        with _LocalOptimizer.global_lock:
            for param, grad in all_local_grads.items():
                param.grad = grad
            self.optim.step()
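
For reference, a per-node step() hook like the one above is normally driven by torch.distributed.optim.DistributedOptimizer from inside a distributed autograd context. The following sketch is adapted from the PyTorch distributed optimizer documentation; it assumes rpc.init_rpc() has already been called and that a peer named "worker1" exists.

import torch
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc
from torch import optim
from torch.distributed.optim import DistributedOptimizer

with dist_autograd.context() as context_id:
    # Forward pass: build the autograd graph across RPC boundaries.
    rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
    rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
    loss = rref1.to_here() + rref2.to_here()

    # The distributed backward pass records gradients in the context,
    # not in each tensor's .grad field.
    dist_autograd.backward(context_id, [loss.sum()])

    # Each worker-local optimizer then calls get_gradients(context_id)
    # inside its step(), as in the example above.
    dist_optim = DistributedOptimizer(optim.SGD, [rref1, rref2], lr=0.05)
    dist_optim.step(context_id)
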
Example #2
    def test_restore_context_after_swtich_to_jit_thread(self):
        if self.rank != 0:
            return

        @torch.jit.script
        def forward_script(
            context_id: int, dst_worker_name: str, t1: Tensor, t2: Tensor
        ) -> Tuple[Tensor, Tensor]:
            res1_fut = rpc.rpc_async(dst_worker_name, local_add, (t1, t1))
            res1 = res1_fut.wait()  # After this, the script runs in a new JIT thread.
            loss1 = res1.sum()

            # SendRpcBackward is not attached, since the DistAutogradContext is lost here.
            res2_fut = rpc.rpc_async(dst_worker_name, local_add, (t2, t2))
            res2 = res2_fut.wait()
            loss2 = res2.sum()

            return loss1, loss2

        with dist_autograd.context() as context_id:
            t1 = torch.ones((2, 3), requires_grad=True)
            t2 = torch.ones((2, 3), requires_grad=True)
            dst_worker_name = worker_name((self.rank + 1) % self.world_size)
            loss0, loss1 = forward_script(context_id, dst_worker_name, t1, t2)
            dist_autograd.backward(context_id, [loss0, loss1])
            grads = dist_autograd.get_gradients(context_id)
            self.assertEqual(grads[t1], grads[t2])
Example #3
    def test_ddp_dist_autograd_local_vs_remote(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(backend="gloo",
                                init_method="file://{}".format(self.file_name),
                                world_size=self.world_size,
                                rank=self.rank)

        remote_layer1 = RemoteModule("worker0", nn.Linear, args=(10, 5, False))
        layer1 = nn.Linear(10, 5, False)
        # Start with the same parameters for remote and local
        layer1.weight = remote_layer1.module_rref.to_here().weight

        # Run local case.
        layer2 = nn.Linear(5, 1)
        inputs = torch.rand((10, 10))
        ddp_model = DistributedDataParallel(layer2)
        loss = ddp_model(layer1(inputs)).sum()
        loss.backward()

        # Run remote case.
        with dist_autograd.context() as context_id:
            loss = ddp_model(remote_layer1(inputs)).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            dist.barrier()
            self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
            self.assertEqual(
                layer1.weight.grad,
                rpc.rpc_sync("worker0",
                             DdpComparisonTest.get_remote_grads,
                             args=(remote_layer1.module_rref, context_id)))
Example #4
    def test_ddp_dist_autograd_sparse_grads(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        model = nn.EmbeddingBag(10, 3, sparse=True)
        ddp_model = DistributedDataParallel(model)

        # Different inputs for each
        input = torch.LongTensor(10).random_(0, 10)
        offsets = torch.LongTensor([0, 4])

        # Run local.
        loss = ddp_model(input, offsets).sum()
        loss.backward()

        with dist_autograd.context() as context_id:
            loss = ddp_model(input, offsets).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            self.assertEqual(1, len(grads_dict))
            self.assertEqual(model.weight.grad, grads_dict[model.weight])
Example #5
    def _test_backward_rref(self, callee, rref_owner):
        local_grads = None
        t1 = torch.ones((3, 3), requires_grad=True)
        t2 = torch.zeros((3, 3), requires_grad=True)

        local_ret = torch.add(t1, t2)
        local_ret.sum().backward()
        with dist_autograd.context() as context_id:
            rref_t1 = rpc.remote(rref_owner,
                                 _torch_ones,
                                 args=((3, 3), ),
                                 kwargs={"requires_grad": True})

            if callee == rref_owner:
                rref = rpc.remote(callee, my_rref_add, args=(rref_t1, t2))
            else:
                rref = rpc.remote(callee,
                                  my_nested_rref_add,
                                  args=(rref_owner, rref_t1, t2))
            ret = rref.to_here()
            dist_autograd.backward(context_id, [ret.sum()])

            # verify grads on caller
            grads = dist_autograd.get_gradients(context_id)
            self.assertIn(t2, grads)
            self.assertEqual(grads[t2], t2.grad)

            # verify grads on rref owner
            self.assertTrue(
                rpc.rpc_sync(rref_owner,
                             _compare_owner_value,
                             args=(context_id, rref_t1, t1.grad)))
Example #6
    def _run_test_ddp_comparision(self, simulate_uneven_inputs=False):
        gLogger.info(f"Running trainer rank: {self.rank}")
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method="file://{}".format(self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )
        net = nn.Linear(2, 3)
        ddp_net = DistributedDataParallel(net)

        # Odd ranks join early if simulate_uneven_inputs.
        num_inputs = 1
        if simulate_uneven_inputs:
            if self.rank % 2 == 0:
                num_inputs += 2
        inputs_list = [torch.rand((3, 2)) for _ in range(num_inputs)]

        if simulate_uneven_inputs:
            gLogger.info(
                f"Rank {self.rank} training with {len(inputs_list)} inputs.")

        # Use distributed autograd. The gradients will be in RPC context map.
        grads_dict = {}
        with ddp_net.join(simulate_uneven_inputs):
            for i, inputs in enumerate(inputs_list):
                with dist_autograd.context() as context_id:
                    loss = ddp_net(inputs).norm()
                    dist_autograd.backward(context_id, [loss])
                    grads_dict = dist_autograd.get_gradients(context_id)
                gLogger.info(
                    f"Trainer #{self.rank} got grad dict: {grads_dict}")

                # Use local autograd. The gradients will be in each variable's '.grad'.
                ddp_net.zero_grad()
                loss = ddp_net(inputs).norm()
                loss.backward()

                # The gradients should be the same
                for param in net.parameters():
                    self.assertTrue(
                        param in grads_dict,
                        msg=f"Param {param} is not in the dist autograd grads dict "
                        f"{grads_dict} for iteration {i}",
                    )
                    self.assertEqual(
                        grads_dict[param],
                        param.grad,
                        msg=f"The grads for param {param} are different under local "
                        f"and dist autograd: {param.grad} \n---\n {grads_dict[param]} "
                        f"for iteration {i}",
                    )
        dist.destroy_process_group()
Example #7
    def step(self, autograd_ctx_id: int):
        all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
        # apply functional optimizer step with a list of gradients
        grads: List[Optional[Tensor]] = [
            all_local_grads[p] if p in all_local_grads else None
            for p in self._local_params
        ]

        self.optim.step(grads)
Example #8
    def get_dist_gradients(self, cid):
        grads = dist_autograd.get_gradients(cid)
        # This output is forwarded over RPC, which as of 1.5.0 only accepts CPU tensors.
        # Tensors must be moved in and out of GPU memory due to this.
        cpu_grads = {}
        for k, v in grads.items():
            k_cpu, v_cpu = k.to("cpu"), v.to("cpu")
            cpu_grads[k_cpu] = v_cpu
        return cpu_grads
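
Because the gradients are returned as CPU tensors, a caller holding an RRef to the owning object can fetch them with a synchronous RPC. The call below is illustrative only: ps_rref is an assumed rpc.RRef pointing at the instance that defines get_dist_gradients(), and the rref.rpc_sync() proxy requires PyTorch 1.6 or later.

# Illustrative call site; ps_rref and cid are assumptions, not part of the example above.
cpu_grads = ps_rref.rpc_sync().get_dist_gradients(cid)
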
Example #9
    def test_jit_fork_within_context(self):
        with dist_autograd.context() as context_id:
            t1 = torch.rand((3, 3), requires_grad=True)
            t2 = torch.rand((3, 3), requires_grad=True)
            dst_worker_name = worker_name((self.rank + 1) % self.world_size)
            res = fork_add(t1, t2, dst_worker_name)
            loss = res.sum()
            dist_autograd.backward(context_id, [loss])

            grads = dist_autograd.get_gradients(context_id)
            self.assertEqual(2, len(grads))
            self.assertIn(t1, grads)
            self.assertIn(t2, grads)
Example #10
def backward(devices):
    device = devices[0].split("/")[1]
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4).to(device)
    model = [RemoteModuleParams(nn.Linear, (4, 4), {}), RemoteModuleParams(nn.ReLU, (), {})]
    pipe = create_sequence_pipeline(model, balance=[1, 1], chunks=4, devices=devices[:2])
    with dist_autograd.context() as context_id:
        y = pipe(x)
        loss = criterion(y, rpc.RRef(x))
        loss.backward(context_id)
        grads = dist_autograd.get_gradients(context_id)
    assert len(grads) == 2
Example #11
    def train_batch(
        self,
        mini_batch: FeatureSet,
        trainer_has_less_inputs: bool,
        simulate_uneven_inputs: bool,
    ):
        grads_dict = None

        if not simulate_uneven_inputs:
            input_batches = [mini_batch]
        else:
            # Split into microbatches, and trim to simulate uneven inputs.
            dense_features = mini_batch.dense_features
            sparse_features = mini_batch.sparse_features
            values = mini_batch.values

            dense_microbatch = torch.split(dense_features, 2)
            sparse_microbatch = torch.split(sparse_features, 2)
            values_microbatch = torch.split(values, 2)
            batches = []
            for d, s, v in zip(dense_microbatch, sparse_microbatch,
                               values_microbatch):
                feature_set = FeatureSet(dense_features=d,
                                         sparse_features=s,
                                         values=v)
                batches.append(feature_set)

            if trainer_has_less_inputs:
                input_batches = batches[:len(batches) // 2]
                gLogger.info(
                    f"Trainer reduced input batches from {len(batches)} "
                    f"to {len(input_batches)} to simulate uneven inputs.")
            else:
                input_batches = batches

        with self.hybrid_module.join() if simulate_uneven_inputs else contextlib.suppress():
            for b in input_batches:
                with dist_autograd.context() as context_id:
                    output = self.hybrid_module.forward(b)
                    loss = (output * mini_batch.values).sum()
                    dist_autograd.backward(context_id, [loss])
                    grads_dict = dist_autograd.get_gradients(context_id)
                    gLogger.info(
                        f"Loss is {loss} for mini batch: {mini_batch}. "
                        f"Grads dict has {len(grads_dict)} entries: {grads_dict}"
                    )
        return (
            tuple(grads_dict[param] for param in self.ddp_params),
            tuple(grads_dict[param] for param in self.non_ddp_params),
        )
Example #12
    def test_backward_without_rpc(self):
        dst_rank = self.rank
        with dist_autograd.context() as context_id:
            t1 = torch.rand((3, 3), requires_grad=True)
            t2 = torch.rand((3, 3), requires_grad=True)
            t3 = torch.add(t1, t2)

            dist_autograd.backward(context_id, [t3.sum()])
            grads = dist_autograd.get_gradients(context_id)
            self.assertEqual(2, len(grads))
            self.assertIn(t1, grads)
            self.assertIn(t2, grads)
            self.assertEqual(torch.ones(3, 3), grads[t1])
            self.assertEqual(torch.ones(3, 3), grads[t2])
Example #13
    def train_batch(self, mini_batch: FeatureSet):
        grads_dict = None
        with dist_autograd.context() as context_id:
            output = self.hybrid_module.forward(mini_batch)
            loss = (output * mini_batch.values).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            gLogger.info(
                f"Loss is {loss} for mini batch: {mini_batch}. "
                f"Grads dict has {len(grads_dict)} entries: {grads_dict}")
        return (
            tuple(grads_dict[param] for param in self.ddp_params),
            tuple(grads_dict[param] for param in self.non_ddp_params),
        )
Example #14
def backward(devices):
    torch.random.manual_seed(3)
    criterion = DistributedLoss(torch.nn.MSELoss)
    x = torch.randn(8, 4)
    model = [("linear1", nn.Linear, (4, 4), {}), ("relu", nn.ReLU, (), {})]
    pipe = MultiProcessPipe(model,
                            balance=[1, 1],
                            chunks=4,
                            devices=devices[:2])
    with dist_autograd.context() as context_id:
        y = pipe(x)
        loss = criterion(y, rpc.RRef(x))
        loss.backward(context_id)
        grads = dist_autograd.get_gradients(context_id)
    assert len(grads) == 2
Example #15
    def _verify_backwards_remote(self, tensors, context_id, local_grads, *args):
        dist_autograd.backward(context_id, tensors)

        # Verify grads were accumulated appropriately.
        grads = dist_autograd.get_gradients(context_id)
        nargs = len(args)
        ngrads = 0
        for i in range(0, nargs):
            if local_grads[i] is not None:
                self.assertIn(args[i], grads)
                self.assertEqual(local_grads[i], grads[args[i]])
                ngrads += 1
            else:
                self.assertNotIn(args[i], grads)

        self.assertEqual(ngrads, len(grads))
Example #16
    def test_trainer_ps(self):
        local_grads = None
        t1 = torch.ones((3, 3), requires_grad=True)
        t2 = torch.zeros((3, 3), requires_grad=True)

        local_ret = torch.add(t1, t2)
        local_ret.sum().backward()

        # create rref on self
        # TODO: simplify this once we support rpc to self
        self_name = "worker{}".format(self.rank)
        rref_t1 = rpc.rpc_sync("worker{}".format(self._next_rank()),
                               _create_ones_rref_on,
                               args=(self_name, (3, 3)))

        # kick off forward and backward pass on three other workers (trainers)
        rank_diffs = [1, 2, 3]
        futures = []
        for rank_diff in rank_diffs:
            futures.append(
                rpc.rpc_async("worker{}".format(
                    (self.rank + rank_diff) % self.world_size),
                              _run_trainer,
                              args=(rref_t1, t2, self_name, rank_diff)))

        # check that the trainers are done with their backward pass
        for rank_diff in rank_diffs:
            self._check_rpc_done(rank_diff)

        # trainers are done and holding the context for verification
        accumulate_grad_func = None
        for rank_diff in rank_diffs:
            # make sure grads are accumulated for the same tensors and that the
            # values are all correct
            ctx_id = ctx_ids[rank_diff]
            grads = dist_autograd.get_gradients(ctx_id)
            local_t1 = rref_t1.local_value()
            self.assertIn(local_t1, grads)
            self.assertEqual(grads[local_t1], t1.grad)

        # unblock trainers
        _set_rpc_done(None, 0)

        # wait until all trainers are done
        for fut in futures:
            fut.wait()
Example #17
    def test_ddp_comparison(self):
        gLogger.info(f"Running trainer rank: {self.rank}")
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method="file://{}".format(self.file_name),
            world_size=self.world_size,
            rank=self.rank)
        net = nn.Linear(2, 3)
        ddp_net = DistributedDataParallel(net)
        inputs = torch.rand((3, 2))

        # Use distributed autograd. The gradients will be in RPC context map.
        grads_dict = {}
        with dist_autograd.context() as context_id:
            loss = ddp_net(inputs).norm()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
        gLogger.info(f"Trainer #{self.rank} got grad dict: {grads_dict}")

        # Use local autograd. The gradients will be in each variable's '.grad'.
        loss = ddp_net(inputs).norm()
        loss.backward()

        # The gradients should be the same
        for param in net.parameters():
            self.assertTrue(
                param in grads_dict,
                msg=f"Param {param} is not in dist_auto grad dict {grads_dict}",
            )
            self.assertEqual(
                grads_dict[param],
                param.grad,
                msg=f"The grads for param {param} are different under local "
                f"and dist autograd: {param.grad} \n---\n {grads_dict[param]}",
            )
        dist.destroy_process_group()
Example #18
    def get_remote_grads(rref, context_id):
        return dist_autograd.get_gradients(context_id)[rref.local_value().weight]
Example #19
def dist_get_gradients(context_id: int) -> Dict[Tensor, Tensor]:
    return dist_autograd.get_gradients(context_id)
Example #20
def _compare_owner_value(context_id, rref, grad):
    grads = dist_autograd.get_gradients(context_id)
    return torch.equal(grads[rref.local_value()], grad)
Example #21
def dist_get_gradients(context_id):
    # type: (int) -> Dict[Tensor, Tensor]
    return dist_autograd.get_gradients(context_id)
Example #22
    def test_ddp_dist_autograd_local_vs_remote_gpu(self):
        # Each trainer uses a different random seed. Otherwise, they are going
        # to have exactly the same initial model parameters, input, and
        # therefore grads. That means the grads will be the same before and
        # after DDP's all-reduce.
        torch.manual_seed(self.rank)
        dist.init_process_group(
            backend="gloo",
            init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
            world_size=self.world_size,
            rank=self.rank,
        )

        remote_layer1 = RemoteModule(
            remote_device="worker0/cpu", module_cls=nn.Linear, args=(10, 7, False)
        )
        layer1 = nn.Linear(10, 7, False)
        # Start with the same parameters for remote and local
        layer1.weight = remote_layer1.module_rref.to_here().weight

        layer2 = nn.Linear(7, 5).cuda(self.rank)
        ddp_layer2 = DistributedDataParallel(layer2, device_ids=[self.rank])

        remote_layer3 = RemoteModule(
            remote_device="worker0/cpu", module_cls=nn.Linear, args=(5, 3, False)
        )
        layer3 = nn.Linear(5, 3, False)
        # Start with the same parameters for remote and local
        layer3.weight = remote_layer3.module_rref.to_here().weight

        layer4 = nn.Linear(3, 1).cuda(self.rank)
        ddp_layer4 = DistributedDataParallel(layer4, device_ids=[self.rank])

        # Run local case.
        inputs = torch.rand((10, 10))
        loss = ddp_layer4(
            layer3(ddp_layer2(layer1(inputs).cuda(self.rank)).cpu()).cuda(self.rank)
        ).sum()
        loss.backward()

        # Run remote case.
        with dist_autograd.context() as context_id:
            loss = ddp_layer4(
                remote_layer3(
                    ddp_layer2(remote_layer1(inputs).cuda(self.rank)).cpu()
                ).cuda(self.rank)
            ).sum()
            dist_autograd.backward(context_id, [loss])
            grads_dict = dist_autograd.get_gradients(context_id)
            dist.barrier()
            self.assertEqual(
                layer1.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    CommonDdpComparisonTest.get_remote_grads,
                    args=(remote_layer1.module_rref, context_id),
                ),
            )
            self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
            self.assertEqual(
                layer3.weight.grad,
                rpc.rpc_sync(
                    "worker0",
                    CommonDdpComparisonTest.get_remote_grads,
                    args=(remote_layer3.module_rref, context_id),
                ),
            )
            self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
Example #23
    def get_dist_gradients(self, cid):
        return dist_autograd.get_gradients(cid)