Example #1
from byteps.torch.ops import push_pull_async_inplace as byteps_push_pull
from byteps.torch.ops import rank, synchronize


def broadcast_parameters(params, root_rank):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `model.state_dict()`,
    `model.named_parameters()`, or `model.parameters()`.
    Arguments:
        params: One of the following:
            - list of parameters to broadcast
            - dict of parameters to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run synchronous broadcasts.
    for name, p in params:
        # Broadcast is implemented as push + pull in BytePS
        # To make it a real broadcast, we set the non-root tensors all 0.
        if rank() != root_rank:
            p.fill_(0)
        # Remember to disable averaging because we are doing broadcast
        if name:
            handle = byteps_push_pull(p,
                                      average=False,
                                      name="Parameter." + name)
        else:
            handle = byteps_push_pull(p, average=False)
        synchronize(handle)
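
The docstring above suggests broadcasting the freshly initialized model state from the root rank before training starts; a minimal usage sketch, assuming a running BytePS worker (the model here is just a placeholder):

import torch
import byteps.torch as bps

bps.init()
model = torch.nn.Linear(10, 2).cuda()  # placeholder model

# make every worker start from the parameters held by rank 0
bps.broadcast_parameters(model.state_dict(), root_rank=0)
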
Example #2
    def _push_pull_grad_async(self, p):
        # Look up the tensor's registered name, compress its gradient, and
        # launch an asynchronous push-pull; the caller synchronizes on the handle.
        name = self._parameter_names.get(p)
        tensor = p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)

        handle = byteps_push_pull(tensor_compressed, average=True, name="Gradient."+name)
        return handle, ctx
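
The self._compression object used above only needs compress/decompress methods that produce and consume a (tensor, context) pair. A no-op sketch of that interface for illustration (not the actual BytePS compression class):

import torch

class NoOpCompression:
    """Hypothetical stand-in matching the compress/decompress calls in these snippets."""

    @staticmethod
    def compress(tensor):
        # no real compression; ctx carries whatever decompress would need
        return tensor, None

    @staticmethod
    def decompress(tensor, ctx):
        return tensor

grad = torch.randn(8)
compressed, ctx = NoOpCompression.compress(grad)
assert torch.equal(NoOpCompression.decompress(compressed, ctx), grad)
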
Example #3
    def step(self, closure=None):
        if self._enable_async:
            old_weight_map = {}
            # store the weights before update
            for p, _ in self._handles.items():
                old_weight_map[p] = p.data.clone().detach()
            # update
            loss = super(self.__class__, self).step(closure)

            for p, (h, _) in self._handles.items():
                # get the diff for each weight (in-place)
                p.data.sub_(old_weight_map.get(p))
                if h is None:
                    # create the handler now
                    if self._is_tensor_instance:
                        name = self._parameter_names.get(p.__hash__())
                    else:
                        name = self._parameter_names.get(p)
                    handle = byteps_push_pull(p,
                                              average=False,
                                              name="AsyncParam." + name)
                    _, ctx = self._compression.compress(p)
                    self._handles[p] = (handle, ctx)

            self.synchronize()
            return loss
        else:
            self.synchronize()
            return super(self.__class__, self).step(closure)
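
In a training loop the wrapped optimizer is driven like any torch.optim optimizer; a hedged sketch in which optimizer is an instance of the distributed optimizer these methods belong to, and model, loss_fn and loader are placeholders:

for inputs, targets in loader:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()   # in non-async mode, backward hooks launch a push-pull per gradient
    optimizer.step()  # step() synchronizes the outstanding handles around the update
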
Example #4
    def synchronize(self):
        # missing_p = self._requires_update - set(self._handles.keys())
        # for p in missing_p:
        #     handle, ctx = self._push_pull_grad_async(p)
        #     self._handles[p] = (handle, ctx)

        # for p, value in self._handles.items():
        #     handle, ctx = value
        #     if handle is None:
        #         handle, ctx = self._push_pull_grad_async(p)
        #         self._handles[p] = (handle, ctx)
        # for p, (handle, _) in self._handles.items():
        #     output = synchronize(handle)
        #     self._push_pull_delay[p] = self.backward_passes_per_step
        #     if not self._enable_async:
        #         p.grad.set_(self._compression.decompress(output, ctx))
        # handle, ctx = self._push_pull_grad_async(self.whole_gradient)
        print("=======================================")
        print("Length of self.whole_gradient: %d" % len(self.whole_gradient))
        # compress the merged (flattened) gradient and push-pull it as a single tensor
        tensor_compressed, ctx = self._compression.compress(self.whole_gradient)
        handle = byteps_push_pull(tensor_compressed, average=True, name="Whole.Gradient")
        output = synchronize(handle)
        output = self._compression.decompress(output, ctx)
        # for param_group in self.param_groups:
        #     for p in param_group['params']:
        #         if p.requires_grad:
        # scatter the averaged result back into each parameter's .grad
        for name, p in self._named_parameters.items():
            d_p = self._pull_from_buffer(name).view(p.data.shape)
            p.grad.set_(d_p)
        # reset the merge buffers for the next iteration
        self.whole_gradient = torch.tensor([]).cuda()
        self._merged_parameter_offsets = {}
        self._merged_parameter_index = {}
        self._handles.clear()
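
The synchronize() above merges all gradients into one flat tensor (self.whole_gradient), push-pulls it once, and scatters the result back per parameter via _pull_from_buffer. A self-contained sketch of that flatten/slice-back pattern (the names below are illustrative, not the class's real helpers):

import torch

grads = {"weight": torch.randn(3, 4), "bias": torch.randn(4)}

# flatten: remember each parameter's (offset, length) in the merged tensor
offsets, cursor, chunks = {}, 0, []
for name, g in grads.items():
    offsets[name] = (cursor, g.numel())
    chunks.append(g.reshape(-1))
    cursor += g.numel()
merged = torch.cat(chunks)

# ... here merged would be push-pulled as a single tensor ...

# slice back: recover each per-parameter gradient from the merged result
for name, g in grads.items():
    start, length = offsets[name]
    restored = merged[start:start + length].view(g.shape)
    assert torch.equal(restored, g)
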
Example #5
    def _push_pull_grad_async(self, p):
        if self._is_tensor_instance:
            name = self._parameter_names.get(p.__hash__())
        else:
            name = self._parameter_names.get(p)
        if self._enable_async:
            tensor = p
            _, ctx = self._compression.compress(tensor)
            handle = byteps_push_pull(p,
                                      average=False,
                                      name="AsyncParam." + name)
        else:
            tensor = p.grad
            tensor_compressed, ctx = self._compression.compress(tensor)
            handle = byteps_push_pull(tensor_compressed,
                                      average=True,
                                      name="Gradient." + name)
        return handle, ctx
Example #6
    def _push_pull_grad_async(self, p):
        if self._is_tensor_instance:
            name = self._parameter_names.get(p.__hash__())
        else:
            name = self._parameter_names.get(p)
        if self._enable_async:
            # the real handle will be created in step()
            handle, ctx = None, None
        else:
            tensor = p.grad
            tensor_compressed, ctx = self._compression.compress(tensor)
            handle = byteps_push_pull(tensor_compressed, average=True, name="Gradient." + name)
        return handle, ctx
Example #7
    def _push_pull_grad_async(self, p):
        if self._is_tensor_instance:
            name = self._parameter_names.get(p.__hash__())
            fp16_p = self._fp32_to_fp16_map.get(p.__hash__())
        else:
            name = self._parameter_names.get(p)
            fp16_p = self._fp32_to_fp16_map.get(p)
        # push-pull the fp16 copy's gradient, not the fp32 master's
        tensor = fp16_p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)
        # assign each gradient a priority the first time it is seen (registration order)
        if fp16_p not in self.priorities:
            self.priorities[fp16_p] = self.gradient_count
            self.gradient_count += 1
        handle = byteps_push_pull(tensor_compressed, average=False,
                                  name="Gradient." + name,
                                  priority=self.priorities[fp16_p])
        return handle, ctx
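
The method above assumes a mixed-precision setup in which every fp32 master parameter maps to an fp16 copy whose gradient is actually push-pulled. A small illustrative sketch of building such a map (an assumption about the surrounding setup, not the class's real initialization code):

import torch

fp16_params = [torch.nn.Parameter(torch.randn(4, 4).half())]
# fp32 master copies that the optimizer itself would update
fp32_params = [p.detach().clone().float().requires_grad_(True) for p in fp16_params]

# map each fp32 master to its fp16 twin, mirroring _fp32_to_fp16_map above
fp32_to_fp16_map = dict(zip(fp32_params, fp16_params))
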
Example #8
    def _push_pull_grad_async(self, p):
        """Call byteps API to push-pull gradient asynchronously
        Arguments:
            tensor: The tensor to push-pull.
            name: The name of the tensor.
        Returns:
            an push-pull handle and context
        """
        name = self._parameter_names.get(p)
        tensor = p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)

        self._locks[p].acquire()
        handle = byteps_push_pull(tensor_compressed, average=True, name="Gradient."+name)
        self._logger.debug("{} calls byteps_push_pull for {}".format(self._desc, self._parameter_names[p]))
        # Add to queue to poll completion
        self._event_queue.put((p, handle, ctx))
        return handle, ctx
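
The method above acquires a per-parameter lock and enqueues (p, handle, ctx) for a background poller. A hedged sketch of what such a consumer loop could look like, reusing synchronize and decompress from the snippets above (the method name and the shutdown sentinel are assumptions, not BytePS code):

    def _poll_loop(self):
        # hypothetical consumer matching the queue/lock protocol used above
        while True:
            item = self._event_queue.get()
            if item is None:  # assumed shutdown sentinel
                break
            p, handle, ctx = item
            output = synchronize(handle)  # block until the push-pull completes
            p.grad.set_(self._compression.decompress(output, ctx))
            self._locks[p].release()  # allow the optimizer to update p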