Example #1
            def reduce(*_):
                # Skip the reduction if this gradient has already been handled; do not change status information
                if self._grad_reduced[index]:
                    assert param.grad is not None, "Parameter gradient cannot be None"

                    # Mark this gradient as reduced
                    self._grad_reduced[index] = False

                    # Clear the gradient that does not belong to the current rank through the callback function
                    def cleanup():
                        if dst_rank != self._rank:
                            param.clear_gradient(False)
                        elif self._offload:
                            self._sharding_optimizers[0]._offload_acc_grad(
                                param.name,
                                param.grad.cast(dtype=Type.fp32.value).cpu())
                            param.clear_gradient(False)

                    # Reduce the parameter gradient to its destination rank
                    self._tasks_flow.append(
                        Taskflow(task=dist.reduce(
                            tensor=param.grad,
                            dst=self._group.ranks[dst_rank],
                            group=self._group,
                            use_calc_stream=True),
                                 callback=cleanup))

                    # Multi stream operation will be supported later
                    dist.wait(tensor=param.grad,
                              group=self._group,
                              use_calc_stream=True)

                    # Clear the task flow and trigger callback to clear the redundant gradient
                    self._clear_task_flow()
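The reduce task queued above is wrapped in a Taskflow object whose callback frees gradients that do not belong to this rank, and _clear_task_flow() later drains that queue and fires the callbacks. A minimal, self-contained sketch of this pattern (the class layout and drain loop are assumptions inferred from how the snippet uses them, not the real implementation):

from collections import deque, namedtuple

# A queued communication task plus the cleanup to run once it has completed.
Taskflow = namedtuple("Taskflow", ["task", "callback"])

class TaskFlowQueueSketch:
    def __init__(self):
        self._tasks_flow = deque()

    def append(self, taskflow):
        self._tasks_flow.append(taskflow)

    def _clear_task_flow(self):
        # Consume the queued tasks and trigger their cleanup callbacks.
        while self._tasks_flow:
            task = self._tasks_flow.popleft()
            if task.callback is not None:
                task.callback()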
Example #2
        def allreduce_(*_):
            if param.name in self._task_flow.full_grad.keys():
                full_grad = self._task_flow.full_grad[param.name]
                # Only synchronous all-reduce of the current rank's layer is supported for now
                dist.all_reduce(tensor=full_grad,
                                group=self._group,
                                use_calc_stream=True)
                dist.wait(tensor=full_grad,
                          group=self._group,
                          use_calc_stream=True)

                start, end = self._param2buffer[param.name][self._rank]
                if param.bw_storage is None:
                    param.bw_storage = core.VarBase(
                        full_grad._slice(start, end)).detach().clone()
                    if self._offload:
                        param.bw_storage = _device2cpu(param.bw_storage, True)
                else:
                    if self._offload:
                        cpu_grad = _device2cpu(
                            core.VarBase(full_grad._slice(
                                start, end)).detach().clone(), True)
                        with device_guard(device="cpu"):
                            param.bw_storage = paddle.add(
                                param.bw_storage, cpu_grad)
                    else:
                        # param.bw_storage.add_(
                        #     core.VarBase(full_grad._slice(start, end))
                        #     .detach().clone())
                        param.bw_storage = paddle.add(
                            param.bw_storage,
                            core.VarBase(full_grad._slice(
                                start, end)).detach().clone())
                param.clear_gradient(False)
                param._gradient_set_empty(False)
                tmp_var = self._task_flow.full_grad.pop(param.name)
                tmp_var._clear()

            if param.name in self._task_flow.full_param.keys():
                if param.status == "all":
                    param.use_count = 0
                    param._clear()
                    start, end = self._param2buffer[param.name][self._rank]
                    param.fw_storage = core.VarBase(
                        self._task_flow.full_param[param.name]._slice(
                            start, end),
                        param.name + "@slice").detach().clone()
                    param.status = "part"
                    tmp_var = self._task_flow.full_param.pop(param.name)
                    tmp_var._clear()

                    if self._offload:
                        param.fw_storage._clear()
                        param.master_weight._share_buffer_to(param.fw_storage)
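The slice bounds come from self._param2buffer[param.name][self._rank]: each rank owns a contiguous [start, end) window of the flattened parameter. A hypothetical sketch of how such a table could be built, assuming the element count is simply split evenly across ranks (the real code may add alignment padding):

def build_param2buffer(params_numel, world_size):
    """params_numel: dict mapping parameter name -> number of elements.
    Returns name -> list of (start, end) slices, one per rank."""
    param2buffer = {}
    for name, numel in params_numel.items():
        per_rank = -(-numel // world_size)  # ceiling division
        param2buffer[name] = [(rank * per_rank, (rank + 1) * per_rank)
                              for rank in range(world_size)]
    return param2buffer

# Example: a 1000-element weight sharded across 4 ranks.
print(build_param2buffer({"linear_0.w_0": 1000}, 4))
# {'linear_0.w_0': [(0, 250), (250, 500), (500, 750), (750, 1000)]}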
Example #3
            def reduce(*_):
                # Skip the reduction if this gradient has already been handled; do not change status information
                if self._grad_reduced[index]:
                    assert param.grad is not None, "Parameter gradient cannot be None"

                    # Mark this gradient as reduced
                    self._grad_reduced[index] = False
                    grad_storage = self._grad_storages[param.dtype][dst_rank]
                    grad_storage.params_checked_in += 1

                    if grad_storage.all_checked_in:
                        assert grad_storage.buffer is not None

                        # Clearing up the grad_storage buffer
                        def cleanup():
                            if dst_rank != self._rank:
                                for p in grad_storage._params:
                                    p.clear_gradient(False)
                                    p._gradient_set_empty(False)

                                grad_storage.buffer.value().get_tensor(
                                )._clear()
                            elif self._offload:
                                grad_storage.to(device=self._offload_device)
                                for p in grad_storage._params:
                                    self._sharding_optimizers[
                                        0]._offload_acc_grad(
                                            p.name,
                                            p.grad.cast(dtype=Type.fp32.value))
                                    p.clear_gradient(False)
                                    p._gradient_set_empty(False)
                                grad_storage._device = self._default_device
                                grad_storage.buffer.value().get_tensor(
                                )._clear()

                        # Reduce the bucket
                        grad_storage.sent = True
                        self._tasks_flow.append(
                            Taskflow(task=dist.reduce(
                                tensor=grad_storage.buffer,
                                dst=self._group.ranks[
                                    grad_storage.destination],
                                group=self._group,
                                use_calc_stream=True),
                                     callback=cleanup))

                        # Multi stream operation will be supported later
                        dist.wait(tensor=grad_storage.buffer,
                                  group=self._group,
                                  use_calc_stream=True)

                    # Clear the task flow and trigger callback to clear the redundant gradient
                    self._clear_task_flow()
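The bucketed path above only reduces a grad_storage once every parameter that writes into it has checked in. A minimal sketch of that bookkeeping (an assumption based on the attributes the snippet touches, not the real GradStorage class):

class GradBucketSketch:
    def __init__(self, params, destination):
        self._params = list(params)
        self.destination = destination   # rank that owns this bucket
        self.buffer = None               # flat gradient buffer (omitted here)
        self.params_checked_in = 0
        self.sent = False

    @property
    def all_checked_in(self):
        # The bucket is ready to reduce once every owner parameter
        # has reported its gradient.
        return self.params_checked_in == len(self._params)

    def reset_checked_in(self):
        self.params_checked_in = 0
        self.sent = False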
Example #4
    def __sync_buffers(self):
        """
        Sync all the param buffers from all ranks (e.g. batch norm statistics).
        """

        for buffer in self._layer.buffers(include_sublayers=True):
            dist.broadcast(buffer,
                           self._global_root_rank,
                           self._group,
                           use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
Example #5
    def _sync_params_and_buffers(self):
        """
        Sync all model states for all ranks
        """

        for p in self._local_params:
            broadcast(p,
                      src=self._global_root_rank,
                      group=self.group,
                      use_calc_stream=True)

        # Multi stream operation will be supported later
        wait(tensor=p, group=self.group, use_calc_stream=True)
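Both sync routines above follow the same broadcast-then-wait pattern. A minimal standalone sketch, assuming the same Paddle release as these snippets (where the collectives still accept use_calc_stream) and a multi-GPU launch via paddle.distributed.launch:

import paddle
import paddle.distributed as dist

dist.init_parallel_env()

# Every rank starts with different values; after the broadcast all ranks
# hold rank 0's buffer, mirroring __sync_buffers/_sync_params_and_buffers.
buffer = paddle.full([8], float(dist.get_rank()), dtype="float32")
dist.broadcast(buffer, 0, use_calc_stream=True)

# Multi stream operation will be supported later (same caveat as above).
dist.wait(tensor=buffer, use_calc_stream=True)
print(dist.get_rank(), buffer.numpy())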
Example #6
    def _update_params(self):
        """
        Update parameters to optimizer memory slice.
        """
        update_list = []
        assert len(self._trainable_params.keys()) > 0
        current_layer_params = self._layer.parameters(include_sublayers=True)
        trainable_params = list(
            filter(lambda p: p.trainable and p not in self._unslice_params,
                   current_layer_params))
        # 1. Handle each param's slice
        for param in trainable_params:
            assert hasattr(
                param, "fw_storage"
            ), "Parameter {} does not have a fw_storage attribute".format(
                param.name)
            # Gradient average
            if self._offload:
                with device_guard(device="cpu"):
                    param.bw_storage.scale_(scale=self._world_size_scaling)
            else:
                param.bw_storage.scale_(scale=self._world_size_scaling)
            param.fw_storage = _VarBaseWrapper(param)
            assert param.fw_storage.grad is None
            param.fw_storage._copy_gradient_from(param.bw_storage)
            update_list.append(param)

        # 2. Handle unsliced params
        for grad_storage in self._grad_storages.values():
            grad_storage.buffer.scale_(scale=self._world_size_scaling)
            dist.all_reduce(tensor=grad_storage.buffer,
                            group=self._group,
                            use_calc_stream=True)
            dist.wait(tensor=grad_storage.buffer,
                      group=self._group,
                      use_calc_stream=True)

        if self._offload:
            for param in list(self._unslice_params):
                param._clear()
                param.master_weight._share_buffer_to(param)

            for grad_storage in self._grad_storages.values():
                for p in grad_storage._params:
                    tmp_g = _device2cpu(p.grad, convert_dtype=True)
                    p.clear_gradient(False)
                    p._gradient_set_empty(False)
                    p._copy_gradient_from(tmp_g)
                    tmp_g._clear()
                grad_storage.buffer._clear()

        return update_list
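The offload path relies on helpers such as _device2cpu / _cpu2device to shuttle slices between GPU and host memory. A hypothetical stand-in for _device2cpu, under the assumption (suggested by the convert_dtype argument and the fp32 casts above) that it optionally casts to fp32 before moving the tensor off the device:

import paddle

def device2cpu_sketch(tensor, convert_dtype=False):
    # Optionally promote mixed-precision gradients to fp32, since the
    # CPU-side accumulation buffers are kept in fp32.
    if convert_dtype and tensor.dtype != paddle.float32:
        tensor = tensor.cast("float32")
    # Move to host memory so the device copy can be released.
    return tensor.cpu()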
Example #7
    def _broadcast_params(self):
        """Broadcast the parameters of the current rank to each rank."""

        assert self._default_device == "gpu", "Only gpu is supported"

        # Exchange all the shards with the other ranks
        for dtype_per_rank in self.param_storages.values():
            for dst_rank, internal_storage in dtype_per_rank.items():
                broadcast(tensor=internal_storage.buffer,
                          src=self.group.ranks[dst_rank],
                          group=self.group,
                          use_calc_stream=True)

            # Multi stream operation will be supported later
            wait(tensor=internal_storage.buffer,
                 group=self.group,
                 use_calc_stream=True)
Example #8
def _allgather_buffer(trainable_params,
                      group,
                      use_calc_stream,
                      task_flow,
                      sync_wait=False,
                      offload=False,
                      convert2cpu=False):

    for param in trainable_params:
        if param.status == "all":
            param.use_count += 1
            continue

        if offload:
            param.fw_storage = _cpu2device(param)

        with paddle.amp.auto_cast(enable=False):
            full_param = _all_gather(param.fw_storage,
                                     group,
                                     use_calc_stream=use_calc_stream)

        # Allgather current layer in the 1st step synchronously
        if sync_wait:
            with paddle.amp.auto_cast(enable=False):
                dist.wait(tensor=full_param,
                          group=group,
                          use_calc_stream=use_calc_stream)
            core.VarBase(full_param._slice(
                0, param._numel()))._share_buffer_to(param)
            param.fw_storage._clear()
            param.fw_storage = None
            param.status = "all"
            param.use_count += 1
        task_flow.full_param[param.name] = full_param

        # Convert the parameter to cpu
        if convert2cpu:
            p_name = param.name
            param = _device2cpu(param)
            tmp_var = task_flow.full_param.pop(p_name)
            tmp_var._clear()
            task_flow.full_param[p_name] = param

    return task_flow
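In the surrounding sharding-stage-3 code, a gather like _allgather_buffer is typically triggered right before a layer's forward pass. A hedged sketch of that wiring through Paddle's public register_forward_pre_hook; gather_fn, params_of_layer, group and task_flow are placeholders for objects the real module provides:

def make_gather_pre_hook(gather_fn, params_of_layer, group, task_flow):
    """Build a forward pre-hook that materializes the full parameters of a
    layer before its forward runs, following the _allgather_buffer signature
    shown above."""
    def hook(layer, inputs):
        gather_fn(params_of_layer,
                  group,
                  use_calc_stream=True,
                  task_flow=task_flow,
                  sync_wait=True)
    return hook

# Usage sketch (placeholders, not the real internals):
# sub_layer.register_forward_pre_hook(
#     make_gather_pre_hook(_allgather_buffer, params, group, task_flow))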