Exemplo n.º 1
0
    def allreduce_grad(self, model):
        """Exchange the gradients of ``model`` through a node-level hierarchy.

        The gradients are packed into ``gpu_buffer_a`` as 4-byte elements,
        summed within the node by NCCL ``reduce``, passed through
        ``inter_allreduce_gpu`` by the process whose ``intra_rank`` is 0,
        broadcast within the node and finally handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params`` to obtain the
                parameters whose ``grad`` arrays are exchanged.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params(model)
        itemsize = 4  # bytes per element; matches NCCL_FLOAT used below
        n_elems_total = sum(param.grad.size for param in params)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        n_elems_per_node = (
            (n_elems_total + self.inter_size - 1) // self.inter_size)
        n_bytes_per_node = n_elems_per_node * itemsize
        # Rounded up so the buffer splits into inter_size equal chunks.
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)
        _memory_utility.pack_params(params, itemsize, 'grad',
                                    self.gpu_buffer_a)

        # Intra-node reduce (buffer A -> buffer B)
        self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                    self.gpu_buffer_b.ptr(), n_elems_total,
                                    nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                    stream.ptr)

        # Inter-node allreduce, performed only by intra-node rank 0
        if self.intra_rank == 0:
            _communication_utility.inter_allreduce_gpu(
                self.inter_mpi_comm, self.size, self.gpu_buffer_a,
                self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
                n_bytes_per_node, stream)

        # Intra-node bcast of buffer B
        self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                                   nccl.NCCL_FLOAT, 0, stream.ptr)

        _memory_utility.unpack_params(params, itemsize, 'grad',
                                      self.gpu_buffer_b)
    def multi_node_mean_grad(self, model, zero_fill=False):
        """Exchange the gradients of ``model`` in three collective steps.

        The gradients are packed into ``gpu_buffer_a`` as ``float32``, then
        processed by an intra-node NCCL ``reduceScatter``, by
        ``inter_allreduce_gpu`` across nodes, and by an intra-node NCCL
        ``allGather`` before being handed to ``unpack_params``.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
            zero_fill: Forwarded unchanged to the ``_memory_utility``
                helpers. NOTE(review): presumably makes missing gradients
                count as zeros -- confirm against those helpers.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        itemsize = 4  # bytes per element; matches float32 / NCCL_FLOAT
        n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        # int() keeps a plain Python int whatever the helper returned.
        n_elems_per_node_2d = int(
            (n_elems_total + self.size - 1) // self.size)
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        # Rounded up so the buffer splits into self.size equal chunks.
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype, zero_fill)

        # Debug mode only: wait for the stream, then run the pre-exchange
        # check on views of both buffers.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self._check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                           self.gpu_buffer_b.ptr(),
                                           n_elems_per_node_1d,
                                           nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                           stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                       self.gpu_buffer_a.ptr(),
                                       n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                       stream.ptr)

        # Debug mode only: wait for the stream, then check the result buffer.
        if chainer.is_debug():
            stream.synchronize()
            self._ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                      allreduce_grad_dtype, zero_fill)
    def allreduce_grad(self, model):
        """Exchange the gradients of ``model`` in three collective steps.

        The gradients are packed into ``gpu_buffer_a`` as ``float32``, then
        processed by an intra-node NCCL ``reduceScatter``, by
        ``inter_allreduce_gpu`` across nodes, and by an intra-node NCCL
        ``allGather`` before being handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4  # bytes per element; matches float32 / NCCL_FLOAT
        n_elems_total = sum(param.grad.size for param in params)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        n_elems_per_node_2d = (n_elems_total + self.size - 1) // self.size
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        # Rounded up so the buffer splits into self.size equal chunks.
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(
            params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)

        # Debug mode only: wait for the stream, then run the pre-exchange
        # check on views of both buffers.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self.check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(),
            n_elems_per_node_1d, nccl.NCCL_FLOAT, nccl.NCCL_SUM, stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size,
            self.gpu_buffer_a, self.gpu_buffer_b,
            n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(
            self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(),
            n_elems_per_node_1d, nccl.NCCL_FLOAT, stream.ptr)

        # Debug mode only: wait for the stream, then check the result buffer.
        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))

        _memory_utility.unpack_params(
            params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)
Exemplo n.º 4
0
    def allreduce_grad(self, model, zero_fill=False):
        """Exchange the gradients of ``model`` through a node-level hierarchy.

        The gradients are packed into ``gpu_buffer_a`` as ``float32``,
        summed within the node by NCCL ``reduce``, passed through
        ``inter_allreduce_gpu`` by the process whose ``intra_rank`` is 0,
        broadcast within the node and finally handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
            zero_fill: Forwarded unchanged to the ``_memory_utility``
                helpers. NOTE(review): presumably makes missing gradients
                count as zeros -- confirm against those helpers.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        itemsize = 4  # bytes per element; matches float32 / NCCL_FLOAT
        n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        # int() keeps a plain Python int whatever the helper returned.
        n_elems_per_node = int(
            (n_elems_total + self.inter_size - 1) // self.inter_size)
        n_bytes_per_node = n_elems_per_node * itemsize
        # Rounded up so the buffer splits into inter_size equal chunks.
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype, zero_fill, stream)

        # Debug mode only: wait for the stream, then run the pre-exchange
        # check on views of both buffers.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self.check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce (buffer A -> buffer B)
        self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                    self.gpu_buffer_b.ptr(), n_elems_total,
                                    nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                    stream.ptr)

        # Inter-node allreduce, performed only by intra-node rank 0
        if self.intra_rank == 0:
            _communication_utility.inter_allreduce_gpu(
                self.inter_mpi_comm, self.size, self.gpu_buffer_a,
                self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
                n_bytes_per_node, stream)

        # Intra-node bcast of buffer B
        self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                                   nccl.NCCL_FLOAT, 0, stream.ptr)

        # Debug mode only: wait for the stream, then check the result buffer.
        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                      allreduce_grad_dtype, zero_fill, stream)
    def allreduce_grad(self, model):
        """Exchange the gradients of ``model`` in three collective steps.

        The gradients are packed into ``gpu_buffer_a`` as ``float32``, then
        processed by an intra-node NCCL ``reduceScatter``, by
        ``inter_allreduce_gpu`` across nodes, and by an intra-node NCCL
        ``allGather`` before being handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4  # bytes per element; matches float32 / NCCL_FLOAT
        n_elems_total = sum(param.grad.size for param in params)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        n_elems_per_node_2d = (n_elems_total + self.size - 1) // self.size
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        # Rounded up so the buffer splits into self.size equal chunks.
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                           self.gpu_buffer_b.ptr(),
                                           n_elems_per_node_1d,
                                           nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                           stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                       self.gpu_buffer_a.ptr(),
                                       n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                       stream.ptr)

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                      allreduce_grad_dtype)
Exemplo n.º 6
0
    def allreduce_grad(self, model):
        """Exchange the gradients of ``model`` in three collective steps.

        The gradients are packed into ``gpu_buffer_a`` as 4-byte elements,
        then processed by an intra-node NCCL ``reduceScatter``, by
        ``inter_allreduce_gpu`` across nodes, and by an intra-node NCCL
        ``allGather`` before being handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4  # bytes per element; matches NCCL_FLOAT used below
        n_elems_total = sum(param.grad.size for param in params)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        n_elems_per_node_2d = (n_elems_total + self.size - 1) // self.size
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        # Rounded up so the buffer splits into self.size equal chunks.
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        _memory_utility.pack_params(
            params, itemsize, 'grad', self.gpu_buffer_a)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(),
            n_elems_per_node_1d, nccl.NCCL_FLOAT, nccl.NCCL_SUM, stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size,
            self.gpu_buffer_a, self.gpu_buffer_b,
            n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(
            self.gpu_buffer_b.ptr(), self.gpu_buffer_a.ptr(),
            n_elems_per_node_1d, nccl.NCCL_FLOAT, stream.ptr)

        _memory_utility.unpack_params(
            params, itemsize, 'grad', self.gpu_buffer_a)
Exemplo n.º 7
0
    def allreduce_grad(self, model):
        """Exchange the gradients of ``model`` through a node-level hierarchy.

        The gradients are packed into ``gpu_buffer_a`` as 4-byte elements,
        summed within the node by NCCL ``reduce``, passed through
        ``inter_allreduce_gpu`` by the process whose ``intra_rank`` is 0,
        broadcast within the node and finally handed to ``unpack_params``.

        NOTE(review): whether the final values are sums or means depends on
        ``inter_allreduce_gpu`` (it receives ``self.size``) -- confirm there.

        Args:
            model: Passed to ``_memory_utility.extract_params_set_grad`` to
                obtain the parameters whose ``grad`` arrays are exchanged.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4  # bytes per element; matches NCCL_FLOAT used below
        n_elems_total = sum(param.grad.size for param in params)
        # Ceiling division in pure integer arithmetic: the result does not
        # depend on whether ``/`` is true division and never goes via float.
        n_elems_per_node = (
            (n_elems_total + self.inter_size - 1) // self.inter_size)
        n_bytes_per_node = n_elems_per_node * itemsize
        # Rounded up so the buffer splits into inter_size equal chunks.
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        _memory_utility.pack_params(
            params, itemsize, 'grad', self.gpu_buffer_a)

        # Intra-node reduce (buffer A -> buffer B)
        self.intra_nccl_comm.reduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)

        # Inter-node allreduce, performed only by intra-node rank 0
        if self.intra_rank == 0:
            _communication_utility.inter_allreduce_gpu(
                self.inter_mpi_comm, self.size,
                self.gpu_buffer_a, self.gpu_buffer_b,
                n_bytes_buffer, n_elems_per_node, n_bytes_per_node, stream)

        # Intra-node bcast of buffer B
        self.intra_nccl_comm.bcast(
            self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
            stream.ptr)

        _memory_utility.unpack_params(
            params, itemsize, 'grad', self.gpu_buffer_b)