Example #1
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    Generate code for a softmax.

    On entry, `buf` and `buf2` must contain two identical copies of
    the input to softmax.

    After the code returns, `buf` contains the softmax and `buf2` contains
    the un-normalized softmax.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    dtype
        Dtype of the softmax's output.

    Notes
    -----
    `buf` and `buf2` should be in GPU shared memory; we access them many
    times.

    We use __i as an int variable in a loop.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    # get max of buf (trashing all but buf[0])
    return [inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_max = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
            buf2 + '[__i] = ' + buf + '[__i]',
            '}',
            '__syncthreads()',
            inline_reduce_sum(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_sum = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            # divide each exp() result by the sum to complete the job.
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = ' + buf2 + '[__i] / row_sum',
            '}',
            '__syncthreads()',
            ]
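The helper above returns a list of C statements rather than a finished kernel. A minimal usage sketch, not from the source, of how a caller might splice that list into a kernel body, assuming nested statement lists are flattened and joined with ";\n" (which is what the nvcc_kernel helper used in later examples appears to do):

def flatten_statements(items):
    # flatten nested statement lists produced by the inline_* helpers
    for it in items:
        if isinstance(it, (list, tuple)):
            yield from flatten_statements(it)
        else:
            yield it

# `inline_softmax` is the generator defined above; this assumes the Theano
# gpuarray helpers it calls (inline_reduce_max, inline_reduce_sum, gpuarray)
# are importable.
stmts = inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x')
body = ";\n".join(flatten_statements(stmts)) + ";"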
Example #2
    def gpu_kernels(self, node, name):
        dt = node.inputs[0].type
        code = """
KERNEL void doublek(GLOBAL_MEM %(ctype)s *out,
                   GLOBAL_MEM const %(ctype)s *a,
                   ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
        return [Kernel(code=code, name="doublek",
                       params=[gpuarray.GpuArray,
                               gpuarray.GpuArray,
                               gpuarray.SIZE],
                       flags=Kernel.get_flags(dt))]
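The kernel source above is built with Python %-dict formatting. A tiny self-contained illustration of that substitution step (the ctype value here is an assumption of what dtype_to_ctype would return for float32):

tmpl = "GLOBAL_MEM %(ctype)s *out, GLOBAL_MEM const %(ctype)s *a"
print(tmpl % dict(ctype="ga_float"))
# -> GLOBAL_MEM ga_float *out, GLOBAL_MEM const ga_float *a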
Example #3
File: sort.py  Project: gvtulder/Theano
    def gpu_kernels(self, node, nodename):
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
        common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

        # prepare "$" macros
        if device_type == b'cuda':
            ndim = node.inputs[0].ndim
            dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim))
            dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim))
            src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim))
            set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
                dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
                dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
            set_slice_code = ''.join(
                set_slice_code % dict(i=j) for j in range(1, ndim))
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
                dstv='INPUT_TYPE *dstv,' if self.return_values else '',
                dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
                dstv_strides=dstv_strides_code if self.return_values else '',
                dsti_strides=dsti_strides_code if self.return_indices else '',
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
                use_half=int(node.inputs[0].dtype == 'float16')
                )
        elif device_type == b'opencl':
            raise NotImplementedError()

        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(os.path.join(
            os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext
        )) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            with open(os.path.join(
                os.path.dirname(__file__), 'c_code', fname)
            ) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=Template(common_src + kernel_src).substitute(**subs),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename)
            return ker

        subs['count_t'] = 'int'
        kernels.append(
            build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
        subs['kname'] = 'k_topk_dense_large'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
        subs['count_t'] = 'long long'
        subs['kname'] = 'k_topk_dense_xlarge'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
        return kernels
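build_kernel above relies on string.Template-style "$" macros in the kernel source files. A small, self-contained sketch of that substitution step (the template fragment here is made up for illustration; the real sources live in the c_code directory):

from string import Template

src = "KERNEL void ${kname}(INPUT_TYPE *dstv, $count_t k) { /* ndim = $ndim */ }"
subs = dict(kname="k_topk_dense", count_t="int", ndim="3")
print(Template(src).substitute(**subs))
# -> KERNEL void k_topk_dense(INPUT_TYPE *dstv, int k) { /* ndim = 3 */ }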
Example #4
def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
                                sm, sm_stride, write_sm,
                                threadPos, threadCount,
                                b='', stride_b='', load_b='',
                                dtype="float32"):
    """
    Generate code to perform softmax with a fixed amount of shared
    memory.

    On entry, `buf` is assumed to be empty.

    On exit, the normalized softmax has been written to `sm` (through
    `write_sm`), and `buf[0]` holds the row sum.

    Parameters
    ----------
    N
        Length of the buffer; at least warpSize (32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).
    x
        A pointer to the GPU memory where the row is stored.
    stride_x
        The stride between each element in x.
    load_x
        Wrapper to read from x.
    sm
        A pointer to the GPU memory to store the result.
    sm_stride
        The stride between each sm element.
    write_sm
        Wrapper before writing to sm.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the softmax's output if not float32.

    Notes
    -----
    `buf` should be in GPU shared memory; we access it many times.

    We use tx as an int variable in a loop.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
                                       threadPos, threadCount,
                                       b, stride_b, load_b,
                                       dtype),
        '__syncthreads()',
        ('%s row_max = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
                                   threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b, load_b, dtype),
        '__syncthreads()',
        ('%s row_sum = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
        ]
    # This sets all the values correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "  %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
            "            %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
            " / row_sum)" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
            " / row_sum)" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
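As a plain reference for what the generated code computes, here is a NumPy sketch (not the Theano implementation) under the assumption that load_x/load_b/write_sm are identity conversions:

import numpy as np

def softmax_fixed_shared_ref(x, b=None):
    # row softmax of x (+ optional bias): subtract the max, exponentiate,
    # normalize by the row sum -- the same steps the generated code performs
    row = np.asarray(x, dtype="float32")
    if b is not None:
        row = row + np.asarray(b, dtype="float32")
    e = np.exp(row - row.max())
    return e / e.sum()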
Example #5
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        if node.inputs[0].type.context.kind != b'cuda':
            f = ''
        else:
            f = '' if dtype_x == 'float64' else 'f'
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
        ]
        sio = StringIO()
        print("""#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
          for (ga_int row = GID_0; row < M; row += GDIM_0){
            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;
            // COMPUTE ROW MAX AND ARGMAX
            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();
            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();
            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();
            for (ga_int j = LID_0; j < N; j += LDIM_0) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }
            if (LID_0 == 0) {
              const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """ % locals(), file=sio)

        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
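A hedged NumPy reference for what each row of the kernel above produces (the softmax sm, the argmax am, and the negative log-likelihood nll of the target index); this mirrors the math in the kernel, not its parallel reductions:

import numpy as np

def xent_sm_1hot_bias_ref(x_row, b, y_idx):
    row = np.asarray(x_row, dtype="float64") + np.asarray(b, dtype="float64")
    row_max = row.max()
    e = np.exp(row - row_max)
    total = e.sum()
    sm = e / total
    if y_idx < 0 or y_idx >= row.size:
        nll = 0.0  # out-of-range target: the kernel writes 0 and moves on
    else:
        nll = -row[y_idx] + row_max + np.log(total)
    return nll, sm, int(row.argmax())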
Example #6
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
                   tc=np.dtype(dtype_x).char)
        from pygpu.gpuarray import SIZE, SSIZE
        params = [
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
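For reference, a sequential NumPy sketch of what k_vector_add_fast does (set or accumulate rows of Y into rows of X selected by indices, with negative-index wrap-around and an error flag); the kernel performs the same update atomically and in parallel:

import numpy as np

def vector_add_fast_ref(X, Y, indices, set_instead_of_inc=False):
    err = 0
    for i, idx in enumerate(np.asarray(indices).astype(np.int64)):
        if idx < 0:
            idx += X.shape[0]          # negative indices wrap around
        if 0 <= idx < X.shape[0]:
            if set_instead_of_inc:
                X[idx] = Y[i]
            else:
                X[idx] += Y[i]
        else:
            err = 1                    # mirrors the *err = 1 flag in the kernel
    return err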
Example #7
    def c_code(self, node, name, inputs, outputs, sub):
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)

        # declare the broadcast pattern of each input
        emitted_inames = {}
        code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
        if nd > 0:
            code += """
            size_t dims[%(nd)s] = {%(initial_dims)s};
            """ % locals()
        else:
            code += """
            size_t *dims = NULL;
            """
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[idx]
                continue

            broadcasts = map(int, node.inputs[idx].broadcastable)
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[idx].ndim
            if nd > 0:
                code += """
                int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
                """ % locals()
            else:
                code += """
                int *broadcasts_%(iname)s = NULL;
                """ % locals()
            emitted_inames[iname] = node.inputs[idx]

        #check that all inputs have valid dimensions
        emitted_inames = {}
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True
        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            typecode = dtype_to_typecode(node.outputs[idx].dtype)
            if idx not in self.inplace_pattern.keys():
                code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
                            pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                        //TODO: this check doesn't seem good.
                        //TODO: set exception?
                            %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                           (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

                //std::cerr << "calling callkernel returned\\n";
        """ % locals()

        code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
        param = []
        for i in range(nd):
            param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
                                                              i=i))
        for n, (name, var) in enumerate(zip(inputs + outputs,
                                       node.inputs + node.outputs)):
            if (n - len(inputs)) in self.inplace_pattern:
                continue
            dtype = dtype_to_ctype(var.dtype)
            param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
            param.append("%(name)s->ga.offset" % locals())
            for i in range(nd):
                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
        code += ',\n'.join(param) + ");\n"
        if config.gpuarray.sync:
            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
        return str(code)
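The launch-size arithmetic emitted near the end of c_code can be restated in Python for clarity (same heuristic, expressed directly; the constants 32, 30, and 256 come from the generated C above):

def launch_dims(numEls, warp_size=32, n_multiprocessors=30, max_threads=256):
    # assumes numEls > 0, like the generated C
    # first use at least a full warp
    threads_per_block = min(numEls, warp_size)
    # next start adding blocks, up to the assumed multiprocessor count
    n_blocks = min(numEls // threads_per_block + (1 if numEls % threads_per_block else 0),
                   n_multiprocessors)
    # finally widen the blocks if the grid still does not cover numEls
    if threads_per_block * n_blocks < numEls:
        threads_per_block = min(numEls // n_blocks, max_threads)
    return n_blocks, threads_per_block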
Example #8
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """
/*
 * This is an atomicAdd that works for doubles since that is not provided
 * natively by cuda before arch 6.0.
 */
#if __CUDA_ARCH__ < 600
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
    ga_ulong *address_as_ull = (ga_ulong *)address;
    ga_ulong old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif

__device__ ga_double atomicExch(ga_double *address, ga_double val) {
    return atomicExch((ga_ulong *)address,
                      __double_as_longlong(val));
}

/* GA_LONG */

__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
    ga_ulong *address_as_ull = (ga_ulong *)address;
    ga_ulong old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        (ga_ulong)(val + (ga_long)assumed));
    } while (assumed != old);
    return (ga_long)old;
}

__device__ ga_long atomicExch(ga_long *address, ga_long val) {
    return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
}


/* GA_HALF */

/*
 * This may read and write 2 bytes more than the size of the array
 * if the array has an uneven number of elements.  The actual value
 * at that spot will not be modified.
 */
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, sum, new_;
  old = *base;
  do {
    assumed = old;
    sum = __float2half_rn(
      __half2float(val) +
      __half2float((ga_half)__byte_perm(old, 0,
                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, new_;
  old = *base;
  do {
    assumed = old;
    new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const int set_instead_of_inc,
                                      ga_int *err)
        {
             X = (%(type_x)s *)(((char *)X)+offset_X);
             Y = (%(type_y)s *)(((char *)Y)+offset_Y);
             indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
        params = [
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int',
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
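The ga_half overloads above splice a 16-bit value into its containing 32-bit word before the compare-and-swap. A single-threaded Python sketch of that splicing step (illustration only; the real code retries with atomicCAS until the word is unchanged):

import numpy as np

def half_add_in_word(word, addr_offset, val):
    # pick the 16-bit half of the aligned 32-bit word the address falls in
    hi = bool(addr_offset & 2)
    old_bits = (word >> 16) & 0xFFFF if hi else word & 0xFFFF
    # reinterpret the bits as float16, add, and reinterpret back
    new_half = np.uint16(old_bits).view(np.float16) + np.float16(val)
    new_bits = int(new_half.view(np.uint16))
    # splice the updated half back into the 32-bit word
    if hi:
        return (new_bits << 16) | (word & 0x0000FFFF)
    return (word & 0xFFFF0000) | new_bits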
Example #9
File: nnet.py  Project: yyq90/grammarVAE
 def gpu_kernels(self, node, nodename):
     dtype_x = node.inputs[0].dtype
     dtype_b = node.inputs[1].dtype
     dtype_sm = node.outputs[0].dtype
     load_x = load_w(node.inputs[0].dtype)
     load_b = load_w(node.inputs[1].dtype)
     write_sm = write_w(node.outputs[0].dtype)
     work_sm = work_dtype(node.outputs[0].dtype)
     flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
     type_x = gpuarray.dtype_to_ctype(dtype_x)
     type_b = gpuarray.dtype_to_ctype(dtype_b)
     type_sm = gpuarray.dtype_to_ctype(dtype_sm)
     type_acc = gpuarray.dtype_to_ctype(work_sm)
     params = [
         'uintp', 'uintp',
         gpuarray.GpuArray, 'uintp', 'intp', 'intp',
         gpuarray.GpuArray, 'uintp', 'intp',
         gpuarray.GpuArray, 'uintp', 'intp', 'intp'
         ]
     kernels = []
     kname = "kSoftmaxWithBias"
     k_var = "kSoftmaxWithBias_" + nodename
     code = nvcc_kernel(
         kname,
         params=['const ga_size M', 'const ga_size N',
                 'const %s * x' % type_x, 'const ga_size offset_x',
                 'const ga_ssize sx0', 'const ga_ssize sx1',
                 'const %s * b' % type_b, 'const ga_size offset_b',
                 'const ga_ssize sb0',
                 '%s * sm' % type_sm, 'const ga_size offset_sm',
                 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
         body=["extern __shared__ %s buf[]" % type_acc,
               "%s * buf2 = buf + N" % type_acc,
               "x = (const %s *)(((char *)x)+offset_x)" % type_x,
               "b = (const %s *)(((char *)b)+offset_b)" % type_b,
               "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
               "for (int blockIDX = blockIdx.x; blockIDX < M;"
               "     blockIDX += gridDim.x){",
               "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
               "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
               "buf[tx] += %s(b[tx * sb0])" % load_b,
               "buf2[tx] = buf[tx]",
               "}",
               "__syncthreads()",
               inline_softmax('N', 'buf', 'buf2',
                              'threadIdx.x', 'blockDim.x', work_sm),
               "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
               "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
               "}",
               "__syncthreads()",
               "}",
               ])
     kernels.append(Kernel(code=code, name=kname, params=params,
                           flags=flags, objvar=k_var))
     kname = "kSoftmaxWithBias_fixed_shared"
     k_var = "kSoftmaxWithBias_fixed_shared" + nodename
     code = nvcc_kernel(
         kname,
         params=['const ga_size M', 'const ga_size N',
                 'const %s * x' % type_x, 'const ga_size offset_x',
                 'const ga_ssize sx0', 'const ga_ssize sx1',
                 'const %s * b' % type_b, 'const ga_size offset_b',
                 'const ga_ssize sb0',
                 '%s * sm' % type_sm, 'const ga_size offset_sm',
                 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
         body=["extern __shared__ %s buf[]" % type_acc,
               "x = (const %s *)(((char *)x)+offset_x)" % type_x,
               "b = (const %s *)(((char *)b)+offset_b)" % type_b,
               "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
               "for (int blockIDX = blockIdx.x; blockIDX < M;"
               "     blockIDX += gridDim.x){",
               "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
               "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
               inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                           load_x,
                                           'sm_ptr', 'sm_s1', write_sm,
                                           'threadIdx.x', 'blockDim.x',
                                           'b', 'sb0', load_b, work_sm),
               "__syncthreads()",
               "}",
               ])
     kernels.append(Kernel(code=code, name=kname, params=params,
                           flags=flags, objvar=k_var))
     return kernels
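A note on the launch requirements implied by the kernel bodies above (a sketch; treating the accumulator dtype work_sm as float32 is an assumption): the first kernel lays buf and buf2 = buf + N side by side in the dynamically sized shared buffer, so it needs 2 * N accumulator slots, while the _fixed_shared variant only needs the small fixed buffer described in the inline_softmax_fixed_shared docstring (Example #4).

import numpy as np

def ksoftmaxwithbias_shared_bytes(N, work_dtype="float32"):
    # buf[0:N] holds the biased row and buf2[N:2N] its exp(); both live in
    # the dynamic shared memory declared with "extern __shared__"
    return 2 * N * np.dtype(work_dtype).itemsize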
Example #10
File: nnet.py  Project: yyq90/grammarVAE
    def gpu_kernels(self, node, nodename):
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        work_dnll = work_dtype(dtype_dnll)
        load_dnll = load_w(dtype_dnll)
        load_sm = load_w(dtype_sm)
        write_dx = write_w(dtype_dx)
        flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
        type_dnll = gpuarray.dtype_to_ctype(work_dnll)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           const %(type_dnll)s* dnll, const ga_size offset_dnll,
           const ga_ssize dnll_s0,
           const %(type_sm)s* sm, const ga_size offset_sm,
           const ga_ssize sm_s0, const ga_ssize sm_s1,
           const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
           const ga_ssize y_idx_s0,
           %(type_dx)s* dx, const ga_size offset_dx,
           const ga_ssize dx_s0, const ga_ssize dx_s1)
        {
            dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
            sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
            y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
            dx = (%(type_dx)s *)(((char *)dx)+offset_dx);

            for (int i = blockIdx.x; i < N; i += gridDim.x)
            {
                %(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
                %(type_y_idx)s y_i = y_idx[i * y_idx_s0];

                for (int j = threadIdx.x; j < K; j += blockDim.x)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              %(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
                    }
                }
            }
        }
        """ % locals(), file=sio)
        params = [
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp'
            ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
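A hedged NumPy reference for the gradient the kernel above writes (dense here; the kernel computes it row- and column-parallel):

import numpy as np

def xent_1hot_dx_ref(dnll, sm, y_idx):
    # dx[i, j] = dnll[i] * (sm[i, j] - [j == y_idx[i]])
    dx = sm * dnll[:, None]
    dx[np.arange(sm.shape[0]), y_idx] -= dnll
    return dx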
Example #11
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)

        ctype = gpuarray.dtype_to_ctype(work_sm)

        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                    red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels
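The repeated reduction blocks in the kernels above follow a two-phase pattern: fold the row into the first GA_WARP_SIZE slots with a strided loop, then halve within the warp. A sequential Python sketch of that pattern (illustration only; on the GPU each lane runs its iteration in parallel, separated by barriers):

def warp_reduce_max(buf, warp_size=32):
    # phase 1: fold elements beyond the first warp into slots [0, warp_size)
    n = len(buf)
    for lane in range(min(warp_size, n)):
        for i in range(lane + warp_size, n, warp_size):
            buf[lane] = max(buf[lane], buf[i])
    # phase 2: halving tree inside the warp; the result ends up in buf[0]
    stride = warp_size // 2
    while stride > 0:
        for lane in range(stride):
            if lane + stride < n:
                buf[lane] = max(buf[lane], buf[lane + stride])
        stride //= 2
    return buf[0]

The row-sum reductions in the same kernels follow the identical pattern with max replaced by addition.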
Example #12
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        if node.inputs[0].type.context.kind != b"cuda":
            f = ""
        else:
            f = "" if dtype_x == "float64" else "f"
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
            GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0,
            GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0,
            GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1,
            GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values))
        {
          x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x);
          b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
          y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx);
          nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll);
          sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm);
          am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am);
          for (ga_int row = GID_0; row < M; row += GDIM_0){
            GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row;
            GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row;
            GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values);
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM ga_int row_max_threadIdx;
            %(work_x)s per_thread_row_max, per_thread_sum;
            ga_int per_thread_row_max_j;
            // COMPUTE ROW MAX AND ARGMAX
            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[LID_0] = per_thread_row_max;
            local_barrier();
            if (LID_0 == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (ga_int j = 0; j < LDIM_0; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }
            local_barrier();
            // The thread with the highest max writes out which of its
            // values was the winner.
            if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;
            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (ga_int j = LID_0; j < N; j += LDIM_0)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }
            per_thread_values[LID_0] = per_thread_sum;
            local_barrier();
            if (LID_0 == 0) {
              sum = 0.0;
              for (ga_int j = 0; j < LDIM_0; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }
            local_barrier();
            for (ga_int j = LID_0; j < N; j += LDIM_0) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }
            if (LID_0 == 0) {
              const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """ % locals(),
            file=sio,
        )

        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]
Example #13
    def gpu_kernels(self, node, nodename):
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        work_dnll = work_dtype(dtype_dnll)
        load_dnll = load_w(dtype_dnll)
        load_sm = load_w(dtype_sm)
        write_dx = write_w(dtype_dx)
        flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
        wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
        type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        params = [
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
        ]
        sio = StringIO()
        print(
            """#include "cluda.h"

        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
           GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
           GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
           GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
        {
            dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
            sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
            dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
            for (ga_int i = GID_0; i < N; i += GDIM_0)
            {
                %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
                %(type_y_idx)s y_i = y_idx[i * y_idx_s0];
                for (ga_int j = LID_0; j < K; j += LDIM_0)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              %(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
                    }
                }
            }
        }
        """ % locals(),
            file=sio,
        )
        return [
            Kernel(
                code=sio.getvalue(),
                name=kname,
                params=params,
                flags=flags,
                objvar=k_var,
            )
        ]
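The work_dtype, load_w and write_w helpers used above come from Theano's fp16 support (theano/gpuarray/fp16_help.py): float16 storage is loaded into a float32 work type, computed on, and converted back when written. A minimal sketch of what these helpers return (the conversion names mirror the cluda intrinsics and are stated here as assumptions):

    def work_dtype(dtype):
        # compute in float32 when the storage dtype is float16
        return 'float32' if dtype == 'float16' else dtype

    def load_w(dtype):
        # conversion wrapped around loads from global memory
        return 'ga_half2float' if dtype == 'float16' else ''

    def write_w(dtype):
        # conversion wrapped around stores to global memory
        return 'ga_float2half' if dtype == 'float16' else ''

After the %-substitution, %(load_dnll)s(dnll[i * dnll_s0]) therefore expands either to a plain load or to ga_half2float(dnll[i * dnll_s0]).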
Example #14
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(work_x)
        type_b = gpuarray.dtype_to_ctype(work_b)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
            %(type_x)s* sm = sm_data + sms0 * row;

            %(type_x)s sum = 0.0;
            int row_max_j = 0;
            %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
            for (int j = 1; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                //todo: store to shared memory
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max   = (row_ij > row_max) ? row_ij : row_max;
            }
            //compute the exp
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                %(type_x)s sm_ij = exp(row_ij - row_max);
                sum += sm_ij;
                sm[j * sms1] = %(write_x)s(sm_ij);
            }
            %(type_x)s sum_inv = 1.0 / sum;
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
                __tmp *= sum_inv;
                sm[j * sms1] = %(write_x)s(__tmp);
            }
            if ((y_idx >= N) || (y_idx < 0))
            {
                //TODO: set an error bit in a global var?
                nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
            }
            else
            {
                nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
                           - %(load_b)s(b[y_idx*bs0])
                           + row_max
                           + log(sum));
            }
            am_data[row*ams0] = row_max_j;
          }
        }
        """ % locals(),
              file=sio)
        params = [
            'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp',
            'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray,
            'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp'
        ]
        return [
            Kernel(code=sio.getvalue(),
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var)
        ]
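For reference, a NumPy sketch of what one row of k_xent_sm_1hot_bias computes: a numerically stable softmax of x + b, the negative log-likelihood at the target index, and the argmax. This is an illustrative host-side equivalent, not part of the kernel:

    import numpy as np

    def xent_sm_1hot_bias_row(x_row, b, y):
        row = x_row + b
        row_max = row.max()          # subtracted before exp() for stability
        e = np.exp(row - row_max)
        sm = e / e.sum()             # softmax of the row
        nll = -row[y] + row_max + np.log(e.sum())  # equals -log(sm[y])
        return nll, sm, row.argmax()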
Example #15
File: nnet.py Project: Abioy/Theano
    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != 'cuda':
            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        type_intp = gpuarray.dtype_to_ctype(numpy.intp)
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        ctx = sub['params']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        sync = ""
        if config.gpuarray.sync:
            sync = """
            err = GpuArray_sync(&%(dx)s->ga);
            %(err_check)s
            """ % locals()
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
                                        PyGpuArray_DIMS(%(dnll)s)[0] :
                                        (ssize_t) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                           PyGpuArray_STRIDES(%(dnll)s)[0] :
                                           (ssize_t) 0);

        if ((PyGpuArray_NDIM(%(dnll)s) > 1)
            || (PyGpuArray_NDIM(%(sm)s) != 2)
            || (PyGpuArray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
                         %(dnll)s_dims0,
                         PyGpuArray_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_SetString(PyExc_ValueError,
                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(sm)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (PyGpuArray_DIMS(%(dx)s)[0] !=
                PyGpuArray_DIMS(%(sm)s)[0])
            || (PyGpuArray_DIMS(%(dx)s)[1] !=
                PyGpuArray_DIMS(%(sm)s)[1]))
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
                                 %(typecode_dx)s, GA_C_ORDER,
                                 %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
            ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
            ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
            ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(dx)s)[0],
                (void *)&PyGpuArray_DIMS(%(dx)s)[1],
                (void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
                (void *)&stride_DNLL0,
                (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
                (void *)&stride_SM0, (void *)&stride_SM1,
                (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
                (void *)&stride_YIDX0,
                (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
                (void *)&stride_DX0, (void *)&stride_DX1};
            int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
            %(err_check)s
            %(sync)s
        }
        assert(%(dx)s);
        """ % locals()
Example #16
File: nnet.py Project: Abioy/Theano
    def gpu_kernels(self, node, nodename):
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        work_dnll = work_dtype(dtype_dnll)
        load_dnll = load_w(dtype_dnll)
        load_sm = load_w(dtype_sm)
        write_dx = write_w(dtype_dx)
        flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
        type_dnll = gpuarray.dtype_to_ctype(work_dnll)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           const %(type_dnll)s* dnll, const ga_size offset_dnll,
           const ga_ssize dnll_s0,
           const %(type_sm)s* sm, const ga_size offset_sm,
           const ga_ssize sm_s0, const ga_ssize sm_s1,
           const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
           const ga_ssize y_idx_s0,
           %(type_dx)s* dx, const ga_size offset_dx,
           const ga_ssize dx_s0, const ga_ssize dx_s1)
        {
            dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
            sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
            y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
            dx = (%(type_dx)s *)(((char *)dx)+offset_dx);

            for (int i = blockIdx.x; i < N; i += gridDim.x)
            {
                %(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
                %(type_y_idx)s y_i = y_idx[i * y_idx_s0];

                for (int j = threadIdx.x; j < K; j += blockDim.x)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              %(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
                    }
                }
            }
        }
        """ % locals(), file=sio)
        params = [
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp'
            ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
Example #17
    def gpu_kernels(self, node, nodename):
        # We can't rely on numpy for this, it changes with the OS
        CHARMAP = dict(
            int32="i",
            uint32="I",
            int64="l",
            uint64="L",
            float16="e",
            float32="f",
            float64="d",
        )
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(
            type_x=type_x, type_y=type_y, type_ind=type_ind, tc=CHARMAP[dtype_x]
        )
        from pygpu.gpuarray import SIZE, SSIZE

        params = [
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            "int32",
            gpuarray.GpuArray,
        ]
        return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]
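In NumPy terms, k_vector_add_fast scatters the rows of Y into X along the first axis: X[indices[k], :] is either atomically incremented by Y[k, :] or overwritten, and out-of-range indices set the err flag. A hedged host-side sketch of that behaviour (illustrative only; the GPU version parallelises over indices and columns):

    import numpy as np

    def vector_add_fast(X, Y, indices, set_instead_of_inc):
        err = 0
        for k, row in enumerate(indices):
            if row < 0:
                row += X.shape[0]            # wrap negative indices
            if 0 <= row < X.shape[0]:
                if set_instead_of_inc:
                    X[row] = Y[k]
                else:
                    X[row] += Y[k]
            else:
                err = 1                      # mirrors *err = 1 in the kernel
        return err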
Example #18
    def gpu_kernels(self, node, nodename):
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
        common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

        # prepare "$" macros
        if device_type == b'cuda':
            ndim = node.inputs[0].ndim
            dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i
                                        for i in range(ndim))
            dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i
                                        for i in range(ndim))
            src_strides_code = ''.join('ssize_t src_strides_%d, ' % i
                                       for i in range(ndim))
            set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
                dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)'
                if self.return_values else '',
                dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)'
                if self.return_indices else '')
            set_slice_code = ''.join(set_slice_code % dict(i=j)
                                     for j in range(1, ndim))
            if self.return_values:
                set_slice_code += """
                dstv = ptr_add(dstv, dstv_offset);
                """
            if self.return_indices:
                set_slice_code += """
                dsti = ptr_add(dsti, dsti_offset);
                """
            set_slice_code += """
                src = ptr_add(src, src_offset);
            """
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
                dstv='INPUT_TYPE *dstv,' if self.return_values else '',
                dstv_offset='size_t dstv_offset,'
                if self.return_values else '',
                dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
                dsti_offset='size_t dsti_offset,'
                if self.return_indices else '',
                dstv_strides=dstv_strides_code if self.return_values else '',
                dsti_strides=dsti_strides_code if self.return_indices else '',
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim))
        elif device_type == b'opencl':
            raise NotImplementedError()

        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.append(ga.SIZE)  # offset
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.append(ga.SIZE)  # offset
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(
                os.path.join(os.path.dirname(__file__), 'c_code',
                             'topk_common' + common_ext)) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            with open(os.path.join(os.path.dirname(__file__), 'c_code',
                                   fname)) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=("#include <cluda.h>\n" +
                      Template(common_src + kernel_src).substitute(**subs)),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename)
            return ker

        subs['count_t'] = 'int'
        kernels.append(
            build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
        subs['kname'] = 'k_topk_dense_large'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large',
                         subs))
        subs['count_t'] = 'long long'
        subs['kname'] = 'k_topk_dense_xlarge'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext,
                         'k_topk_dense_xlarge', subs))
        return kernels
Example #19
 def gpu_kernels(self, node, name):
     dtype_d = node.inputs[0].dtype
     type_d = gpuarray.dtype_to_ctype(dtype_d)
     dtype_x = node.inputs[1].dtype
     type_x = gpuarray.dtype_to_ctype(dtype_x)
     dtype_y = node.outputs[0].dtype
     type_y = gpuarray.dtype_to_ctype(dtype_y)
     work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d))
     load_d = load_w(dtype_d)
     work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
     load_x = load_w(dtype_x)
     code = """
     #include "cluda.h"
     KERNEL void binsearchsorted(const ga_ssize stridesD0, GLOBAL_MEM %(type_d)s *d, ga_size d_off, const ga_ssize stridesX0, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_size lx, ga_ssize ld) {
         d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off);
         x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
         y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
         ga_size index = threadIdx.x + blockIdx.x * blockDim.x;
         if (index < lx) {
             ga_long a = 0;
             ga_long b = (ga_long)(ld - 1);
             %(work_d)s minval = %(load_d)s(d[a]);
             %(work_d)s maxval = %(load_d)s(d[b * stridesD0]);
             %(work_x)s val = %(load_x)s(x[index * stridesX0]);
             if (val > maxval) {
                 a = (ga_long)ld;
                 b = (ga_long)ld;
             } else if (val <= minval) {
                 a = 0;
                 b = 0;
             }
             while (b - a > 0) {
                 ga_long h = (b + a) / 2;
                 %(work_d)s t = %(load_d)s(d[h * stridesD0]);
                 if (val < t) {
                     b = h;
                 } else {
                     a = h + 1;
                 }
             }
             y[index * stridesY0] = b;
         }
     }""" % dict(type_d=type_d,
                 type_x=type_x,
                 type_y=type_y,
                 work_d=work_d,
                 load_d=load_d,
                 work_x=work_x,
                 load_x=load_x,
                 name=name)
     return [
         Kernel(code=code,
                name="binsearchsorted",
                params=[
                    gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                    gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                    gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                    gpuarray.SIZE, gpuarray.SSIZE
                ],
                flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y),
                objvar='k_binsearchsorted_' + name)
     ]
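The kernel above runs one binary search per element of x against the sorted array d, in the spirit of numpy.searchsorted. A plain-Python transcription of the search for a single value, matching the loop in the kernel (illustrative sketch; strides are omitted):

    def binsearch(d, val):
        a, b = 0, len(d) - 1
        if val > d[-1]:
            a = b = len(d)          # past the end
        elif val <= d[0]:
            a = b = 0               # before the start
        while b - a > 0:
            h = (b + a) // 2
            if val < d[h]:
                b = h
            else:
                a = h + 1
        return b                    # the value stored into y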
Example #20
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        mode = self.mode
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = (
            """
// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const size_t stride0, const size_t stride1,
            const size_t stride2, const size_t stride3,
            const %(type_ten4)s * global_ten4, const size_t offset_ten4,
            const size_t out_s0, const size_t out_s1,
            %(type_z)s * global_out, const size_t offset_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
            global_out = (%(type_z)s *)(((char *)global_out)+offset_out);

            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            int i = threadIdx.y;     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                int j = threadIdx.x;  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    int z_col = j + d * i;
                                    int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }"""
            % locals()
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = (
            """
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const size_t stride0, const size_t stride1,
            const size_t stride2, const size_t stride3,
            const %(type_ten4)s * global_ten4, const size_t offset_ten4,
            const size_t out_s0, const size_t out_s1,
            %(type_z)s * global_out, const size_t offset_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
            global_out = (%(type_z)s *)(((char *)global_out)+offset_out);

            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (int i = threadIdx.y; i < c; i+=blockDim.y)
                            {
                                int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                // loop over d
                                for (int j = threadIdx.x; j < d; j+=blockDim.x)
                                {
                                    int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    int z_col = j + d * i;
                                    int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }
        """
            % locals()
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))
        return kernels
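Both kernels flatten the four (batch, stack, grid_c, grid_d) patch indices into a single output row and the (c, d) position inside a patch into a single output column. A small sketch of that index arithmetic, transcribed from the kernel (illustrative only):

    def neib_output_position(n, s, a, b, i, j, nb_stack, grid_c, grid_d, d):
        # one output row per extracted patch, innermost index is grid_d
        z_row = b + grid_d * (a + grid_c * (s + nb_stack * n))
        # pixels of a patch are laid out row-major: i over c, j over d
        z_col = j + d * i
        return z_row, z_col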
Example #21
    def gpu_kernels(self, node, nodename):
        CHARMAP = dict(int32='i',
                       uint32='I',
                       int64='l',
                       uint64='L',
                       float16='e',
                       float32='f',
                       float64='d')
        dtype_in = node.inputs[0].dtype
        dtype_out = node.outputs[0].dtype
        dtype_idx = node.inputs[1].dtype
        type_in = gpuarray.dtype_to_ctype(dtype_in)
        type_out = gpuarray.dtype_to_ctype(dtype_out)
        type_idx = gpuarray.dtype_to_ctype(dtype_idx)
        flags = Kernel.get_flags(dtype_in, dtype_out, dtype_idx)
        kname = "k_vector_select_fast"
        k_var = "k_vector_select_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_select_fast(const ga_size numRowsOut,
                                      const ga_size numColsOut,
                                      const ga_ssize stridesOut0,
                                      const ga_ssize stridesOut1,
                                      GLOBAL_MEM %(type_out)s *Out,
                                      const ga_size offset_Out,
                                      const ga_size numRowsIn,
                                      const ga_size numColsIn,
                                      const ga_ssize stridesIn0,
                                      const ga_ssize stridesIn1,
                                      GLOBAL_MEM %(type_in)s *In,
                                      const ga_size offset_In,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_idx)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      GLOBAL_MEM ga_int *err)
        {
             Out = (GLOBAL_MEM %(type_out)s *)(((GLOBAL_MEM char *)Out)+offset_Out);
             In = (GLOBAL_MEM %(type_in)s *)(((GLOBAL_MEM char *)In)+offset_In);
             indices_arr = (GLOBAL_MEM %(type_idx)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsIn; j += LDIM_0)
                  {
                      ga_ssize in_row = indices_arr[i * stridesIndices];
                      if (in_row < 0)
                          in_row += numRowsIn;
                      ga_ssize out_row = i;
                      if (in_row < numRowsIn && in_row >= 0) {
                        Out[(out_row * stridesOut0) + (j * stridesOut1)] = In[(in_row * stridesIn0) + (j * stridesIn1)];
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_in=type_in,
                   type_out=type_out,
                   type_idx=type_idx,
                   tc=CHARMAP[dtype_in])
        from pygpu.gpuarray import SIZE, SSIZE
        params = [
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SIZE,
            SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SSIZE,
            gpuarray.GpuArray, SIZE, gpuarray.GpuArray
        ]
        return [
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var)
        ]
Example #22
    def gpu_kernels(self, node, nodename):
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        work_dnll = work_dtype(dtype_dnll)
        load_dnll = load_w(dtype_dnll)
        load_sm = load_w(dtype_sm)
        write_dx = write_w(dtype_dx)
        flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
        wtype_dnll = gpuarray.dtype_to_ctype(work_dnll)
        type_dnll = gpuarray.dtype_to_ctype(dtype_dnll)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        type_dx = gpuarray.dtype_to_ctype(dtype_dx)
        kname = "kCrossEntropySoftmax1HotWithBiasDx"
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        sio = StringIO()
        print("""#include "cluda.h"

        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
           GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1,
           GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0,
           GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1)
        {
            dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll);
            sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx);
            dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx);
            for (ga_int i = GID_0; i < N; i += GDIM_0)
            {
                %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
                %(type_y_idx)s y_i = y_idx[i * y_idx_s0];
                for (ga_int j = LID_0; j < K; j += LDIM_0)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0));
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            %(write_dx)s(dnll_i *
                              %(load_sm)s(sm[i * sm_s0 + j * sm_s1]));
                    }
                }
            }
        }
        """ % locals(), file=sio)
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
Example #23
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        mode = self.mode
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const size_t stride0, const size_t stride1,
            const size_t stride2, const size_t stride3,
            const %(type_ten4)s * global_ten4, const size_t offset_ten4,
            const size_t out_s0, const size_t out_s1,
            %(type_z)s * global_out, const size_t offset_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
            global_out = (%(type_z)s *)(((char *)global_out)+offset_out);

            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            int i = threadIdx.y;     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                int j = threadIdx.x;  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    int z_col = j + d * i;
                                    int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }""" % locals()
        params = [
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'uintp',
            'uintp',
            'uintp',
            'uintp',
            gpuarray.GpuArray,
            'uintp',
            'uintp',
            'uintp',
            gpuarray.GpuArray,
            'uintp',
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """
        KERNEL void %(kname)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const size_t stride0, const size_t stride1,
            const size_t stride2, const size_t stride3,
            const %(type_ten4)s * global_ten4, const size_t offset_ten4,
            const size_t out_s0, const size_t out_s1,
            %(type_z)s * global_out, const size_t offset_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
            global_out = (%(type_z)s *)(((char *)global_out)+offset_out);

            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (int i = threadIdx.y; i < c; i+=blockDim.y)
                            {
                                int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                // loop over d
                                for (int j = threadIdx.x; j < d; j+=blockDim.x)
                                {
                                    int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    int z_col = j + d * i;
                                    int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }
        """ % locals()
        params = [
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'intc',
            'uintp',
            'uintp',
            'uintp',
            'uintp',
            gpuarray.GpuArray,
            'uintp',
            'uintp',
            'uintp',
            gpuarray.GpuArray,
            'uintp',
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels
Example #24
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        mode = self.mode
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
// a version that uses less registers but doesn't work in all cases.
        KERNEL void %(kname)s(
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_idx_shift_x = c/2;
            const ga_int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    ga_int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }""" % locals()
        params = [
            'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
            'intc', 'intc', 'intc', 'intc',
            'uintp', 'uintp', 'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """
        KERNEL void %(kname)s(
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_idx_shift_x = c/2;
            const ga_int wrap_centered_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (ga_int i = LID_1; i < c; i+=LDIM_1)
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if("%(mode)s"=="wrap_centered"){
                                    ten4_2 -= wrap_centered_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                }
                                // loop over d
                                for (ga_int j = LID_0; j < d; j+=LDIM_0)
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if("%(mode)s"=="wrap_centered"){
                                        ten4_3 -= wrap_centered_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    }

                                    ga_int ten4_idx = stride3*ten4_3 +
                                                   stride2*ten4_2 +
                                                   stride1*s + stride0*n;

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
            }
        }
        """ % locals()
        params = [
            'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
            'intc', 'intc', 'intc', 'intc',
            'uintp', 'uintp', 'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
Example #25
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """#include "cluda.h"

        // A version that uses fewer registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED){
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }""" % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """#include "cluda.h"

        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (ga_int i = LID_1; i < c; i+=LDIM_1)
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                // loop over d
                                for (ga_int j = LID_0; j < d; j+=LDIM_0)
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }
        """ % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels
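For orientation, the following is a rough NumPy model of the indexing both kernels above perform in the plain (non-wrapping, non-padding) case. The helper name and the grid_c/grid_d formulas are assumptions for illustration, not part of the original source.

import numpy as np

def neibs_reference(ten4, step_x, step_y, c, d):
    # Model of z[z_row, j + d*i] = ten4[n, s, i + a*step_x, j + b*step_y].
    nb_batch, nb_stack, height, width = ten4.shape
    grid_c = 1 + (height - c) // step_x   # assumed "valid"-style grid sizes
    grid_d = 1 + (width - d) // step_y
    z = np.empty((nb_batch * nb_stack * grid_c * grid_d, c * d), ten4.dtype)
    for n in range(nb_batch):
        for s in range(nb_stack):
            for a in range(grid_c):
                for b in range(grid_d):
                    z_row = b + grid_d * (a + grid_c * (s + nb_stack * n))
                    patch = ten4[n, s,
                                 a * step_x:a * step_x + c,
                                 b * step_y:b * step_y + d]
                    z[z_row] = patch.reshape(-1)   # row-major, so index j + d*i
    return z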
Example #26
0
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b='', load_b='', dtype='float32'):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(dtype).
    x
        Input data.
    stride_x
        Input data stride.
    load_x
        Wrapper to read from x.
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function that accepts two argument strings, a and b, and
        returns C code for their reduction, e.g.

          return "%(a)s + %(b)s"

        for a sum reduction.
    manner_init
        A function that accepts an argument string a and returns C
        code for its initialization.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the output.

    Notes
    -----
    `buf` should be in GPU shared memory, since we access it many times.

    """
    if b:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
                           " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
                                          "%(load_b)s(%(b)s[i * %(stride_b)s])" %
                                          locals()))
    else:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
        loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
                                                 locals()))
    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
                           "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))

    ctype = gpuarray.dtype_to_ctype(dtype)
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        %(ctype)s red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
          red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
Example #27
0
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        dtype_out = node.outputs[0].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_y = numpy.dtype(dtype_y).itemsize
        itemsize_ind = numpy.dtype(dtype_ind).itemsize
        itemsize_out = numpy.dtype(dtype_out).itemsize
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        type_out = gpuarray.dtype_to_ctype(dtype_out)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """
/*
 * This is an atomicAdd that works for doubles since that is not provided
 * natively by CUDA.
 */
__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
    unsigned long long int* address_as_ull =
                                          (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

__device__ ga_double atomicExch(ga_double *address, ga_double val) {
    return atomicExch((unsigned long long int *)address,
                      __double_as_longlong(val));
}

/*
 * This is a version of atomicAdd that works for half-floats.  It may
 * read and write 2 bytes more than the size of the array if the array
 * has an uneven number of elements.  The actual value at that spot
 * will not be modified.
 */

__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, sum, new_;
  old = *base;
  do {
    assumed = old;
    sum = __float2half_rn(
      __half2float(val) +
      __half2float((ga_half)__byte_perm(old, 0,
                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, new_;
  old = *base;
  do {
    assumed = old;
    new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const int set_instead_of_inc,
                                      ga_int *err)
        {
             X = (%(type_x)s *)(((char *)X)+offset_X);
             Y = (%(type_y)s *)(((char *)Y)+offset_Y);
             indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % locals()
        params = [
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int',
            gpuarray.GpuArray
        ]
        return [
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var)
        ]
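For reference, a small NumPy sketch of what k_vector_add_fast computes per launch, as I read the kernel above (the helper name is hypothetical): one row of Y is added to, or stored into, the row of X selected by each index, with negative indices wrapped around.

import numpy as np

def vector_add_fast_reference(X, Y, indices, set_instead_of_inc):
    idx = np.asarray(indices)
    idx = np.where(idx < 0, idx + X.shape[0], idx)  # wrap negative row indices
    if set_instead_of_inc:
        X[idx] = Y                # with repeated indices, one write wins
    else:
        np.add.at(X, idx, Y)      # unbuffered add, safe for repeated indices
    return X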
Example #28
0
File: nnet.py Project: ballasn/Theano
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(work_x)
        type_b = gpuarray.dtype_to_ctype(work_b)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
            %(type_x)s* sm = sm_data + sms0 * row;

            %(type_x)s sum = 0.0;
            int row_max_j = 0;
            %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
            for (int j = 1; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                //todo: store to shared memory
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max   = (row_ij > row_max) ? row_ij : row_max;
            }
            //compute the exp
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
                                    %(load_b)s(b[j*bs0]);
                %(type_x)s sm_ij = exp(row_ij - row_max);
                sum += sm_ij;
                sm[j * sms1] = %(write_x)s(sm_ij);
            }
            %(type_x)s sum_inv = 1.0 / sum;
            for (int j = 0; j < N; ++j)
            {
                %(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
                __tmp *= sum_inv;
                sm[j * sms1] = %(write_x)s(__tmp);
            }
            if ((y_idx >= N) || (y_idx < 0))
            {
                //TODO: set raise an error bit in a global var?
                nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least...
            }
            else
            {
                nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1])
                           - %(load_b)s(b[y_idx*bs0])
                           + row_max
                           + log(sum));
            }
            am_data[row*ams0] = row_max_j;
          }
        }
        """ % locals(), file=sio)
        params = [
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp'
            ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
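For comparison, a minimal NumPy sketch (my own reference, with an assumed helper name) of the per-row computation k_xent_sm_1hot_bias performs: bias add, numerically stable softmax, negative log-likelihood via the log-sum-exp trick, and the row argmax.

import numpy as np

def xent_sm_1hot_bias_reference(x, b, y_idx):
    # x: (M, N) activations, b: (N,) bias, y_idx: (M,) integer targets.
    z = x + b
    row_max = z.max(axis=1, keepdims=True)      # subtract the max for stability
    e = np.exp(z - row_max)
    sm = e / e.sum(axis=1, keepdims=True)       # softmax per row
    # nll = -x[y] - b[y] + row_max + log(sum_j exp(z_j - row_max))
    nll = (-z[np.arange(len(y_idx)), y_idx]
           + row_max.ravel() + np.log(e.sum(axis=1)))
    am = z.argmax(axis=1)                       # row argmax
    return nll, sm, am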
Example #29
0
    def gpu_kernels(self, node, nodename):
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b"cuda": ".cu", b"opencl": ".cl"}[device_type]
        common_ext = {b"cuda": ".cuh", b"opencl": ".h"}[device_type]

        # prepare "$" macros
        if device_type == b"cuda":
            ndim = node.inputs[0].ndim
            dstv_strides_code = "".join(
                f"ssize_t dstv_strides_{i}, " for i in range(ndim)
            )
            dsti_strides_code = "".join(
                f"ssize_t dsti_strides_{i}, " for i in range(ndim)
            )
            src_strides_code = "".join(
                f"ssize_t src_strides_{i}, " for i in range(ndim)
            )
            set_slice_code = """
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n""".format(
                dstv="dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)"
                if self.return_values
                else "",
                dsti="dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)"
                if self.return_indices
                else "",
            )
            set_slice_code = "".join(set_slice_code % dict(i=j) for j in range(1, ndim))
            if self.return_values:
                set_slice_code += """
                dstv = ptr_add(dstv, dstv_offset);
                """
            if self.return_indices:
                set_slice_code += """
                dsti = ptr_add(dsti, dsti_offset);
                """
            set_slice_code += """
                src = ptr_add(src, src_offset);
            """
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims="".join(f"size_t dims_{i}, " for i in range(1, ndim)),
                dstv="INPUT_TYPE *dstv," if self.return_values else "",
                dstv_offset="size_t dstv_offset," if self.return_values else "",
                dsti="INDEX_TYPE *dsti," if self.return_indices else "",
                dsti_offset="size_t dsti_offset," if self.return_indices else "",
                dstv_strides=dstv_strides_code if self.return_values else "",
                dsti_strides=dsti_strides_code if self.return_indices else "",
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
            )
        elif device_type == b"opencl":
            raise NotImplementedError()

        # setup parameters
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.append(ga.SIZE)  # offset
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.append(ga.SIZE)  # offset
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(
            os.path.join(
                os.path.dirname(__file__), "c_code", "topk_common" + common_ext
            )
        ) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            with open(os.path.join(os.path.dirname(__file__), "c_code", fname)) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=(
                    "#include <cluda.h>\n"
                    + Template(common_src + kernel_src).substitute(**subs)
                ),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename,
            )
            return ker

        subs["count_t"] = "int"
        kernels.append(build_kernel("topk_dense" + kernel_ext, "k_topk_dense", subs))
        subs["kname"] = "k_topk_dense_large"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_large", subs)
        )
        subs["count_t"] = "long long"
        subs["kname"] = "k_topk_dense_xlarge"
        kernels.append(
            build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_xlarge", subs)
        )
        return kernels
Example #30
0
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        load_x = load_w(node.inputs[0].dtype)
        load_b = load_w(node.inputs[1].dtype)
        write_sm = write_w(node.outputs[0].dtype)
        work_sm = work_dtype(node.outputs[0].dtype)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        type_sm = gpuarray.dtype_to_ctype(dtype_sm)
        type_acc = gpuarray.dtype_to_ctype(work_sm)

        ctype = gpuarray.dtype_to_ctype(work_sm)

        params = [
            gpuarray.SIZE, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE,
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
        code = """
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N;
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]);
                    buf[tx] += %(load_b)s(b[tx * sb0]);
                    buf2[tx] = buf[tx];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = exp(buf2[__i] - row_max);
                    buf2[__i] = buf[__i];
                }
                local_barrier();
                {
                    // This function trashes buf[1..GA_WARP_SIZE],
                    // leaving the reduction result in buf[0].
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE)
                        {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
                    buf[__i] = buf2[__i] / row_sum;
                }
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]);
                }
                local_barrier();
            }
        }
        """ % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
        code = """
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
                       GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
        {
            GA_DECL_SHARED_BODY(%(type_acc)s, buf);
            x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x);
            b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b);
            sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm);
            for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){
                GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0];
                GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0];
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                        red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]));
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = max(buf[LID_0], buf[i]);
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]);
                        local_barrier();
                    }
                }
                %(ctype)s row_max = buf[0];
                local_barrier();
                {
                    // This function trashes buf[1..n_threads],
                    // leaving the reduction result in buf[0].
                    %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max);
                    #pragma unroll 16
                    for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) {
                    red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max);
                    }
                    buf[LID_0] = red;
                    local_barrier();
                    if (LID_0 < GA_WARP_SIZE) {
                        for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) {
                            buf[LID_0] = buf[LID_0] + buf[i];
                        }
                    }
                    local_barrier();
                    //reduce so that LID_0 0 has the reduction of everything
                    for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) {
                        if (LID_0 < _n && LID_0 + _n < N)
                            buf[LID_0] = buf[LID_0] + buf[LID_0+_n];
                        local_barrier();
                    }
                }
                %(ctype)s row_sum = buf[0];
                local_barrier();
                for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
                    sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
                }
                local_barrier();
            }
        }
        """ % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
def inline_reduce_fixed_shared(N,
                               buf,
                               x,
                               stride_x,
                               load_x,
                               pos,
                               count,
                               manner_fn,
                               manner_init,
                               b='',
                               stride_b='',
                               load_b='',
                               dtype='float32'):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    This function leaves the answer in position 0 of the buffer. The
    rest of the buffer is trashed by this function.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(dtype).
    x
        Input data.
    stride_x
        Input data stride.
    load_x
        Wrapper to read from x.
    pos
        Index of executing thread.
    count
        Number of executing threads.
    manner_fn
        A function that accepts two argument strings, a and b, and
        returns C code for their reduction, e.g.

          return "%(a)s + %(b)s"

        for a sum reduction.
    manner_init
        A function that accepts an argument string a and returns C
        code for its initialization.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the output.

    Notes
    -----
    `buf` should be in GPU shared memory, since we access it many times.

    """
    if b:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
                           " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" %
                           locals())
        loop_line = manner_fn(
            "red",
            manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
                        "%(load_b)s(%(b)s[i * %(stride_b)s])" % locals()))
    else:
        init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" %
                           locals())
        loop_line = manner_fn(
            "red",
            manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" % locals()))
    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
    r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos))

    ctype = gpuarray.dtype_to_ctype(dtype)
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        %(ctype)s red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) {
          red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize) {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
        }
        __syncthreads();
        //reduce so that %(pos)s 0 has the reduction of everything
        for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) {
          if (%(pos)s < _n && %(pos)s + _n < %(N)s)
            %(buf)s[%(pos)s] = %(r_n)s;
          __syncthreads();
        }
    }
    """ % locals()
Example #32
0
File: nnet.py Project: Abioy/Theano
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        f = '' if dtype_x == 'float64' else 'f'
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            %(type_x)s* sm = sm_data + sms0 * row;

            extern LOCAL_MEM %(work_x)s per_thread_values[];
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM int row_max_threadIdx;

            %(work_x)s per_thread_row_max, per_thread_sum;
            int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;

            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[threadIdx.x] = per_thread_row_max;

            local_barrier();

            if (threadIdx.x == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (int j = 0; j < blockDim.x; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }

            local_barrier();

            // The thread with the highest max writes out which of its
            // values was the winner.
            if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }

            per_thread_values[threadIdx.x] = per_thread_sum;

            local_barrier();

            if (threadIdx.x == 0) {
              sum = 0.0;
              for (int j = 0; j < blockDim.x; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }

            local_barrier();

            for (int j = threadIdx.x; j < N; j += blockDim.x) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }

            if (threadIdx.x == 0) {
              const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // raise some suspicion.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """ % locals(), file=sio)
        params = [
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp'
            ]
        return [Kernel(code=sio.getvalue(), name=kname, params=params,
                       flags=flags, objvar=k_var)]
def inline_softmax_fixed_shared(N,
                                buf,
                                x,
                                stride_x,
                                load_x,
                                sm,
                                sm_stride,
                                write_sm,
                                threadPos,
                                threadCount,
                                b='',
                                stride_b='',
                                load_b='',
                                dtype="float32"):
    """
    Generate code to perform softmax with a fixed amount of shared
    memory.

    On entry, `buf` is assumed to be empty.

    On exit, the normalized softmax has been written to `sm`; `buf` is
    used only as scratch space and its contents are trashed.

    Parameters
    ----------
    N
        Length of the buffer, at least warpSize (32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).
    x
        A ptr to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    load_x
        Wrapper to read from x.
    sm
        A ptr to the gpu memory to store the result.
    sm_stride
        The stride between each sm element.
    write_sm
        Wrapper before writing to sm.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the softmax's output if not float32.

    Notes
    -----
    `buf` should be in GPU shared memory, since we access it many times.

    We use tx as an int variable in a loop.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, threadPos,
                                       threadCount, b, stride_b, load_b,
                                       dtype),
        '__syncthreads()',
        ('%s row_max = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, threadPos,
                                   threadCount, lambda a, b: "%s + %s" %
                                   (a, b), lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b, load_b, dtype),
        '__syncthreads()',
        ('%s row_sum = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This sets all the values correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "  %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
            "            %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
            " / row_sum)" % locals()
        ]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
Example #34
0
File: nnet.py Project: Abioy/Theano
 def gpu_kernels(self, node, nodename):
     dtype_x = node.inputs[0].dtype
     dtype_b = node.inputs[1].dtype
     dtype_sm = node.outputs[0].dtype
     load_x = load_w(node.inputs[0].dtype)
     load_b = load_w(node.inputs[1].dtype)
     write_sm = write_w(node.outputs[0].dtype)
     work_sm = work_dtype(node.outputs[0].dtype)
     flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
     type_x = gpuarray.dtype_to_ctype(dtype_x)
     type_b = gpuarray.dtype_to_ctype(dtype_b)
     type_sm = gpuarray.dtype_to_ctype(dtype_sm)
     type_acc = gpuarray.dtype_to_ctype(work_sm)
     params = [
         'uintp', 'uintp',
         gpuarray.GpuArray, 'uintp', 'intp', 'intp',
         gpuarray.GpuArray, 'uintp', 'intp',
         gpuarray.GpuArray, 'uintp', 'intp', 'intp'
         ]
     kernels = []
     kname = "kSoftmaxWithBias"
     k_var = "kSoftmaxWithBias_" + nodename
     code = nvcc_kernel(
         kname,
         params=['const ga_size M', 'const ga_size N',
                 'const %s * x' % type_x, 'const ga_size offset_x',
                 'const ga_ssize sx0', 'const ga_ssize sx1',
                 'const %s * b' % type_b, 'const ga_size offset_b',
                 'const ga_ssize sb0',
                 '%s * sm' % type_sm, 'const ga_size offset_sm',
                 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
         body=["extern __shared__ %s buf[]" % type_acc,
               "%s * buf2 = buf + N" % type_acc,
               "x = (const %s *)(((char *)x)+offset_x)" % type_x,
               "b = (const %s *)(((char *)b)+offset_b)" % type_b,
               "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
               "for (int blockIDX = blockIdx.x; blockIDX < M;"
               "     blockIDX += gridDim.x){",
               "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
               "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
               "buf[tx] += %s(b[tx * sb0])" % load_b,
               "buf2[tx] = buf[tx]",
               "}",
               "__syncthreads()",
               inline_softmax('N', 'buf', 'buf2',
                              'threadIdx.x', 'blockDim.x', work_sm),
               "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
               "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
               "}",
               "__syncthreads()",
               "}",
               ])
     kernels.append(Kernel(code=code, name=kname, params=params,
                           flags=flags, objvar=k_var))
     kname = "kSoftmaxWithBias_fixed_shared"
     k_var = "kSoftmaxWithBias_fixed_shared" + nodename
     code = nvcc_kernel(
         kname,
         params=['const ga_size M', 'const ga_size N',
                 'const %s * x' % type_x, 'const ga_size offset_x',
                 'const ga_ssize sx0', 'const ga_ssize sx1',
                 'const %s * b' % type_b, 'const ga_size offset_b',
                 'const ga_ssize sb0',
                 '%s * sm' % type_sm, 'const ga_size offset_sm',
                 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
         body=["extern __shared__ %s buf[]" % type_acc,
               "x = (const %s *)(((char *)x)+offset_x)" % type_x,
               "b = (const %s *)(((char *)b)+offset_b)" % type_b,
               "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
               "for (int blockIDX = blockIdx.x; blockIDX < M;"
               "     blockIDX += gridDim.x){",
               "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
               "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
               inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                           load_x,
                                           'sm_ptr', 'sm_s1', write_sm,
                                           'threadIdx.x', 'blockDim.x',
                                           'b', 'sb0', load_b, work_sm),
               "__syncthreads()",
               "}",
               ])
     kernels.append(Kernel(code=code, name=kname, params=params,
                           flags=flags, objvar=k_var))
     return kernels
Example #35
0
File: nnet.py Project: yhhzsd/Theano
    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        type_intp = gpuarray.dtype_to_ctype(numpy.intp)
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        ctx = sub['params']
        k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",
                             GpuKernel_error(&%(k_var)s, err));
                %(fail)s;
            }
        """ % locals()
        sync = ""
        if config.gpuarray.sync:
            sync = """
            err = GpuArray_sync(&%(dx)s->ga);
            %(err_check)s
            """ % locals()
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
                                        PyGpuArray_DIMS(%(dnll)s)[0] :
                                        (ssize_t) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                           PyGpuArray_STRIDES(%(dnll)s)[0] :
                                           (ssize_t) 0);

        if ((PyGpuArray_NDIM(%(dnll)s) > 1)
            || (PyGpuArray_NDIM(%(sm)s) != 2)
            || (PyGpuArray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
                         %(dnll)s_dims0,
                         PyGpuArray_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_SetString(PyExc_ValueError,
                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(sm)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (PyGpuArray_DIMS(%(dx)s)[0] !=
                PyGpuArray_DIMS(%(sm)s)[0])
            || (PyGpuArray_DIMS(%(dx)s)[1] !=
                PyGpuArray_DIMS(%(sm)s)[1]))
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
                                 %(typecode_dx)s, GA_C_ORDER,
                                 %(ctx)s, Py_None);
            if (!%(dx)s) {
                %(fail)s
            }
        }
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
            ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
            ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
            ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
            ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
            ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
            ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
            void *kernel_params[] = {
                (void *)&PyGpuArray_DIMS(%(dx)s)[0],
                (void *)&PyGpuArray_DIMS(%(dx)s)[1],
                (void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
                (void *)&stride_DNLL0,
                (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
                (void *)&stride_SM0, (void *)&stride_SM1,
                (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
                (void *)&stride_YIDX0,
                (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
                (void *)&stride_DX0, (void *)&stride_DX1};
            int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
            %(err_check)s
            %(sync)s
        }
        assert(%(dx)s);
        """ % locals()
Example #36
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        dtype_out = node.outputs[0].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_y = numpy.dtype(dtype_y).itemsize
        itemsize_ind = numpy.dtype(dtype_ind).itemsize
        itemsize_out = numpy.dtype(dtype_out).itemsize
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        type_out = gpuarray.dtype_to_ctype(dtype_out)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """
/*
 * This is an atomicAdd that works for doubles, since it is not provided
 * natively by CUDA.
 */
__device__ double atomicAdd(ga_double* address, ga_double val) {
    unsigned long long int* address_as_ull =
                                          (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

/*
 * This is a version of atomicAdd that works for half-floats.  It may
 * read and write 2 bytes more than the size of the array if the array
 * has an odd number of elements.  The actual value at that spot
 * will not be modified.
 */

__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
  ga_uint old, assumed, sum, new_;
  old = *base;
  do {
    assumed = old;
    sum = __float2half_rn(
      __half2float(val) +
      __half2float((ga_half)__byte_perm(old, 0,
                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return (ga_half)__byte_perm(old, 0,
                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
}

        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      ga_int *err)
        {
             X = (%(type_x)s *)(((char *)X)+offset_X);
             Y = (%(type_y)s *)(((char *)Y)+offset_Y);
             indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % locals()
        params = [
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
            'uintp', 'intp', gpuarray.GpuArray, 'uintp', gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
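
The kernel above is an indexed row accumulation: every row of Y is atomically added into the row of X selected by indices (negative indices wrap around, out-of-range indices set the err flag). A rough NumPy equivalent, with illustrative names, is np.add.at, which also accumulates correctly when the same row index appears more than once:

import numpy as np

def vector_add_fast_reference(X, Y, indices):
    # Rough NumPy equivalent of k_vector_add_fast: X[indices[i], :] += Y[i, :]
    idx = np.where(indices < 0, indices + X.shape[0], indices)  # wrap negatives
    np.add.at(X, idx, Y)   # unbuffered scatter-add, mirrors the atomicAdd loop
    return X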
Example #37
File: nnet.py  Project: yhhzsd/Theano
    def gpu_kernels(self, node, nodename):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        work_x = work_dtype(dtype_x)
        work_b = work_dtype(dtype_b)
        load_x = load_w(dtype_x)
        load_b = load_w(dtype_b)
        write_x = write_w(dtype_x)
        write_b = write_w(dtype_b)
        flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_b = gpuarray.dtype_to_ctype(dtype_b)
        work_x = gpuarray.dtype_to_ctype(work_x)
        type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
        kname = "k_xent_sm_1hot_bias"
        k_var = "k_xent_sm_1hot_bias_" + nodename
        f = '' if dtype_x == 'float64' else 'f'
        sio = StringIO()
        print("""
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            const %(type_x)s* x_data, const ga_size offset_x,
            const ga_ssize xs0, const ga_ssize xs1,
            const %(type_b)s* b, const ga_size offset_b,
            const ga_ssize bs0,
            const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
            const ga_ssize y_idxs0,
            %(type_x)s* nll_data, const ga_size offset_nll,
            const ga_ssize nlls0,
            %(type_x)s* sm_data, const ga_size offset_sm,
            const ga_ssize sms0, const ga_ssize sms1,
            %(type_y_idx)s* am_data, const ga_size offset_am,
            const ga_ssize ams0)
        {
          x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
          b = (const %(type_b)s *)(((char *)b)+offset_b);
          y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
          nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
          sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
          am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);

          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const %(type_x)s* x = x_data + xs0 * row;
            %(type_x)s* sm = sm_data + sms0 * row;

            extern LOCAL_MEM %(work_x)s per_thread_values[];
            LOCAL_MEM %(work_x)s row_max, sum, sum_inv;
            LOCAL_MEM int row_max_threadIdx;

            %(work_x)s per_thread_row_max, per_thread_sum;
            int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmaxes
            per_thread_row_max = NAN;
            per_thread_row_max_j = 0;

            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max);
            }
            per_thread_values[threadIdx.x] = per_thread_row_max;

            local_barrier();

            if (threadIdx.x == 0) {
              row_max = NAN;
              row_max_threadIdx = 0;
              for (int j = 0; j < blockDim.x; j++)
              {
                %(work_x)s per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmax%(f)s(per_thread_max, row_max);
              }
            }

            local_barrier();

            // The thread with the highest max writes out which of its
            // values was the winner.
            if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX
            per_thread_sum = 0.0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]);
              %(work_x)s sm_ij = exp%(f)s(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = %(write_x)s(sm_ij);
            }

            per_thread_values[threadIdx.x] = per_thread_sum;

            local_barrier();

            if (threadIdx.x == 0) {
              sum = 0.0;
              for (int j = 0; j < blockDim.x; j++) {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }

            local_barrier();

            for (int j = threadIdx.x; j < N; j += blockDim.x) {
              sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv);
            }

            if (threadIdx.x == 0) {
              const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N || y_idx < 0)) {
                // y_idx is out of range: write 0 so the error is at least visible.
                nll_data[row * nlls0] = %(write_x)s(0.0);
              } else {
                nll_data[row * nlls0] = %(write_x)s(
                   - %(load_x)s(x[y_idx * xs1])
                   - %(load_b)s(b[y_idx * bs0])
                   + row_max + log%(f)s(sum));
              }
            }
          }
        }
        """ % locals(),
              file=sio)
        params = [
            'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp',
            gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp',
            'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray,
            'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp'
        ]
        return [
            Kernel(code=sio.getvalue(),
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var)
        ]
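
For reference, the fused kernel above produces three outputs per row: the softmax of x + b, the argmax of x + b, and the negative log-likelihood at the target index y_idx, using the identity nll = -(x[y] + b[y]) + row_max + log(sum of exponentials). A minimal NumPy sketch of the same computation (illustrative names, not the generated code):

import numpy as np

def xent_sm_1hot_bias_reference(x, b, y_idx):
    # Rough NumPy equivalent of k_xent_sm_1hot_bias (one row per GPU block).
    z = x + b
    row_max = z.max(axis=1, keepdims=True)
    e = np.exp(z - row_max)                     # stabilized exponentials
    s = e.sum(axis=1, keepdims=True)
    sm = e / s                                   # softmax output
    am = z.argmax(axis=1)                        # argmax output
    rows = np.arange(x.shape[0])
    nll = -z[rows, y_idx] + row_max[:, 0] + np.log(s[:, 0])
    return nll, sm, am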