def c_code(self, node, name, inp, out, sub):
    ndim = len(inp)
    zz = out[0]
    fail = sub['fail']
    code = ["""
int i;
size_t shape[%(ndim)s];
""" % dict(ndim=ndim)]
    for i, shp_i in enumerate(inp):
        code.append("""
shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
""" % dict(i=i, shp_i=shp_i))
    code.append("""
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
                       pygpu_default_context())) {
  %(fail)s
}
""" % dict(zz=zz, ndim=ndim,
           type=gpuarray.dtype_to_typecode(self.dtype),
           fail=fail))
    return ''.join(code)

def c_code(self, node, name, inp, out, sub):
    ndim = len(inp)
    zz = out[0]
    fail = sub["fail"]
    code = [
        """
int i;
size_t shape[%(ndim)s];
"""
        % dict(ndim=ndim)
    ]
    for i, shp_i in enumerate(inp):
        code.append(
            """
shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
"""
            % dict(i=i, shp_i=shp_i)
        )
    code.append(
        """
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
                       %(ctx)s)) {
  %(fail)s
}
"""
        % dict(zz=zz, ndim=ndim,
               type=gpuarray.dtype_to_typecode(self.dtype),
               fail=fail, ctx=sub["params"])
    )
    return "".join(code)

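# Note on the pair above: the older c_code hard-codes pygpu_default_context(),
# while the rewrite expands sub["params"], i.e. the context supplied by the
# op's params_type, so each op can target an explicitly chosen GPU. A minimal
# sketch of the Python side that supplies that parameter, mirroring the
# get_params/get_context pattern that appears later in this section
# (_params_sketch is a hypothetical helper, not part of the original code):

def _params_sketch(op, node):
    # op.context_name must name a registered context, or get_context raises.
    from pygpu.gpuarray import dtype_to_typecode
    from theano.gpuarray.type import get_context
    return op.params_type.get_params(
        typecode=dtype_to_typecode(op.dtype),
        context=get_context(op.context_name))
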
def __init__(self, dtype, broadcastable, name=None):
    # In case this was not provided and no global value is available
    self.dtype = str(dtype)
    self.broadcastable = tuple(bool(b) for b in broadcastable)
    self.ndim = len(self.broadcastable)
    self.name = name
    try:
        self.typecode = gpuarray.dtype_to_typecode(self.dtype)
    except gpuarray.GpuArrayException:
        raise TypeError("Unsupported dtype for %s: %s" %
                        (self.__class__.__name__, self.dtype))

def __init__(self, dtype, broadcastable, context_name=None, name=None):
    # In case this was not provided and no global value is available
    self.dtype = str(dtype)
    self.broadcastable = tuple(bool(b) for b in broadcastable)
    self.ndim = len(self.broadcastable)
    self.name = name
    self.context_name = context_name
    # This will check that the passed context name is valid and registered.
    get_context(self.context_name)
    try:
        self.typecode = gpuarray.dtype_to_typecode(self.dtype)
    except gpuarray.GpuArrayException:
        raise TypeError("Unsupported dtype for %s: %s" %
                        (self.__class__.__name__, self.dtype))

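# A quick usage sketch for the extended constructor above, assuming this class
# is GpuArrayType and that None names the default registered context; an
# unregistered context_name would make get_context raise inside __init__:

matrix_type = GpuArrayType('float32', (False, False),
                           context_name=None, name='gpu_matrix')
assert matrix_type.ndim == 2
assert matrix_type.broadcastable == (False, False)
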
def m(t):
    if t == gpuarray.GpuArray:
        return "GA_BUFFER"
    else:
        return str(gpuarray.dtype_to_typecode(t))

def get_op_params(self):
    return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]

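# The (name, value) pairs returned by get_op_params above are turned into C
# macros for the op's generated/support code, i.e. roughly:
#
#     #define TYPECODE <int>   /* dtype_to_typecode(self.dtype) */
#
# so the C side can treat the dtype's typecode as a compile-time constant
# rather than a runtime parameter (the exact macro mechanics are assumed here).
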
def c_code(self, node, name, inputs, outputs, sub):
    nd = node.outputs[0].ndim
    fail = sub["fail"]
    initial_dims = ','.join('1' for i in range(nd))
    opname = str(self.scalar_op)

    # check that all inputs have valid dimensions
    emitted_inames = {}
    code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
    if nd > 0:
        code += """
        size_t dims[%(nd)s] = {%(initial_dims)s};
        """ % locals()
    else:
        code += """
        size_t *dims = NULL;
        """
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            assert emitted_inames[iname] is node.inputs[idx]
            continue

        broadcasts = map(int, node.inputs[idx].broadcastable)
        broadcasts = ', '.join(map(str, broadcasts))
        nd = node.inputs[idx].ndim
        if nd > 0:
            code += """
        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
        """ % locals()
        else:
            code += """
        int *broadcasts_%(iname)s = NULL;
        """ % locals()
        emitted_inames[iname] = node.inputs[idx]

    # check that all inputs have valid dimensions
    emitted_inames = {}
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            continue
        code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
        """ % locals()
        emitted_inames[iname] = True

    # check that all outputs have valid dimensions
    for idx, oname in enumerate(outputs):
        typecode = dtype_to_typecode(node.outputs[idx].dtype)
        if idx not in self.inplace_pattern.keys():
            code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                                    %(typecode)s, GA_C_ORDER,
                                    pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                //TODO, this check don't seam good.
                //TODO, set exception?
                %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        else:
            input_idx = self.inplace_pattern[idx]
            iname = inputs[input_idx]
            code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()

    z = outputs[0]
    code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                            (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

        //std::cerr << "calling callkernel returned\\n";
        """ % locals()

    code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
    param = []
    for i in range(nd):
        param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], i=i))
    for n, (name, var) in enumerate(zip(inputs + outputs,
                                        node.inputs + node.outputs)):
        if (n - len(inputs)) in self.inplace_pattern:
            continue
        dtype = dtype_to_ctype(var.dtype)
        param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
        param.append("%(name)s->ga.offset" % locals())
        for i in range(nd):
            param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : "
                         "PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
    code += ',\n'.join(param) + ");\n"
    if config.gpuarray.sync:
        # zz was previously undefined here; z (outputs[0]) is what is synced.
        code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=z)
    return str(code)

def c_code(self, node, nodename, inps, outs, sub):
    context = node.inputs[0].type.context
    if context.kind != b'cuda':
        raise NotImplementedError('%s: We only have CUDA '
                                  'implementation so far.' %
                                  self.__class__.__name__)
    x, k = inps
    inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
    if not self.return_indices:
        yv, = outs
    elif self.return_values:
        yv, yi = outs
    else:
        yi, = outs
    out_dtype_s = self.idx_dtype
    out_dtc = ga.dtype_to_typecode(out_dtype_s)
    fail = sub['fail']
    ctx = sub['params']
    k_dtype = node.inputs[1].type.dtype_specs()[1]
    # max threads per block
    MAX_TPB = context.maxlsize
    # max blocks per grid
    MAX_BPG = context.maxgsize0
    WARP_SIZE = 32

    ndim = node.inputs[0].ndim
    reordered_axes = list(range(ndim))
    axis = self.axis % ndim
    del reordered_axes[axis]
    reordered_axes = [axis] + reordered_axes
    dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
    prep_output = ''

    if self.return_values:
        def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
        params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (
            yv, yv)
        params_dv += ''.join('(void*)(dvstrides+%d), ' % i
                             for i in reordered_axes)
        prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
    else:
        def_dvstrides = params_dv = ''

    if self.return_indices:
        def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
        # was "% yi", which would raise at runtime (two %s placeholders)
        params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (
            yi, yi)
        params_di += ''.join('(void*)(distrides+%d), ' % i
                             for i in reordered_axes)
        prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
    else:
        def_distrides = params_di = ''

    sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes)

    code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s,
            1, &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s,
            1, &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s,
            1, &grid_size, &block_size, 0, args);
    }

    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
'''
    return code % locals()

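# A rough Python restatement of the launch-geometry choice made by the C code
# above, for clarity (illustrative only: max_tpb stands in for
# context.maxlsize, and the returned variant names correspond to the
# k_topk_dense / _large / _xlarge kernels):

def _topk_launch_sketch(dims, axis, warp_size=32, max_tpb=1024):
    grid_size = 1
    for i, d in enumerate(dims):
        if i != axis:
            grid_size *= d  # one block per slice along `axis`
    # round the reduced dimension up to a whole number of warps
    block_size = ((dims[axis] + warp_size - 1) // warp_size) * warp_size
    if dims[axis] > (1 << 31):
        return 'xlarge', grid_size, max_tpb
    if block_size > max_tpb:
        return 'large', grid_size, max_tpb
    return 'dense', grid_size, block_size
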
def c_code(self, node, nodename, inps, outs, sub):
    context = node.inputs[0].type.context
    if context.kind != b"cuda":
        raise NotImplementedError("%s: We only have CUDA "
                                  "implementation so far." %
                                  self.__class__.__name__)
    x, k = inps
    inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
    if not self.return_indices:
        (yv,) = outs
    elif self.return_values:
        yv, yi = outs
    else:
        (yi,) = outs
    out_dtype_s = self.idx_dtype
    out_dtc = ga.dtype_to_typecode(out_dtype_s)
    fail = sub["fail"]
    ctx = sub["params"]
    k_dtype = node.inputs[1].type.dtype_specs()[1]
    # max threads per block
    MAX_TPB = context.maxlsize0
    # max blocks per grid
    MAX_BPG = context.maxgsize0
    WARP_SIZE = 32

    ndim = node.inputs[0].ndim
    reordered_axes = list(range(ndim))
    axis = self.axis % ndim
    del reordered_axes[axis]
    reordered_axes = [axis] + reordered_axes
    dims = "".join("dims[%d], " % i for i in reordered_axes[1:])
    prep_output = ""

    if self.return_values:
        def_dvstrides = "const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)" % yv
        params_dv = "%s->ga.data, %s->ga.offset,\n" % (yv, yv)
        params_dv += "".join("dvstrides[%d], " % i for i in reordered_axes)
        prep_output += ("""
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n""" % locals())
    else:
        def_dvstrides = params_dv = ""

    if self.return_indices:
        def_distrides = "const ssize_t *distrides = PyGpuArray_STRIDES(%s)" % yi
        params_di = "%s->ga.data, %s->ga.offset,\n" % (yi, yi)
        params_di += "".join("distrides[%d], " % i for i in reordered_axes)
        prep_output += ("""
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n""" % locals())
    else:
        def_distrides = params_di = ""

    sstrides = ", ".join("sstrides[%d]" % i for i in reordered_axes)

    code = """
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_xlarge_call(
            1, &grid_size, &block_size, 0,
            %(dims)s
            %(params_dv)s
            %(params_di)s
            k_,
            %(x)s->ga.data,
            %(x)s->ga.offset,
            %(sstrides)s,
            dims[%(axis)d]
        );
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_large_call(
            1, &grid_size, &block_size, 0,
            %(dims)s
            %(params_dv)s
            %(params_di)s
            k_,
            %(x)s->ga.data,
            %(x)s->ga.offset,
            %(sstrides)s,
            dims[%(axis)d]
        );
    } else {
        err = k_topk_dense_call(
            1, &grid_size, &block_size, 0,
            %(dims)s
            %(params_dv)s
            %(params_di)s
            k_,
            %(x)s->ga.data,
            %(x)s->ga.offset,
            %(sstrides)s,
            dims[%(axis)d]
        );
    }

    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
"""
    return code % locals()

def c_code(self, node, name, inp, out, sub):
    if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
        # We special case the no-reduction case since the gpu
        # kernel has trouble handling it.
        return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            %(fail)s
        }
        if (%(sync)d)
            GpuArray_sync(&%(out)s->ga);
        """ % dict(out=out[0], inp=inp[0], fail=sub['fail'],
                   sync=bool(config.gpuarray.sync))
    k = self.get_kernel_cache(node)
    _, src, _, ls = k._get_basic_kernel(k.init_local_size,
                                        node.inputs[0].ndim)
    if self.axis is None:
        redux = [True] * node.inputs[0].ndim
    else:
        redux = self.redux
    acc_dtype = getattr(self, 'acc_dtype', None)
    if acc_dtype is None:
        acc_dtype = node.outputs[0].type.dtype
    input = inp[0]
    output = out[0]
    nd_out = node.outputs[0].ndim
    code = """
        size_t gs = 1;
        unsigned int n = 1;
        unsigned int proxy_dim[%(nd_in)s];
        unsigned int proxy_off;
        int proxy_str[%(nd_in)s];
        void *args[%(n_args)s];
        PyGpuArrayObject *tmp;
        int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)

    if nd_out != 0:
        code += """
        size_t out_dims[%(nd_out)s];
        int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
        j = 0
        for i in range(node.inputs[0].ndim):
            if not self.redux[i]:
                code += """
        out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
        if (!need_out)
            need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
                j += 1
        code += """
        if (need_out) {
            %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s,
                                     GA_C_ORDER, pygpu_default_context(),
                                     Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))
    else:
        code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
                                     pygpu_default_context(), Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
""" % dict(output=output, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))

    if acc_dtype != node.outputs[0].type.dtype:
        code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
                          %(acc_type)s, GA_C_ORDER,
                          pygpu_default_context(), Py_None);
        if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'],
           acc_type=dtype_to_typecode(acc_dtype))
    else:
        code += """
        tmp = %(output)s;
        Py_INCREF(tmp);
""" % dict(output=output)

    # We need the proxies since we are passing a pointer to the
    # data into the call and therefore we need a real copy of the
    # data in the proper type.
    code += """
        args[0] = &n;
        args[1] = &tmp->ga;
"""

    p = 2
    for i in range(node.inputs[0].ndim):
        code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
        args[%(p)s] = &proxy_dim[%(i)s];
        n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
        p += 1
        if not redux[i]:
            code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(
                input=input, i=i)

    code += """
        args[%(p)s] = &%(input)s->ga;
        proxy_off = %(input)s->ga.offset;
        args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
    p += 2

    for i in range(node.inputs[0].ndim):
        code += """
        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
        args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
        p += 1

    code += """
        if (gs == 0) gs = 1;
        n /= gs;
        err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "compyte error: GpuCAReduce: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
            %(fail)s
        }

        if (%(cast_out)d) {
            err = GpuArray_move(&%(output)s->ga, &tmp->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "compyte error: GpuCAReduce [cast]: %%s.",
                             GpuArray_error(&tmp->ga, err));
                %(fail)s
            }
        } else {
            Py_XDECREF(%(output)s);
            %(output)s = tmp;
        }

        if (%(sync)d)
            GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name),
           sync=bool(config.gpuarray.sync), ls=ls, fail=sub['fail'],
           output=output, input=input,
           cast_out=bool(acc_dtype != node.outputs[0].type.dtype))

    return code

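# For reference, the args[] vector assembled above has this fixed layout
# (n_args = 4 + 2*nd, matching the declaration emitted at the top of the
# generated C; nd is the input's number of dimensions):
#
#   args[0]               &n              elements reduced per output element
#   args[1]               &tmp->ga        output buffer (accumulator-typed)
#   args[2 .. 2+nd)       &proxy_dim[i]   input dimensions
#   args[2+nd]            &input->ga      input buffer
#   args[3+nd]            &proxy_off      input offset
#   args[4+nd .. 4+2nd)   &proxy_str[i]   input strides
#
# The proxy_* copies exist because GpuKernel_call receives pointers, so each
# value must live in a variable of exactly the C type the kernel expects.
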
def get_params(self, node):
    from pygpu.gpuarray import dtype_to_typecode
    return self.params_type.get_params(
        typecode=dtype_to_typecode(self.dtype),
        context=get_context(self.context_name))

def get_op_params(self):
    from pygpu.gpuarray import dtype_to_typecode
    return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]

def c_code(self, node, name, inputs, outputs, sub):
    nd = node.outputs[0].ndim
    fail = sub["fail"]
    initial_dims = ','.join('1' for i in range(nd))
    opname = str(self.scalar_op)

    # check that all inputs have valid dimensions
    emitted_inames = {}
    code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
    if nd > 0:
        code += """
        size_t dims[%(nd)s] = {%(initial_dims)s};
        """ % locals()
    else:
        code += """
        size_t *dims = NULL;
        """
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            assert emitted_inames[iname] is node.inputs[idx]
            continue

        broadcasts = map(int, node.inputs[idx].broadcastable)
        broadcasts = ', '.join(map(str, broadcasts))
        nd = node.inputs[idx].ndim
        if nd > 0:
            code += """
        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
        """ % locals()
        else:
            code += """
        int *broadcasts_%(iname)s = NULL;
        """ % locals()
        emitted_inames[iname] = node.inputs[idx]

    # check that all inputs have valid dimensions
    emitted_inames = {}
    for idx, iname in enumerate(inputs):
        if iname in emitted_inames:
            continue
        code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
        """ % locals()
        emitted_inames[iname] = True

    # check that all outputs have valid dimensions
    for idx, oname in enumerate(outputs):
        typecode = dtype_to_typecode(node.outputs[idx].dtype)
        if idx not in self.inplace_pattern.keys():
            code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                                    %(typecode)s, GA_C_ORDER,
                                    pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                //TODO, this check don't seam good.
                //TODO, set exception?
                %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        else:
            input_idx = self.inplace_pattern[idx]
            iname = inputs[input_idx]
            code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()

    z = outputs[0]
    code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                            (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

        //std::cerr << "calling callkernel returned\\n";
        """ % locals()

    code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
    param = []
    for i in range(nd):
        param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], i=i))
    for n, (name, var) in enumerate(zip(inputs + outputs,
                                        node.inputs + node.outputs)):
        if (n - len(inputs)) in self.inplace_pattern:
            continue
        dtype = var.dtype
        param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
        param.append("%(name)s->ga.offset" % locals())
        for i in range(nd):
            param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : "
                         "PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
    code += ',\n'.join(param) + ");\n"
    if config.gpuarray.sync:
        # zz was previously undefined here; z (outputs[0]) is what is synced.
        code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=z)
    return str(code)

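# Python sketch of the <<<n_blocks, threads_per_block>>> sizing used by both
# elemwise c_code variants above. The constants come straight from the
# generated C: start with one warp of 32 threads, cap at 30 blocks (a
# hard-coded multiprocessor count), then widen to at most 256 threads/block:

def _elemwise_launch_sketch(num_els):
    threads_per_block = min(num_els, 32)
    n_blocks = min(num_els // threads_per_block +
                   (1 if num_els % threads_per_block else 0), 30)
    if threads_per_block * n_blocks < num_els:
        threads_per_block = min(num_els // n_blocks, 256)
    return n_blocks, threads_per_block
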