def c_code(self, node, nodename, inp, out, sub):
    x, a, b, l, s = inp
    y, = out
    fail = sub['fail']
    function_name = "TimCropper_%(nodename)s" % locals()
    ndim_spatial = len(self.patch_shape)
    ndim_total = 2 + ndim_spatial
    strings = []

    # check inputs
    strings.append("""
    if ($x->nd != $ndim_total)
    {
        PyErr_SetString(PyExc_ValueError,
                        "TimCropper: first input must have $ndim_total dimensions");
        $fail;
    }
    """)
    for i, var in enumerate((a, b, l, s)):
        strings.append("""
        if (%(var)s->nd != 2)
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have 2 dimensions");
            $fail;
        }
        if (CudaNdarray_HOST_DIMS(%(var)s)[0] != CudaNdarray_HOST_DIMS($x)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have shape[0] equal to batch size");
            $fail;
        }
        if (CudaNdarray_HOST_DIMS(%(var)s)[1] != $ndim_spatial)
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have shape[1] equal to number of spatial dimensions ($ndim_spatial)");
            $fail;
        }
        """ % dict(var=var, i=1 + i))

    # allocate output
    strings.append("""
    int ydims[$ndim_total];
    """)
    for i in (0, 1):
        strings.append("ydims[%i] = CudaNdarray_HOST_DIMS($x)[%i];" % (i, i))
    for i, dim in enumerate(self.patch_shape):
        strings.append("ydims[2 + %i] = %i;" % (i, dim))
    strings.append(
        """
    if ((NULL == $y) || """
        + " || ".join("(CudaNdarray_HOST_DIMS($y)[%i] != ydims[%i])" % (i, i)
                      for i in range(ndim_total))
        + """)
    {
        Py_XDECREF($y);
        $y = (CudaNdarray*)CudaNdarray_New();
        if ((NULL == $y) || CudaNdarray_alloc_contiguous($y, $ndim_total, ydims))
        {
            Py_XDECREF($y);
            $y = NULL;
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: output allocation failed");
            $fail;
        }
    }
    """)

    # due to separability we need to compute weights only for patch
    # row/image row and patch col/image col pairs, instead of for
    # the full cartesian product of patch pixel/image pixel pairs.
    # we precompute the weights in a separate pass.
    # (a NumPy reference sketch of this idea appears at the end of this file.)
    weightpass_call = common.weightpass_call(
        nodename, patch_shape=self.patch_shape,
        V=x, l=l, s=s, fail=fail, grad=False)

    # launch kernel
    W = "W"  # so call_arguments knows its name
    arguments = common.call_arguments("x y W".split())
    gridblock = common.gridblock(ndim_spatial, "ydims")
    strings.append("""
    {
        $weightpass_call
        $gridblock
        $function_name<<<grid, block>>>(""" + arguments + """);
        CNDA_THREAD_SYNC;
        cudaError_t err = cudaGetLastError();
        if (cudaSuccess != err)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: %s: %s. (grid: %i x %i;"
                         " block: %i x %i x %i)\\n",
                         "$function_name",
                         cudaGetErrorString(err),
                         grid.x, grid.y,
                         block.x, block.y, block.z);
            $fail;
        }

        // free weight storage
        Py_XDECREF(W);
        W = NULL;
    }""")

    from string import Template
    # %-formatting above filled in the per-input names; Template now resolves
    # the remaining $ placeholders from this method's local variables.
    return Template("\n".join(strings)).substitute(locals())
def c_code(self, node, nodename, inp, out, sub):
    dCdy, x, a, b, l, s = inp
    dydl, dyds = out
    fail = sub['fail']
    function_name = "TimCropperGrad_%s" % nodename
    ndim_spatial = len(self.patch_shape)
    ndim_total = 2 + ndim_spatial
    strings = []

    # check inputs
    for i, var in enumerate([dCdy, x]):
        strings.append("""
        if (%(var)s->nd != $ndim_total)
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have $ndim_total dimensions");
            $fail;
        }
        """ % dict(var=var, i=i))
    for i, var in enumerate((a, b, l, s)):
        strings.append("""
        if (%(var)s->nd != 2)
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have 2 dimensions");
            $fail;
        }
        if (CudaNdarray_HOST_DIMS(%(var)s)[0] != CudaNdarray_HOST_DIMS($x)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have shape[0] equal to batch size");
            $fail;
        }
        if (CudaNdarray_HOST_DIMS(%(var)s)[1] != $ndim_spatial)
        {
            PyErr_SetString(PyExc_ValueError,
                            "TimCropper: %(i)sth input must have shape[1] equal to number of spatial dimensions ($ndim_spatial)");
            $fail;
        }
        """ % dict(var=var, i=2 + i))

    # allocate outputs
    strings.append("""
    int outdims[$ndim_total + 1];
    outdims[$ndim_total] = $ndim_spatial;
    """)
    for i in range(ndim_total):
        strings.append("outdims[%i] = CudaNdarray_HOST_DIMS($dCdy)[%i];" % (i, i))
    for var in "ls":
        strings.append(
            ("""
        if ((NULL == %(dname)s) || """
             + " || ".join("(CudaNdarray_HOST_DIMS(%%(dname)s)[%i] != outdims[%i])" % (i, i)
                           for i in range(ndim_total + 1))
             + """)
        {
            Py_XDECREF(%(dname)s);
            %(dname)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(dname)s) || CudaNdarray_alloc_contiguous(%(dname)s, $ndim_total + 1, outdims))
            {
                Py_XDECREF(%(dname)s);
                %(dname)s = NULL;
                PyErr_SetString(PyExc_ValueError,
                                "TimCropperGrad: allocation of output %(dlabel)s failed");
                $fail;
            }
        }
        """) % dict(name=locals()[var],
                    dname=locals()["dyd" + var],
                    dlabel="dyd" + var))

    # launch kernel
    arguments = []
    for var in "dydl dyds dCdy x".split():
        arguments.append("CudaNdarray_SIZE($%s)" % var)
        arguments.append("CudaNdarray_DEV_DATA($%s)" % var)
        arguments.append("CudaNdarray_DEV_DIMS($%s)" % var)
        arguments.append("CudaNdarray_DEV_STRIDES($%s)" % var)
    arguments.extend("CudaNdarray_DEV_DATA($%s)" % var for var in "abls")
    gridblock = common.gridblock(ndim_spatial, "CudaNdarray_HOST_DIMS(%s)" % dCdy)
    strings.append("""
    {
        printf("enter grad op\\n");
        $gridblock
        $function_name<<<grid, block>>>(""" + ", \n".join(arguments) + """);
        CNDA_THREAD_SYNC;
        cudaError_t err = cudaGetLastError();
        if (cudaSuccess != err)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "Cuda error: %s: %s. (grid: %i x %i;"
                         " block: %i x %i x %i)\\n",
                         "$function_name",
                         cudaGetErrorString(err),
                         grid.x, grid.y,
                         block.x, block.y, block.z);
            $fail;
        }
        printf("exit grad op\\n");
    }""")

    from string import Template
    # %-formatting above filled in the per-input names; Template now resolves
    # the remaining $ placeholders from this method's local variables.
    return Template("\n".join(strings)).substitute(locals())
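
# The block below is a minimal NumPy reference sketch of the separable
# weighting idea the ops above rely on: one (patch_dim, image_dim) weight
# matrix is built per spatial axis and applied along that axis, instead of a
# full (patch_pixel, image_pixel) matrix.  The Gaussian parameterisation of
# the weights (centers from `l`, widths from `s`) and the helper name
# `_reference_separable_crop` are assumptions for illustration only; the
# actual weights are computed on the GPU by common.weightpass_call.
import numpy as np


def _reference_separable_crop(x, l, s, patch_shape):
    """Crop a single (channels, rows, cols) image with separable weights (2-D case)."""
    weights = []
    for axis, patch_dim in enumerate(patch_shape):
        image_dim = x.shape[1 + axis]
        # hypothetical layout: patch pixel centers in image coordinates
        centers = l[axis] + s[axis] * (np.arange(patch_dim) - (patch_dim - 1) / 2.0)
        pixels = np.arange(image_dim)
        # one Gaussian weight matrix per axis, rows normalised over image pixels
        w = np.exp(-0.5 * ((pixels[None, :] - centers[:, None]) / s[axis]) ** 2)
        w /= w.sum(axis=1, keepdims=True) + 1e-8
        weights.append(w)  # shape (patch_dim, image_dim)
    wr, wc = weights
    # y[c] = Wr @ x[c] @ Wc^T for each channel c
    return np.einsum('pi,cij,qj->cpq', wr, x, wc)

# usage, e.g.:
# patch = _reference_separable_crop(np.random.rand(3, 64, 64),
#                                   l=(32.0, 32.0), s=(2.0, 2.0),
#                                   patch_shape=(16, 16))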