def test_params_type_with_enums(self): # Test that we fail if we create a params type with common enum names inside different enum types. try: ParamsType(enum1=EnumList('A', 'B', 'C'), enum2=EnumList('A', 'B', 'F')) except AttributeError: pass else: raise Exception('ParamsType should fail with common enum names inside different enum types.') # Test that we fail if we create a params type with common names in both aliases and constants. try: ParamsType(enum1=EnumList(('A', 'a'), ('B', 'b')), enum2=EnumList(('ONE', 'a'), ('TWO', 'two'))) except AttributeError: ParamsType(enum1=EnumList(('A', 'a'), ('B', 'b')), enum2=EnumList(('ONE', 'one'), ('TWO', 'two'))) else: raise Exception('ParamsType should fail when there are aliases with same names as some constants.') # Test that we can access enum values through wrapper directly. w = ParamsType(enum1=EnumList('A', ('B', 'beta'), 'C'), enum2=EnumList(('D', 'delta'), 'E', 'F')) assert w.A == 0 and w.B == 1 and w.C == 2 assert w.D == 0 and w.E == 1 and w.F == 2 # Test constants access through aliases. assert w.enum_from_alias('beta') == w.B assert w.enum_from_alias('delta') == w.D assert w.enum_from_alias('C') == w.C # C is not an alias, so it should return a constant named C. # Test that other regular wrapper attributes are still available. assert len(w.fields) == len(w.types) == w.length assert w.name
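# Illustrative sketch (not part of the test suite) of the pattern exercised
# above, reduced to its core: constants from every EnumList become wrapper
# attributes, and enum_from_alias() falls back to constant names when no
# alias matches. Import paths are assumptions based on where these classes
# live in this code base.
def _example_enum_access():
    from theano.gof.params_type import ParamsType
    from theano.gof.type import EnumList
    pt = ParamsType(choice=EnumList('STOP', ('GO', 'go')))
    assert pt.STOP == 0 and pt.GO == 1
    assert pt.enum_from_alias('go') == pt.GO        # resolved via the alias
    assert pt.enum_from_alias('STOP') == pt.STOP    # no alias: constant name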
class GpuAdvancedIncSubtensor1(Op):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    """
    _f16_ok = True
    __props__ = ('inplace', 'set_instead_of_inc')
    params_type = ParamsType(inplace=bool_t,
                             set_instead_of_inc=bool_t,
                             context=gpu_context_type,
                             # following params are used in c_init_code_struct(),
                             # as inputs are not available in that function.
                             ndim_input_0=size_t,
                             ndim_input_1=size_t,
                             typecode_input_0=int_t,
                             typecode_input_1=int_t)

    def __init__(self, inplace=False, set_instead_of_inc=False):
        self.inplace = inplace
        self.set_instead_of_inc = set_instead_of_inc
        if inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(
            inplace=True,
            set_instead_of_inc=self.set_instead_of_inc)

    def make_node(self, x, y, ilist):
        ctx_name = infer_context_name(x, y)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype not in tensor.integer_dtypes:
            raise TypeError('index must be integers')
        if ilist_.type.ndim != 1:
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = 'set'
            else:
                opname = 'increment'
            raise TypeError('cannot %s x subtensor with ndim=%s'
                            ' by y with ndim=%s' % (
                                opname, x_.type.ndim, y_.type.ndim))

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def get_params(self, node):
        return self.params_type.get_params(
            self,
            context=node.outputs[0].type.context,
            # following params are used in c_init_code_struct().
            ndim_input_0=node.inputs[0].ndim,
            ndim_input_1=node.inputs[1].ndim,
            typecode_input_0=node.inputs[0].type.typecode,
            typecode_input_1=node.inputs[1].type.typecode)

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, params=None):
        # TODO opt to make this inplace
        x, y, idx = inp
        out, = out_

        if not self.inplace:
            x = x.copy()

        out[0] = x

        if len(idx) == 0:
            return

        # Make sure idx is not a GpuArray otherwise we cannot use its
        # content to index x and y (This is because we serve as
        # fallback for _dev20).
        if isinstance(idx, gpuarray.GpuArray):
            idx = np.asarray(idx)

        # If `y` has as many dimensions as `x`, then we want to iterate
        # jointly on `x` and `y`. Otherwise, it means `y` should be
        # broadcasted to fill all relevant rows of `x`.
        if y.ndim == x.ndim and y.shape[0] != 1:
            assert len(y) == len(idx)
            if self.set_instead_of_inc:
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=True)
        else:
            if y.ndim == x.ndim:
                # First dim is always 1 in this case.
                reshaped_y = y.reshape(y.shape[1:])
            else:
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1,) * nb_dims_to_add + y.shape)

            if self.set_instead_of_inc:
                for i in idx:
                    x[i] = reshaped_y
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
                '<gpuarray/elemwise.h>', 'gpuarray_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_support_code_struct(self, node, nodename):
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(params)s->typecode_input_0;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(params)s->typecode_input_1;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
                               2, args, %(params)s->ndim_input_1,
                               GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError,
                          "Could not initialize inplace add support");
          %(fail)s
        }
        """ % dict(params=sub['params'], fail=sub['fail'])

    def c_code(self, node, name, inputs, outputs, sub):
        if (node.inputs[0].ndim != node.inputs[1].ndim):
            raise NotImplementedError("This case does not have C code yet.")

        return """
        PyGpuArrayObject *row_x, *row_y;
        size_t nd = %(params)s->ndim_input_0;
        ssize_t *start = NULL, *step = NULL;
        size_t num_indices, j;
        int ret;
        int broadcast_y;

        start = (ssize_t*)malloc(nd * sizeof(ssize_t));
        step = (ssize_t*)malloc(nd * sizeof(ssize_t));
        if (start == NULL || step == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }

        for (j = 0; j < nd; ++j) {
          start[j] = 0;
          step[j] = 1;
        }
        step[0] = 0;
        num_indices = PyArray_SIZE(%(ind)s);
        if (!%(params)s->inplace) {
          %(out)s = theano_try_copy(%(out)s, %(x)s);
          if (%(out)s == NULL) {
            // Exception already set
            %(fail)s
          }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        }
        if (num_indices != 0) {
          if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError,
                         "num_indices %%lld exceeds LONG_MAX + 1",
                         (long long)num_indices);
            %(fail)s
          }
          broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
          for (j = 0; j < num_indices; j++) {
            start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            if (start[0] < 0)
              start[0] += PyGpuArray_DIM(%(out)s, 0);
            if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
               PyErr_SetString(PyExc_IndexError, "index out of bounds");
               %(fail)s;
            }
            row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
            if (row_x == NULL)
              %(fail)s;

            if (broadcast_y)
              start[0] = 0;
            else
              start[0] = j;

            row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
            if (row_y == NULL) {
              Py_DECREF(row_x);
              %(fail)s;
            }

            if (%(params)s->set_instead_of_inc) {
              ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
            } else {
              void *args[2];
              args[0] = (void *)&row_x->ga;
              args[1] = (void *)&row_y->ga;
              ret = GpuElemwise_call(iadd, args, GE_BROADCAST);
            }
            Py_DECREF(row_x);
            Py_DECREF(row_y);
            if (ret != GA_NO_ERROR)
              PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
          }
        }

        free(start);
        free(step);
        """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
                   params=sub['params'],
                   fail="""
                   {
                     free(start);
                     free(step);
                     %(fail)s
                   }
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
        return (4,)
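# A NumPy-only reference (sketch) for the perform() logic above, handy for
# eyeballing the set/increment semantics: with set_instead_of_inc=False,
# rows named by repeated indices accumulate; with True, the last write wins.
# The helper name is hypothetical and not part of the Op.
import numpy as np

def _inc_subtensor1_reference(x, y, idx, set_instead_of_inc=False):
    out = x.copy()
    for j, i in enumerate(idx):
        # Iterate jointly when y has one row per index; otherwise broadcast y.
        row = y[j] if (y.ndim == x.ndim and y.shape[0] != 1) else y
        if set_instead_of_inc:
            out[i] = row
        else:
            out[i] = out[i] + row
    return out

# e.g. _inc_subtensor1_reference(np.zeros((3, 2)), np.ones((2, 2)), [0, 0])
# accumulates both rows into x[0], giving [[2., 2.], [0., 0.], [0., 0.]].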
def test_hash_and_eq_params(self): wp1 = ParamsType( a=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) wp2 = ParamsType( a=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) w1 = Params( wp1, a=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) w2 = Params( wp2, a=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 == w2 assert not (w1 != w2) assert hash(w1) == hash(w2) # Changing attributes names only (a -> other_name). wp2_other = ParamsType( other_name=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) w2 = Params( wp2_other, other_name=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2 # Changing attributes values only (now a=2). w2 = Params( wp2, a=2, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2 # Changing NumPy array values (5 -> -5). w2 = Params( wp2, a=1, array=np.asarray([1, 2, 4, -5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2
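# Sketch of what the equality/hash contract checked above buys us: Params
# values compare structurally and are hashable, so two bundles built from
# equal types and values can serve interchangeably as dictionary keys.
# Import paths are assumptions based on this code base.
def _example_params_as_key():
    from theano.gof.params_type import ParamsType, Params
    from theano.gof.type import Generic
    wp = ParamsType(a=Generic())
    cache = {Params(wp, a=1): "hit"}
    assert cache[Params(wp, a=1)] == "hit"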
class GpuCumOp(GpuKernelBase, Op):
    """
    Parameters
    ----------
    axis
        Cannot be None. If you want the array flattened, do it before.

    """
    SUPPORTED_NDIMS = 3
    __props__ = ("axis", "mode")
    params_type = ParamsType(axis=scalar.int32, context=gpu_context_type)

    def __init__(self, axis, mode="add"):
        assert axis is not None
        self.axis = int(axis)
        self.mode = mode

    def __eq__(self, other):
        if type(other) != type(self):
            return False
        return self.axis == other.axis and self.mode == other.mode

    def __hash__(self):
        return hash(self.axis) ^ hash(self.mode)

    def c_code_cache_version(self):
        return (7,)

    def c_headers(self):
        return ["<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, x):
        assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"
        context_name = infer_context_name(x)

        x = as_gpuarray_variable(x, context_name)

        if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
            raise NotImplementedError(
                "Only cum op on 1D, 2D and 3D arrays is supported right now!")

        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError("axis(={0}) out of bounds".format(self.axis))
        return Apply(self, [x], [x.type()])

    def gpu_kernels(self, node, nodename):
        kernels = []
        # cumadd
        kname = "k_cumadd"
        op = {"mul": "*", "add": "+"}[self.mode]
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
        code = ("""#include "cluda.h"

            KERNEL void %(kname)s(float* input, ga_size input_offset,
                                  float* output, ga_size output_offset,
                                  ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
                                  ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
                                  const int offsetY, const int offsetZ,
                                  const int beforeLastElementIdx, const int lastElementIdx){
                input = (float *)(((char *)input) + input_offset);
                output = (float *)(((char *)output) + output_offset);
                int idY = blockIdx.y + offsetY;
                int idZ = blockIdx.z + offsetZ;

                int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
                int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
                int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
                int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
                int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
                output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
            }
            """ % locals())
        params = [
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            "intc", "intc",
            "intc", "intc",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))
        # blockCumOp
        kname = "k_blockCumOp"
        k_var = "k_blockCumOp_" + nodename
        params = [
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            "int32", "int32",
            gpuarray.GpuArray, gpuarray.SIZE,
        ]
        code = ("""#include "cluda.h"

            // helper functions
            WITHIN_KERNEL
            void k_reductionPhase(float* partialCumOp) {
                // Traverse down from leaves to root building partial sums at internal nodes in the tree.
                for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                    local_barrier();
                    unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                    if (index < blockDim.x*2) {
                        partialCumOp[index] %(op)s= partialCumOp[index - stride];
                    }
                }
            }

            WITHIN_KERNEL
            void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
                             ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                             int offsetY, int offsetZ) {
                // blockIdx.y and blockIdx.z represent the current independent cum op
                int idY = blockIdx.y + offsetY;
                int idZ = blockIdx.z + offsetZ;
                int offset = idY * dataStrides_y + idZ * dataStrides_z;
                int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
                int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
                partialCumOp[threadIdx.x*2]     = input[idx_even];
                partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
            }

            WITHIN_KERNEL
            void k_reversePhase(float* partialCumOp) {
                // Traverse back up the tree building the scan from the partial sums
                for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                    local_barrier();
                    unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                    if (index + stride < blockDim.x*2) {
                        partialCumOp[index + stride] %(op)s= partialCumOp[index];
                    }
                }
            }

            WITHIN_KERNEL
            void k_pushData(float* partialCumOp, float* output, int globalThreadID,
                            ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                            int offsetY, int offsetZ) {
                local_barrier();
                // blockIdx.y and blockIdx.z represent the current independent cum op
                int idY = blockIdx.y + offsetY;
                int idZ = blockIdx.z + offsetZ;
                int offset = idY * dataStrides_y + idZ * dataStrides_z;
                int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
                int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
                output[idx_even] = partialCumOp[threadIdx.x*2];
                output[idx_odd]  = partialCumOp[threadIdx.x*2 + 1];
            }

            KERNEL void k_blockCumOp(float* input, ga_size input_offset,
                                     float* output, ga_size output_offset,
                                     size_t nbElementsPerCumOp,
                                     ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
                                     ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
                                     int offsetY, int offsetZ,
                                     float* blockSum, ga_size blockSum_offset) {
                input = (float *)(((char *)input) + input_offset);
                output = (float *)(((char *)output) + output_offset);
                blockSum = (float *)(((char *)blockSum) + blockSum_offset);
                // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
                // The Y and Z axes of the grid will contain all independent cumops of the 2D/3D case.
                int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

                // Check if current thread has data to process.
                if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
                    return;
                }

                extern __shared__ float partialCumOp[];

                // Load data in shared memory
                k_fetchData(partialCumOp, input, globalThreadID,
                            inputStrides_x, inputStrides_y, inputStrides_z,
                            offsetY, offsetZ);

                // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
                // The tree is swept from the leaves to the root and from the root to the leaves.
                // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
                k_reductionPhase(partialCumOp);
                k_reversePhase(partialCumOp);

                // Write the final output to global memory
                k_pushData(partialCumOp, output, globalThreadID,
                           outputStrides_x, outputStrides_y, outputStrides_z,
                           offsetY, offsetZ);

                if (blockSum != NULL){
                    if (threadIdx.x == blockDim.x - 1) {
                        blockSum[blockIdx.x*(gridDim.y*gridDim.z)
                                 + (blockIdx.y + offsetY)*gridDim.z
                                 + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
                    }
                }
            }
            """ % locals())
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))
        # k_finalCumOp
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
        code = ("""#include "cluda.h"

            KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                     float* blockSum, ga_size blockSum_offset,
                                     size_t nbElementsPerCumOp,
                                     ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                                     int offsetY, int offsetZ) {
                output = (float *)(((char *)output) + output_offset);
                blockSum = (float *)(((char *)blockSum) + blockSum_offset);
                int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

                // Check if the current thread has data to process.
                if (globalThreadID >= (nbElementsPerCumOp+1)/2)
                    return;

                int idY = blockIdx.y + offsetY;
                int idZ = blockIdx.z + offsetZ;

                const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z)
                                                       + idY*gridDim.z + idZ];

                int offset = idY * dataStrides_y + idZ * dataStrides_z;
                int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
                int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
                output[idx_even] %(op)s= currentBlockSum;
                output[idx_odd] %(op)s= currentBlockSum;
            }
            """ % locals())
        params = [
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.GpuArray, gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
            "int32", "int32",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))
        return kernels

    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != b"cuda":
            raise NotImplementedError("cuda only")
        return """
            const size_t* shape = PyGpuArray_DIMS(%(x)s);
            bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
            int axis = %(params)s->axis;
            if (axis < 0) {
                // Convert negative axis to positive axis.
axis += PyGpuArray_NDIM(%(x)s); } if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) { %(fail)s; } { // Namespace for kernel calls // size_t max_threads_dim0; size_t max_grid_size1; size_t max_grid_size2; int err; err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); %(fail)s; } if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){ %(fail)s; } } """ % dict( x=inp[0], z=out[0], nodename=nodename, fail=sub["fail"], params=sub["params"], ) def c_support_code_struct(self, node, nodename): code = (""" int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) { size_t shape[3] = { 1, 1, 1 }; ssize_t inputStrides_x; ssize_t inputStrides_y; ssize_t inputStrides_z; ssize_t outputStrides_x; ssize_t outputStrides_y; ssize_t outputStrides_z; switch (PyGpuArray_NDIM(input)) { case 1: shape[0] = PyGpuArray_DIMS(input)[0]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); break; case 2: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); break; case 3: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; shape[2] = PyGpuArray_DIMS(input)[2]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float); break; default: PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis"); return -1; } if (shape[axis] <= 1) { int err = pygpu_move(output, input); return err; } // Perform cum op on array of even size. size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2); // Determine how many elements can be processed in one block. size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2; // Determine how many blocks are needed in total. size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op. size_t dimGridY; // Nb. of independent cum ops (width). size_t dimGridZ; // Nb. of independent cum ops (height). 
        ssize_t tmp;
        switch (axis)
        {
        case 0:
            dimGridY = shape[1];
            dimGridZ = shape[2];
            break;
        case 1:
            dimGridY = shape[0];
            dimGridZ = shape[2];

            tmp = inputStrides_x;
            inputStrides_x = inputStrides_y;
            inputStrides_y = tmp;

            tmp = outputStrides_x;
            outputStrides_x = outputStrides_y;
            outputStrides_y = tmp;
            break;
        case 2:
            dimGridY = shape[1];
            dimGridZ = shape[0];

            tmp = inputStrides_x;
            inputStrides_x = inputStrides_z;
            inputStrides_z = tmp;

            tmp = outputStrides_x;
            outputStrides_x = outputStrides_z;
            outputStrides_z = tmp;
            break;
        default:
            PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
            return -1;
        }

        const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
        PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum,
                                                       output->ga.typecode, GA_C_ORDER,
                                                       input->context, Py_None);
        if (deviceBlockSum == NULL){
            return -1;
        }

        // Perform `maxGridY`*`maxGridZ` cum ops in parallel.
        for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
            size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
            for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
                size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cum op per block.
                size_t sharedBytes = (2*dimBlockX) * sizeof(float);

                int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes,
                                            input->ga.data,
                                            input->ga.offset,
                                            output->ga.data,
                                            output->ga.offset,
                                            nbElementsPerCumOp,
                                            inputStrides_x,
                                            inputStrides_y,
                                            inputStrides_z,
                                            outputStrides_x,
                                            outputStrides_y,
                                            outputStrides_z,
                                            offsetY,
                                            offsetZ,
                                            deviceBlockSum->ga.data,
                                            deviceBlockSum->ga.offset);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
                    return -1;
                }

                if (dimGridX > 1) {
                    // Do a cum op over the blockSum (recursive).
                    if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                        Py_DECREF(deviceBlockSum);
                        return -1;
                    }

                    // Since there is more than one block (i.e. `dimGridX > 1`)
                    // report partial cum ops of previous blocks to subsequent ones.
                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                    size_t dimBlock[3] = {dimBlockX, 1, 1};
                    int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes,
                                                output->ga.data,
                                                output->ga.offset,
                                                deviceBlockSum->ga.data,
                                                deviceBlockSum->ga.offset,
                                                nbElementsPerCumOp,
                                                outputStrides_x,
                                                outputStrides_y,
                                                outputStrides_z,
                                                offsetY,
                                                offsetZ);
                    if (err != GA_NO_ERROR){
                        PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
                        return -1;
                    }
                }

                // If shape[axis] is odd, the last element is computed manually
                if (shape[axis] != nbElementsPerCumOp){
                    size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                    size_t dimBlock[3] = {1, 1, 1};
                    int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes,
                                            input->ga.data,
                                            input->ga.offset,
                                            output->ga.data,
                                            output->ga.offset,
                                            inputStrides_x,
                                            inputStrides_y,
                                            inputStrides_z,
                                            outputStrides_x,
                                            outputStrides_y,
                                            outputStrides_z,
                                            offsetY,
                                            offsetZ,
                                            shape[axis] - 2,
                                            shape[axis] - 1);
                    if (err != GA_NO_ERROR){
                        PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
                        return -1;
                    }
                }
            }
        }
        Py_XDECREF(deviceBlockSum);
        return 0;
    }
    """ % locals())
        return super(GpuCumOp, self).c_support_code_struct(node, nodename) + code
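# NumPy sketch (hypothetical helper, 1D case) of the two-phase scheme the
# kernels above implement: scan each block locally (k_blockCumOp), scan the
# per-block totals recursively, then add each block's prefix back in
# (k_finalCumOp).
import numpy as np

def _blocked_cumsum(x, block=4):
    out = np.concatenate([np.cumsum(x[s:s + block])
                          for s in range(0, len(x), block)])
    # Per-block totals are the last element of each locally scanned block.
    totals = np.array([out[min(s + block, len(x)) - 1]
                       for s in range(0, len(x), block)])
    offsets = np.concatenate([[0], np.cumsum(totals)[:-1]])
    for i, s in enumerate(range(0, len(x), block)):
        out[s:s + block] += offsets[i]
    return out

# e.g. x = np.arange(1., 11.); np.allclose(_blocked_cumsum(x), np.cumsum(x))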
class BaseCorr3dMM(gof.OpenMPOp):
    """
    Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
    `Corr3dMM_gradInputs`. Cannot be used directly.

    Every sub-class must define the internal attribute ``_direction`` outside
    of __init__(). ``_direction`` must take one of the following values:

     - "forward" to correlate bottom with weights and store results in top.
     - "backprop weights" to do a valid convolution of bottom with top
       (swapping the first two dimensions) and store results in weights.
     - "backprop inputs" to do a full convolution of top with weights
       (swapping the first two dimensions) and store results in bottom.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size can be directly specified by an
        integer or a tuple of three integers.
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1, 1, 1)).
    num_groups
        Perform grouped convolutions (default: 1).

    """

    check_broadcast = False
    __props__ = ("border_mode", "subsample", "filter_dilation", "num_groups")
    _direction = None

    params_type = ParamsType(
        direction=EnumList(
            ("DIRECTION_FORWARD", "forward"),                    # 0
            ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"),  # 1
            ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"),    # 2
        ),
        dH=int64,
        dW=int64,
        dD=int64,
        dilH=int64,
        dilW=int64,
        dilD=int64,
        padH=int64,
        padW=int64,
        padD=int64,
        num_groups=int64,
    )

    def __init__(
        self,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        openmp=None,
        num_groups=1,
    ):
        super(BaseCorr3dMM, self).__init__(openmp=openmp)
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
                raise ValueError("invalid border_mode {}, which must be a "
                                 "non-negative integer".format(border_mode))
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            if len(border_mode) != 3 or min(border_mode) < 0:
                raise ValueError(
                    "invalid border_mode {}, which must be a tuple of "
                    "three non-negative integers".format(border_mode))
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ("valid", "full", "half")):
            raise ValueError(
                "invalid border_mode {}, which must be either "
                '"valid", "full", "half", an integer or a tuple of three'
                " integers".format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
        if not theano.config.blas.ldflags:
            # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = "" else: if "openblas" in theano.config.blas.ldflags: self.blas_type = "openblas" elif "mkl" in theano.config.blas.ldflags: self.blas_type = "mkl" else: self.blas_type = "" if self._direction not in [ "forward", "backprop weights", "backprop inputs" ]: raise ValueError("_direction must be one of 'forward', " "'backprop weights', 'backprop inputs'") @property def pad(self): if self.border_mode == "half": return (-1, -1, -1) elif self.border_mode == "full": return (-2, -2, -2) elif isinstance(self.border_mode, tuple): return self.border_mode else: assert self.border_mode == "valid" return (0, 0, 0) # Direction should be converted to real enum value, # as it is compared to integer later in c_code_helper(). direction = property( lambda self: self.params_type.enum_from_alias(self._direction)) dH = property(lambda self: self.subsample[0]) dW = property(lambda self: self.subsample[1]) dD = property(lambda self: self.subsample[2]) dilH = property(lambda self: self.filter_dilation[0]) dilW = property(lambda self: self.filter_dilation[1]) dilD = property(lambda self: self.filter_dilation[2]) padH = property(lambda self: self.pad[0]) padW = property(lambda self: self.pad[1]) padD = property(lambda self: self.pad[2]) def __str__(self): return "%s{%s, %s, %s, %s}" % ( self.__class__.__name__, self.border_mode, str(self.subsample), str(self.filter_dilation), str(self.num_groups), ) @staticmethod def as_common_dtype(in1, in2): """ Upcast input variables if necessary. """ dtype = theano.scalar.upcast(in1.dtype, in2.dtype) return in1.astype(dtype), in2.astype(dtype) def __setstate__(self, d): self.__dict__.update(d) if not hasattr(self, "num_groups"): self.num_groups = 1 def c_support_code(self): ccodes = blas_headers.blas_header_text() if self.blas_type == "openblas": ccodes += blas_headers.openblas_threads_text() elif self.blas_type == "mkl": ccodes += blas_headers.mkl_threads_text() return ccodes def c_libraries(self): return ldflags() def c_compile_args(self): compile_args = ldflags(libs=False, flags=True) compile_args += super(BaseCorr3dMM, self).c_compile_args() return compile_args def c_lib_dirs(self): return ldflags(libs=False, libs_dir=True) def c_header_dirs(self): return ldflags(libs=False, include_dir=True) def c_headers(self): headers = ["<stdio.h>"] headers += super(BaseCorr3dMM, self).c_headers() return headers def c_code_cache_version(self): # raise this whenever modifying any of the support_code_files return (8, self.openmp, blas_header_version()) def c_support_code_apply(self, node, nodename): # REMEMBER TO RAISE c_code_cache_version when changing any of # these files sub = {} dtype = str(node.__dict__["inputs"][0].dtype) assert dtype in ("float32", "float64") if dtype == "float32": sub["gemm"] = "sgemm_" sub["float_type"] = "npy_float" sub["float_typenum"] = "NPY_FLOAT" sub["n_bytes"] = 4 sub["c_float_type"] = "float" else: sub["gemm"] = "dgemm_" sub["float_type"] = "npy_double" sub["float_typenum"] = "NPY_DOUBLE" sub["n_bytes"] = 8 sub["c_float_type"] = "double" if self.openmp: sub["omp_flags"] = "#pragma omp parallel for schedule(static)" sub["omp_get_max_threads"] = "omp_get_max_threads()" sub["omp_get_thread_num"] = "omp_get_thread_num()" if self.blas_type == "openblas": sub["blas_set_num_threads"] = "openblas_set_num_threads" sub["blas_get_num_threads"] = "openblas_get_num_threads()" elif self.blas_type == "mkl": sub["blas_set_num_threads"] = "mkl_set_num_threads" sub["blas_get_num_threads"] = "mkl_get_max_threads()" else: sub["blas_set_num_threads"] = "" 
sub["blas_get_num_threads"] = "0" else: sub["omp_flags"] = "" sub["omp_get_max_threads"] = "1" sub["omp_get_thread_num"] = "0" sub["blas_set_num_threads"] = "" sub["blas_get_num_threads"] = "0" files = [os.path.join("c_code", "corr3d_gemm.c")] codes = [ open(os.path.join(os.path.split(__file__)[0], f)).read() for f in files ] final_code = "" for code in codes: final_code += code return final_code % sub def c_code_helper(self, bottom, weights, top, sub, height=None, width=None, depth=None): """ This generates the C code for Corr3dMM (direction="forward"), Corr3dMM_gradWeights (direction="backprop weights"), and Corr3dMM_gradInputs (direction="backprop inputs"). Depending on the direction, one of bottom, weights, top will receive the output, while the other two serve as inputs. :param bottom: Variable name of the input images in the forward pass, or the gradient of the input images in backprop wrt. inputs :param weights: Variable name of the filters in the forward pass, or the gradient of the filters in backprop wrt. weights :param top: Variable name of the output images / feature maps in the forward pass, or the gradient of the outputs in the backprop passes :param sub: Dictionary of substitutions useable to help generating the C code. :param height: If self.subsample[0] != 1, a variable giving the height of the filters for direction="backprop weights" or the height of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the height of the filters for direction="backprop weights". Ignored otherwise. :param width: If self.subsample[1] != 1, a variable giving the width of the filters for direction="backprop weights" or the width of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the width of the filters for direction="backprop weights". Ignored otherwise. :param depth: If self.subsample[1] != 1, a variable giving the depth of the filters for direction="backprop weights" or the depth of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the depth of the filters for direction="backprop weights". Ignored otherwise. """ # When subsampling, we cannot unambiguously infer the height and width # of bottom and weights from top, so we require them to be given. # Similarly, when border_mode="half", we cannot infer the weight size. 
if height: height = "(*(npy_int64 *)(PyArray_DATA(%s)))" % height else: if ((self.direction != 0) and (self.dH != 1)) or ((self.direction == 1) and (self.padH == -1)): raise ValueError( "height must be given for backprop with vertical sampling or border_mode='half'" ) height = "-1" if width: width = "(*(npy_int64 *)(PyArray_DATA(%s)))" % width else: if ((self.direction != 0) and (self.dW != 1)) or ((self.direction == 1) and (self.padW == -1)): raise ValueError( "width must be given for backprop with horizontal sampling or border_mode='half'" ) width = "-1" if depth: depth = "(*(npy_int64 *)(PyArray_DATA(%s)))" % depth else: if ((self.direction != 0) and (self.dD != 1)) or ((self.direction == 1) and (self.padD == -1)): raise ValueError( "depth must be given for backprop with depth sampling or border_mode='half'" ) depth = "-1" return """ // Mandatory args int direction = %(params)s->direction; // forward, bprop weights, bprop inputs // Optional args int dH = %(params)s->dH; int dW = %(params)s->dW; int dD = %(params)s->dD; int dilH = %(params)s->dilH; int dilW = %(params)s->dilW; int dilD = %(params)s->dilD; int padH = %(params)s->padH; int padW = %(params)s->padW; int padD = %(params)s->padD; int numgroups = %(params)s->num_groups; PyArrayObject * bottom = %(bottom)s; PyArrayObject * weights = %(weights)s; PyArrayObject * top = %(top)s; PyArrayObject * out2 = NULL; PyArrayObject **out = NULL; switch(%(params)s->direction) { case DIRECTION_FORWARD: out = &%(top)s; break; case DIRECTION_BACKPROP_WEIGHTS: out = &%(weights)s; break; case DIRECTION_BACKPROP_INPUTS: out = &%(bottom)s; break; default: PyErr_SetString(PyExc_ValueError, "CPU Corr3dMM: Invalid direction."); {%(fail)s} break; } // Obtain or infer kernel width, height and depth // (we need to know it early to be able to handle auto-padding) int kH, kW, kD, dil_kH, dil_kW, dil_kD; if (direction != 1) { // weight is an input variable, we can just read its shape kH = PyArray_DIMS(weights)[2]; kW = PyArray_DIMS(weights)[3]; kD = PyArray_DIMS(weights)[4]; } else { if (%(height)s != -1) { // kernel height is specified (perhaps vertical subsampling or half padding) kH = %(height)s; } else if (padH == -2) { // vertical full padding, we can infer the kernel height kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1; } else { // explicit padding, we can infer the kernel height kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1; } if (%(width)s != -1) { kW = %(width)s; } else if (padW == -2) { kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } else { kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } if (%(depth)s != -1) { kD = %(depth)s; } else if (padD == -2) { kD = (2 - PyArray_DIMS(bottom)[4] + (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1; } else { kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1; } } // Implicit dilated kernel size dil_kH = (kH - 1) * dilH + 1; dil_kW = (kW - 1) * dilW + 1; dil_kD = (kD - 1) * dilD + 1; // Auto-padding if requested if (padH == -1) { // vertical half padding padH = dil_kH / 2; } else if (padH == -2) { // vertical full padding padH = dil_kH - 1; } else if (padH < 0) { PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padH must be >= -2"); %(fail)s } if (padW == -1) { // horizontal half padding padW = dil_kW / 2; } else if (padW == -2) { // horizontal full padding padW = dil_kW - 1; } else if (padW < 0) { 
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padW must be >= -2"); %(fail)s } if (padD == -1) { // depth half padding padD = dil_kD / 2; } else if (padD == -2) { // depth full padding padD = dil_kD - 1; } else if (padD < 0) { PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padD must be >= -2"); %(fail)s } // Infer output shape npy_intp out_dim[5]; switch(direction) { case 0: // forward pass // output is top: (batchsize, num_filters, height, width, depth) // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1 out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0]; out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1); out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1); if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0) { PyErr_Format(PyExc_ValueError, "Corr3dMM: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(bottom)[4], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[4], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)out_dim[4]); %(fail)s } break; case 1: // backprop wrt. weights // output is weights: (num_filters, num_channels, height, width, depth) // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups; out_dim[2] = (npy_intp)kH; // already inferred further above out_dim[3] = (npy_intp)kW; // how convenient out_dim[4] = (npy_intp)kD; if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0) { PyErr_Format(PyExc_ValueError, "Corr3dMM backprop wrt. weights: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(bottom)[4], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)out_dim[4], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3], (long int)PyArray_DIMS(top)[4]); %(fail)s } break; case 2: // backprop wrt. inputs // output is bottom: (batchsize, num_channels, height, width, depth) // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups; out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH); out_dim[3] = (npy_intp)((%(width)s != -1) ? 
                    %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
            out_dim[4] = (npy_intp)((%(depth)s != -1) ? %(depth)s :
                    (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "Corr3dMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1],
                             (long int)out_dim[2], (long int)out_dim[3],
                             (long int)out_dim[4],
                             (long int)PyArray_DIMS(weights)[0],
                             (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2],
                             (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4],
                             (long int)PyArray_DIMS(top)[0],
                             (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2],
                             (long int)PyArray_DIMS(top)[3],
                             (long int)PyArray_DIMS(top)[4]);
                %(fail)s
            }
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
            %(fail)s
        }

        // Prepare output array
        int typenum;
        if ( !(*out
               && PyArray_NDIM(*out)==5
               && PyArray_IS_C_CONTIGUOUS(*out)
               && PyArray_DIMS(*out)[0]==out_dim[0]
               && PyArray_DIMS(*out)[1]==out_dim[1]
               && PyArray_DIMS(*out)[2]==out_dim[2]
               && PyArray_DIMS(*out)[3]==out_dim[3]
               && PyArray_DIMS(*out)[4]==out_dim[4]))
        {
            Py_XDECREF(*out);
            if (direction != 1) {
              typenum = PyArray_TYPE(weights);
            } else {
              typenum = PyArray_TYPE(bottom);
            }
            //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
            *out = (PyArrayObject*)PyArray_ZEROS(5, out_dim, typenum, 0);
            if (NULL == *out)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
                             (long long)out_dim[0], (long long)out_dim[1],
                             (long long)out_dim[2], (long long)out_dim[3],
                             (long long)out_dim[4]);
                %(fail)s
            }
        }

        // Call corr3dMM code
        out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s,
                        direction, dH, dW, dD, dilH, dilW, dilD,
                        padH, padW, padD, numgroups);
        if (out2==NULL){
           %(fail)s
        }
        assert (out2 == *out);

        """ % dict(
            bottom=bottom,
            weights=weights,
            top=top,
            height=height,
            width=width,
            depth=depth,
            fail=sub["fail"],
            params=sub["params"],
        )
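# Hedged Python mirror (illustration only, helper name hypothetical) of the
# forward output-shape formula the C code above applies per spatial dim:
# top = (bottom + 2*pad - ((kernel - 1)*dilation + 1)) // subsample + 1.
def _corr3d_forward_shape(bottom, kernel, pad, sub, dil):
    return tuple((b + 2 * p - ((k - 1) * d + 1)) // s + 1
                 for b, k, p, s, d in zip(bottom, kernel, pad, sub, dil))

# e.g. _corr3d_forward_shape((7, 7, 7), (3, 3, 3), (0, 0, 0),
#                            (1, 1, 1), (1, 1, 1)) == (5, 5, 5)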
class GpuExtractDiag2D(GpuKernelBase, Op):
    """
    Extracting diagonal of a 2D matrix on the GPU.

    """
    __props__ = ('context_name', 'keepdims')
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, keepdims=bool_t)

    def __init__(self, context_name=None, keepdims=False):
        self.context_name = context_name
        self.keepdims = keepdims

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=get_context(self.context_name),
                                           keepdims=self.keepdims)

    def make_node(self, x, k=0):
        # TODO: dtype check
        x = as_gpuarray_variable(x, context_name=self.context_name)
        k = tensor.as_tensor_variable(k)
        assert x.ndim == 2
        assert k.ndim == 0
        broadcastable = (False, True) if self.keepdims else (False,)
        otype = GpuArrayType(dtype=x.type.dtype,
                             broadcastable=broadcastable,
                             context_name=self.context_name)
        return gof.Apply(self, [x, k], [otype()])

    def infer_shape(self, node, in_shapes):
        in_shape, _ = in_shapes
        dim1 = in_shape[0]
        dim2 = in_shape[1]
        k = node.inputs[1]
        diag_size = T.switch(T.ge(k, 0),
                             T.clip(dim2 - k, 0, dim1),
                             T.clip(dim1 + k, 0, dim2))
        if self.keepdims:
            diag_size = (diag_size, 1)
        else:
            diag_size = (diag_size,)
        return [diag_size]

    def grad(self, inp, grads):
        return [GpuAllocDiag2D()(grads[0], inp[1], *(inp[0].shape)),
                grad_not_implemented(self, 1, inp[1])]

    def gpu_kernels(self, node, name):
        dtype_x = node.inputs[0].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        dtype_y = node.outputs[0].dtype
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
        load_x = load_w(dtype_x)
        write_y = write_w(dtype_y)
        code = """
        #include "cluda.h"
        KERNEL void extract(const ga_ssize stridesX0, const ga_ssize stridesX1,
                            GLOBAL_MEM %(type_x)s *x, ga_size x_off,
                            const ga_ssize stridesY0,
                            GLOBAL_MEM %(type_y)s *y, ga_size y_off,
                            ga_ssize k, ga_size l) {
            x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
            y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
            ga_ssize coff = max(k, (ga_ssize) 0);
            ga_ssize roff = -min(k, (ga_ssize) 0);
            ga_size index = GID_0 * LDIM_0 + LID_0;
            if (index < l) {
                %(work_x)s t = %(load_x)s(x[(index + roff) * stridesX0 +
                                            (index + coff) * stridesX1]);
                y[index * stridesY0] = %(write_y)s(t);
            }
        }""" % dict(type_x=type_x, type_y=type_y, work_x=work_x,
                    load_x=load_x, write_y=write_y, name=name)
        return [Kernel(code=code, name="extract",
                       params=[gpuarray.SSIZE, gpuarray.SSIZE,
                               gpuarray.GpuArray, gpuarray.SIZE,
                               gpuarray.SSIZE,
                               gpuarray.GpuArray, gpuarray.SIZE,
                               gpuarray.SSIZE, gpuarray.SIZE],
                       flags=Kernel.get_flags(dtype_x, dtype_y),
                       objvar='k_extract_' + name)]

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>', '<gpuarray/types.h>']

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inp, out, sub):
        # TODO: fix error msg
        x, k = inp
        y, = out
        fail = sub['fail']
        params = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        int err;
        size_t* dims = (size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(x)s);
        // The diagonal offset k may be negative, so read it as signed.
        ssize_t k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
        size_t col_off = (size_t) (k > 0?k:0);
        size_t row_off = (size_t) (k < 0?-k:0);
        size_t diag_size = (size_t) std::max((ssize_t) std::min((ssize_t)dims[0] - (ssize_t)row_off,
                                                                (ssize_t)dims[1] - (ssize_t)col_off),
                                             (ssize_t) 0);
        size_t ls = std::min(diag_size, (size_t) 1024);
        size_t gs = (diag_size + ls - 1) / ls;
        size_t ndims = %(params)s->keepdims ?
                                       2 : 1;
        size_t out_dims[ndims];
        out_dims[0] = diag_size;
        if (ndims == 2) {
            out_dims[1] = 1;
        }
        size_t itemsize_x = 1;
        size_t itemsize_y = 1;
        ssize_t stridesX0 = 1;
        ssize_t stridesX1 = 1;
        ssize_t stridesY0 = 1;

        if (%(y)s == NULL
            || %(y)s->ga.nd != ndims
            || %(y)s->ga.dimensions[0] != diag_size
            || (ndims > 1 && %(y)s->ga.dimensions[1] != 1)) {
            Py_CLEAR(%(y)s);
            %(y)s = pygpu_empty(ndims, out_dims,
                                %(typecode)s, GA_C_ORDER,
                                %(params)s->context, Py_None);
        }
        if (%(y)s == NULL) {
            %(fail)s
        }

        itemsize_x = GpuArray_ITEMSIZE(&%(x)s->ga);
        itemsize_y = GpuArray_ITEMSIZE(&%(y)s->ga);
        stridesX0 = PyGpuArray_STRIDES(%(x)s)[0] / itemsize_x;
        stridesX1 = PyGpuArray_STRIDES(%(x)s)[1] / itemsize_x;
        stridesY0 = PyGpuArray_STRIDES(%(y)s)[0] / itemsize_y;

        if (row_off < dims[0] && col_off < dims[1]) {
            err = extract_call(1, &gs, &ls, 0,
                               stridesX0, stridesX1,
                               %(x)s->ga.data, %(x)s->ga.offset,
                               stridesY0,
                               %(y)s->ga.data, %(y)s->ga.offset,
                               k, diag_size);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: kExtract: %%s. n=%%lu, m=%%lu.",
                             GpuKernel_error(&%(kname)s, err),
                             (unsigned long)dims[0], (unsigned long)dims[1]);
                %(fail)s;
            }
        } else {
            PyErr_SetString(PyExc_ValueError,
                            "GpuExtractDiag2D: diagonal offset k is out of range");
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        return (1,)
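# NumPy reference (sketch) for the extract kernel above: take the k-th
# diagonal, optionally keeping a trailing broadcastable axis as advertised
# by infer_shape(). The helper name is illustrative only.
import numpy as np

def _extract_diag2d_reference(x, k=0, keepdims=False):
    d = np.diagonal(x, offset=k).copy()
    return d[:, None] if keepdims else d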
def test_params_type_filtering(self):
    shape_tensor5 = (1, 2, 2, 3, 2)
    size_tensor5 = (shape_tensor5[0] * shape_tensor5[1] * shape_tensor5[2] *
                    shape_tensor5[3] * shape_tensor5[4])
    random_tensor = np.random.normal(size=size_tensor5).reshape(shape_tensor5)

    w = ParamsType(a1=TensorType('int32', (False, False)),
                   a2=TensorType('float64', (False, False, False, False, False)),
                   a3=Generic())

    # With a value that does not match the params type.
    o = Params(w,
               a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int64'),
               a2=random_tensor.astype('float32'),
               a3=2000)
    # Should fail (o.a1 is not int32, o.a2 is not float64).
    self.assertRaises(TypeError, w.filter, o, True)
    # Should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed).
    self.assertRaises(TypeError, w.filter, o, False, False)
    # Should pass.
    w.filter(o, strict=False, allow_downcast=True)

    # With a value that matches the params type.
    o1 = Params(w,
                a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                a2=random_tensor.astype('float64'),
                a3=2000)
    # All should pass.
    w.filter(o1, strict=True)
    w.filter(o1, strict=False, allow_downcast=False)
    w.filter(o1, strict=False, allow_downcast=True)

    # Check values_eq and values_eq_approx.
    o2 = Params(w,
                a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                a2=random_tensor.astype('float64'),
                a3=2000)
    assert w.values_eq(o1, o2)
    assert w.values_eq_approx(o1, o2)

    # Check values_eq_approx.
    # NB: I don't know exactly which kinds of differences are rejected by
    # values_eq but accepted by values_eq_approx, so I just play a little
    # with float values.
    o3 = Params(w,
                a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                a2=(random_tensor.astype('float32') * 10 / 2.2 * 2.19999999999 / 10).astype('float64'),
                a3=2000.0 - 0.00000000000000001)
    assert w.values_eq_approx(o1, o3)
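# Minimal sketch of the per-field rule the assertions above rely on:
# ParamsType.filter() delegates to each field's own type, so the behaviour
# on a single array matches TensorType.filter (standalone example; import
# path is an assumption).
def _example_field_filter():
    from theano.tensor import TensorType
    tt = TensorType('float64', (False,))
    tt.filter(np.zeros(3, dtype='float64'), strict=True)       # exact dtype: ok
    try:
        tt.filter(np.zeros(3, dtype='float32'), strict=True)   # wrong dtype: rejected
    except TypeError:
        pass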
class CumOp(Op): # See function cumsum/cumprod for docstring __props__ = ("axis", "mode") check_input = False params_type = ParamsType(c_axis=int_t, mode=EnumList(("MODE_ADD", "add"), ("MODE_MUL", "mul"))) def __init__(self, axis=None, mode="add"): if mode not in ("add", "mul"): raise ValueError('{}: Unknown mode "{}"'.format( type(self).__name__, mode)) self.axis = axis self.mode = mode c_axis = property(lambda self: np.MAXDIMS if self.axis is None else self.axis) def make_node(self, x): x = basic.as_tensor_variable(x) out_type = x.type() if self.axis is None: out_type = theano.tensor.vector(dtype=x.dtype) # Flatten elif self.axis >= x.ndim or self.axis < -x.ndim: raise ValueError("axis(={}) out of bounds".format(self.axis)) return theano.Apply(self, [x], [out_type]) def perform(self, node, inputs, output_storage, params): x = inputs[0] z = output_storage[0] if self.mode == "add": z[0] = np.cumsum(x, axis=self.axis) else: z[0] = np.cumprod(x, axis=self.axis) def grad(self, inputs, output_gradients): (x, ) = inputs (gi, ) = output_gradients if self.axis is None: if self.mode == "add": return [cumsum(gi[::-1])[::-1].reshape(x.shape)] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [cumsum((fx * gi)[::-1])[::-1].reshape(x.shape) / x] else: raise NotImplementedError( '%s: unknown gradient for mode "%s"' % (type(self).__name__, self.mode)) reverse_slicing = [slice(None, None, None)] * gi.ndim reverse_slicing[self.axis] = slice(None, None, -1) reverse_slicing = tuple(reverse_slicing) # We need to reverse the gradients along ``self.axis``, # compute cumsum, then reverse again if self.mode == "add": return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [ cumsum( (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x ] else: raise NotImplementedError( '{}: unknown gradient for mode "{}"'.format( type(self).__name__, self.mode)) def infer_shape(self, node, shapes): if self.axis is None: return [(basic.prod(shapes[0]), )] # Flatten return shapes def c_code(self, node, name, inames, onames, sub): (x, ) = inames (z, ) = onames axis = self.axis fail = sub["fail"] params = sub["params"] code = (""" int axis = %(params)s->c_axis; if (axis == 0 && PyArray_NDIM(%(x)s) == 1) axis = NPY_MAXDIMS; npy_intp shape[1] = { PyArray_SIZE(%(x)s) }; if(axis == NPY_MAXDIMS && !(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0])) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s)); } else if(axis != NPY_MAXDIMS && !(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)))) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE(%(x)s)); } if (!%(z)s) %(fail)s; { PyObject * t = NULL; if(%(params)s->mode == MODE_ADD) t = PyArray_CumSum( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); else if(%(params)s->mode == MODE_MUL) t = PyArray_CumProd( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); if (!t){ %(fail)s; } // Because PyArray_CumSum/CumProd returns a newly created reference on t. Py_XDECREF(t); } """ % locals()) return code def c_code_cache_version(self): return (8, ) def __str__(self): return "{}{{{}, {}}}".format(self.__class__.__name__, self.axis, self.mode)
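# Quick NumPy check (sketch) of the identity used in grad() for mode="add":
# since cumsum(x)[i] sums x[j] for j <= i, the gradient of sum_i g[i] *
# cumsum(x)[i] w.r.t. x[j] is the reversed cumulative sum of g.
import numpy as np

_g = np.array([1.0, 2.0, 3.0])
assert np.allclose(np.cumsum(_g[::-1])[::-1], [6.0, 5.0, 3.0])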
class GpuMaxPoolRop(CGpuKernelBase): """ Implements the R-operator for the downsample operation. """ __props__ = ("ignore_border", "mode", "ndim") params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type) def __init__(self, ignore_border, mode="max", ndim=2): self.ndim = ndim self.ignore_border = ignore_border self.mode = mode CGpuKernelBase.__init__( self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)" ) assert mode == "max" assert ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self): return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"] def c_header_dirs(self): return [gpuarray_helper_inc_dir(), pygpu.get_include()] def make_node(self, inp, eval_point, ws, stride=None, pad=None): ctx_name = infer_context_name(inp) nd = self.ndim inp = as_gpuarray_variable(inp, ctx_name) assert inp.ndim == nd + 2 eval_point = as_gpuarray_variable(eval_point, ctx_name) assert eval_point.ndim == nd + 2 if stride is None: stride = ws if pad is None: pad = (0,) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.ignore_border: raise ValueError("Padding works only with ignore_border=True") if isinstance(ws, (tuple, list)): if any(pad[i] >= ws[i] for i in range(nd)): raise ValueError("Padding must be smaller than strides") ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in theano.tensor.int_dtypes: raise TypeError("Window shape parameters must be ints.") if stride.dtype not in theano.tensor.int_dtypes: raise TypeError("Stride parameters must be ints.") if pad.dtype not in theano.tensor.int_dtypes: raise TypeError("Padding parameters must be ints.") ws = theano.tensor.cast(ws, "int64") stride = theano.tensor.cast(stride, "int64") pad = theano.tensor.cast(pad, "int64") return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()]) def infer_shape(self, fgraph, node, in_shapes): ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]] shp = Pool.out_shape( in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim ) return [shp]
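# NumPy sketch (1D windows, ignore_border=True, hypothetical helper) of the
# max-pool R-operator that pool_max_rop.c computes: each output takes the
# eval-point value found at the argmax position of the corresponding input
# window.
import numpy as np

def _max_pool_rop_1d(x, eval_point, ws):
    n = len(x) // ws
    out = np.empty(n, dtype=eval_point.dtype)
    for i in range(n):
        w = slice(i * ws, (i + 1) * ws)
        out[i] = eval_point[w][np.argmax(x[w])]
    return out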
class GpuPool(CGpuKernelBase):
    """
    Implement the max and average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(
        ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type
    )

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = theano.tensor.cast(ws, "int64")
        stride = theano.tensor.cast(stride, "int64")
        pad = theano.tensor.cast(pad, "int64")

        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad,) = grads

        grad = gpu_contiguous(grad)

        disc = [theano.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
                img, out, grad, ws, stride, pad
            )
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(
                ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
            )(img, grad, ws, stride, pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # The R-op for average or sum pooling is simply pooling
            # evaluated at the eval point.
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points.
        # That means there is no differentiable path through that input.
        # If this implies that you cannot compute some outputs,
        # return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(
                x, z, eval_points[0], ws, stride, pad
            )
        ]
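# Sketch (hypothetical helper) of the per-dimension output length that
# Pool.out_shape computes for infer_shape() above when ignore_border=True:
# out = (in + 2*pad - ws) // stride + 1.
def _pool_out_dim(in_dim, ws, stride, pad):
    return (in_dim + 2 * pad - ws) // stride + 1

# e.g. _pool_out_dim(10, ws=3, stride=2, pad=0) == 4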
class GpuAveragePoolGrad(CGpuKernelBase): """ Implement the grad of average pooling on the gpu. """ __props__ = ("ignore_border", "mode", "ndim") params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type) def __init__(self, ignore_border, mode="average_inc_pad", ndim=2): self.ndim = ndim self.ignore_border = ignore_border if mode == "average": mode = "average_inc_pad" self.mode = mode CGpuKernelBase.__init__( self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)" ) assert mode in ("sum", "average_inc_pad", "average_exc_pad") assert ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self): return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"] def c_header_dirs(self): return [gpuarray_helper_inc_dir(), pygpu.get_include()] def make_node(self, inp, out_grad, ws, stride=None, pad=None): ctx_name = infer_context_name(inp, out_grad) nd = self.ndim inp = as_gpuarray_variable(inp, ctx_name) assert inp.ndim == nd + 2 out_grad = as_gpuarray_variable(out_grad, ctx_name) assert out_grad.ndim == nd + 2 assert out_grad.ndim == inp.ndim if stride is None: stride = ws if pad is None: pad = (0,) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.mode == "average_exc_pad": raise ValueError("Padding must be zero for modes other than average_exc_pad") ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in theano.tensor.int_dtypes: raise TypeError("Window shape parameters must be ints.") if stride.dtype not in theano.tensor.int_dtypes: raise TypeError("Stride parameters must be ints.") if pad.dtype not in theano.tensor.int_dtypes: raise TypeError("Padding parameters must be ints.") ws = theano.tensor.cast(ws, "int64") stride = theano.tensor.cast(stride, "int64") pad = theano.tensor.cast(pad, "int64") return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()]) def infer_shape(self, fgraph, node, in_shapes): return [in_shapes[0]] def grad(self, inp, grads): x, gz, ws, stride, pad = inp (ggx,) = grads return [ theano.tensor.zeros_like(x), GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)( ggx, ws, stride, pad ), ] + [theano.gradient.DisconnectedType()() for i in inp[2:]] def connection_pattern(self, node): return [[1], [1], [0], [0], [0]]
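# Rough NumPy analogue (not the kernel) of what the average-pooling gradient
# computes for non-overlapping 2x2 windows with no padding: each output
# gradient entry is spread uniformly over the inputs of its window.
import numpy as np

def avg_pool_grad_2x2(gz):
    # np.kron tiles each gradient entry over its 2x2 input window.
    return np.kron(gz, np.ones((2, 2))) / 4.0

gz = np.arange(4.0).reshape(2, 2)
gx = avg_pool_grad_2x2(gz)
assert gx.shape == (4, 4) and gx[0, 0] == 0.0 and gx[2, 2] == 3.0 / 4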
def params_type(self): return ParamsType(i=theano.scalar.basic.int64)
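# Minimal usage sketch for a ParamsType like the one above (assumes Theano is
# importable; `Counter` is a hypothetical host object whose attribute names
# match the fields declared in the ParamsType).
import theano
from theano.gof.params_type import ParamsType

class Counter(object):
    def __init__(self, i):
        self.i = i

wrapper = ParamsType(i=theano.scalar.basic.int64)
# get_params() reads the declared fields off the object, as the Ops in this
# file do with `self.params_type.get_params(self, ...)`.
params = wrapper.get_params(Counter(42))
assert params.i == 42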
class GpuMagmaEigh(GpuMagmaBase): """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma library. Parameters ---------- UPLO : Specifies whether the calculation is done with the lower triangular part of matrix (`L`, default) or the upper triangular part (`U`). compute_v : If `True`, computes eigenvalues and eigenvectors (`True`, default). If `False`, computes only eigenvalues of matrix. """ __props__ = ("lower", "compute_v") _cop_num_inputs = 1 _cop_num_outputs = 2 check_input = False params_type = ParamsType(lower=bool_t, compute_v=bool_t, context=gpu_context_type) def __init__(self, UPLO="L", compute_v=True): assert UPLO in ["L", "U"] self.lower = UPLO == "L" self.compute_v = compute_v COp.__init__(self, ["c_code/magma_eigh.c"], "APPLY_SPECIFIC(magma_eigh)") def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) A = gpu_contiguous(A) if A.ndim != 2: raise LinAlgError("Matrix rank error") if A.dtype != "float32": raise TypeError("only `float32` is supported for now") if self.compute_v: return theano.Apply( self, [A], # return D, V [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)(), A.type(), ], ) else: return theano.Apply( self, [A], # return D [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)() ], ) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context)
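# For intuition only: NumPy's eigh mirrors these semantics on CPU (assumption:
# the magma routine, like LAPACK, reads only the triangle selected by UPLO).
import numpy as np

A = np.array([[2.0, 1.0], [1.0, 3.0]], dtype=np.float32)
d, v = np.linalg.eigh(A, UPLO='L')   # compute_v=True analogue: returns D, V
d_only = np.linalg.eigvalsh(A)       # compute_v=False analogue: only D
assert np.allclose(d, d_only, atol=1e-5)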
class BaseCorrMM(gof.OpenMPOp): """ Base class for `CorrMM`, `CorrMM_gradWeights` and `CorrMM_gradInputs`. Cannot be used directly. Every sub-class must define the internal attribute ``_direction`` outside of __init__(). ``_direction`` must take one of following values: - "forward" to correlate bottom with weights and store results in top. - "backprop weights" to do a valid convolution of bottom with top (swapping the first two dimensions) and store results in weights. - "backprop inputs" to do a full convolution of top with weights (swapping the first two dimensions) and store results in bottom. Parameters ---------- border_mode : {'valid', 'full', 'half'} Additionally, the padding size can be specified directly as an integer, a pair of integers, or two pairs of integers. subsample Perform subsampling of the output (default: (1, 1)). filter_dilation Perform dilated correlation (default: (1, 1)) num_groups Perform grouped convolutions (default: 1) unshared Perform unshared correlation (default: False) """ check_broadcast = False __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared') _direction = None params_type = ParamsType( direction=EnumList( ('DIRECTION_FORWARD', 'forward'), # 0 ('DIRECTION_BACKPROP_WEIGHTS', 'backprop weights'), # 1 ('DIRECTION_BACKPROP_INPUTS', 'backprop inputs')), # 2 dH=int64, dW=int64, dilH=int64, dilW=int64, padH_l=int64, padH_r=int64, padW_l=int64, padW_r=int64, num_groups=int64, unshared=int8) def __init__(self, border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None): super(BaseCorrMM, self).__init__(openmp=openmp) if isinstance(border_mode, integer_types): if border_mode < 0: raise ValueError('invalid border_mode {}, which must be a ' 'non-negative integer'.format(border_mode)) border_mode = ((border_mode, border_mode), ) * 2 elif isinstance(border_mode, tuple): if len(border_mode) != 2: raise ValueError('invalid border_mode {} which must be a ' 'tuple of length 2'.format(border_mode)) border = () for mode in border_mode: if isinstance(mode, tuple) and len(mode) == 2 and \ min(mode) >= 0: border += ((int(mode[0]), int(mode[1])), ) elif mode >= 0: border += ((int(mode), int(mode)), ) else: raise ValueError( 'invalid border mode {}. The tuple can only contain ' 'integers or tuples of length 2'.format(border_mode)) border_mode = border elif border_mode not in ('valid', 'full', 'half'): raise ValueError( 'invalid border_mode {}, which must be either ' '"valid", "full", "half", an integer, a pair of ' 'integers, or two pairs of integers'.format(border_mode)) self.border_mode = border_mode if len(subsample) != 2: raise ValueError("subsample must have two elements") if len(filter_dilation) != 2: raise ValueError("filter_dilation must have two elements") self.subsample = tuple(subsample) self.filter_dilation = tuple(filter_dilation) self.unshared = unshared if not theano.config.blas.ldflags: # Theano will use a NumPy C implementation of [sd]gemm_ instead.
self.blas_type = '' else: if 'openblas' in theano.config.blas.ldflags: self.blas_type = 'openblas' elif 'mkl' in theano.config.blas.ldflags: self.blas_type = 'mkl' else: self.blas_type = '' if self._direction not in [ "forward", "backprop weights", "backprop inputs" ]: raise ValueError("_direction must be one of 'forward', " "'backprop weights', 'backprop inputs'") if num_groups < 1: raise ValueError("Number of groups should be greater than 0") self.num_groups = num_groups @property def pad(self): if self.border_mode == "half": return ((-1, -1), ) * 2 elif self.border_mode == "full": return ((-2, -2), ) * 2 elif isinstance(self.border_mode, tuple): return self.border_mode else: assert self.border_mode == "valid" return ((0, 0), ) * 2 # Direction should be converted to real enum value, # as it is compared to integer later in c_code_helper(). direction = property( lambda self: self.params_type.enum_from_alias(self._direction)) dH = property(lambda self: self.subsample[0]) dW = property(lambda self: self.subsample[1]) dilH = property(lambda self: self.filter_dilation[0]) dilW = property(lambda self: self.filter_dilation[1]) padH_l = property(lambda self: self.pad[0][0]) padH_r = property(lambda self: self.pad[0][1]) padW_l = property(lambda self: self.pad[1][0]) padW_r = property(lambda self: self.pad[1][1]) def __str__(self): return '%s{%s, %s, %s, %s, %s}' % ( self.__class__.__name__, self.border_mode, str(self.subsample), str(self.filter_dilation), str(self.num_groups), str(self.unshared)) @staticmethod def as_common_dtype(in1, in2): """ Upcast input variables if necessary. """ dtype = theano.scalar.upcast(in1.dtype, in2.dtype) return in1.astype(dtype), in2.astype(dtype) def __setstate__(self, d): self.__dict__.update(d) if not hasattr(self, 'num_groups'): self.num_groups = 1 def c_support_code(self): ccodes = blas_headers.blas_header_text() if self.blas_type == 'openblas': ccodes += blas_headers.openblas_threads_text() elif self.blas_type == 'mkl': ccodes += blas_headers.mkl_threads_text() return ccodes def c_libraries(self): return ldflags() def c_compile_args(self): compile_args = ldflags(libs=False, flags=True) compile_args += super(BaseCorrMM, self).c_compile_args() return compile_args def c_lib_dirs(self): return ldflags(libs=False, libs_dir=True) def c_header_dirs(self): return ldflags(libs=False, include_dir=True) def c_headers(self): headers = ['<stdio.h>'] headers += super(BaseCorrMM, self).c_headers() return headers def c_code_cache_version(self): # raise this whenever modifying any of the support_code_files return (10, self.openmp, blas_header_version()) def c_support_code_apply(self, node, nodename): # REMEMBER TO RAISE c_code_cache_version when changing any of # these files sub = {} dtype = str(node.inputs[0].dtype) assert dtype in ('float32', 'float64') if dtype == 'float32': sub['gemm'] = 'sgemm_' sub['gemv'] = 'sgemv_' sub['float_type'] = 'npy_float' sub['float_typenum'] = 'NPY_FLOAT' sub['n_bytes'] = 4 sub['c_float_type'] = 'float' else: sub['gemm'] = 'dgemm_' sub['gemv'] = 'dgemv_' sub['float_type'] = 'npy_double' sub['float_typenum'] = 'NPY_DOUBLE' sub['n_bytes'] = 8 sub['c_float_type'] = 'double' if self.openmp: sub['omp_flags'] = '#pragma omp parallel for schedule(static)' sub['omp_get_max_threads'] = 'omp_get_max_threads()' sub['omp_get_thread_num'] = 'omp_get_thread_num()' if self.blas_type == 'openblas': sub['blas_set_num_threads'] = 'openblas_set_num_threads' sub['blas_get_num_threads'] = 'openblas_get_num_threads()' elif self.blas_type == 'mkl':
sub['blas_set_num_threads'] = 'mkl_set_num_threads' sub['blas_get_num_threads'] = 'mkl_get_max_threads()' else: sub['blas_set_num_threads'] = '' sub['blas_get_num_threads'] = '0' else: sub['omp_flags'] = '' sub['omp_get_max_threads'] = '1' sub['omp_get_thread_num'] = '0' sub['blas_set_num_threads'] = '' sub['blas_get_num_threads'] = '0' files = [os.path.join('c_code', 'corr_gemm.c')] codes = [ open(os.path.join(os.path.split(__file__)[0], f)).read() for f in files ] final_code = '' for code in codes: final_code += code return final_code % sub def c_code_helper(self, bottom, weights, top, sub, height=None, width=None): """ This generates the C code for CorrMM (direction="forward"), CorrMM_gradWeights (direction="backprop weights"), and CorrMM_gradInputs (direction="backprop inputs"). Depending on the direction, one of bottom, weights, top will receive the output, while the other two serve as inputs. :param bottom: Variable name of the input images in the forward pass, or the gradient of the input images in backprop wrt. inputs :param weights: Variable name of the filters in the forward pass, or the gradient of the filters in backprop wrt. weights :param top: Variable name of the output images / feature maps in the forward pass, or the gradient of the outputs in the backprop passes :param sub: Dictionary of substitutions usable to help generate the C code. :param height: If self.subsample[0] != 1, a variable giving the height of the filters for direction="backprop weights" or the height of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the height of the filters for direction="backprop weights". Ignored otherwise. :param width: If self.subsample[1] != 1, a variable giving the width of the filters for direction="backprop weights" or the width of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the width of the filters for direction="backprop weights". Ignored otherwise. """ # When subsampling, we cannot unambiguously infer the height and width # of bottom and weights from top, so we require them to be given. # Similarly, when border_mode="half", we cannot infer the weight size.
if height: height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height else: if ((self.direction != 0) and (self.dH != 1)) or ((self.direction == 1) and (self.padH_l == -1 or self.padH_r == -1)): raise ValueError( "height must be given for backprop with vertical sampling or border_mode='half'" ) height = '-1' if width: width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width else: if ((self.direction != 0) and (self.dW != 1)) or ((self.direction == 1) and (self.padW_l == -1 or self.padW_r == -1)): raise ValueError( "width must be given for backprop with horizontal sampling or border_mode='half'" ) width = '-1' return """ // Mandatory args int direction = %(params)s->direction; // forward, bprop weights, bprop inputs // Optional args int dH = %(params)s->dH; int dW = %(params)s->dW; int dilH = %(params)s->dilH; int dilW = %(params)s->dilW; int padH_l = %(params)s->padH_l; int padH_r = %(params)s->padH_r; int padW_l = %(params)s->padW_l; int padW_r = %(params)s->padW_r; int numgroups = %(params)s->num_groups; int unshared = %(params)s->unshared; PyArrayObject * bottom = %(bottom)s; PyArrayObject * weights = %(weights)s; PyArrayObject * top = %(top)s; PyArrayObject * out2 = NULL; PyArrayObject **out = NULL; switch(%(params)s->direction) { case DIRECTION_FORWARD: out = &%(top)s; break; case DIRECTION_BACKPROP_WEIGHTS: out = &%(weights)s; break; case DIRECTION_BACKPROP_INPUTS: out = &%(bottom)s; break; default: PyErr_SetString(PyExc_ValueError, "CPU CorrMM: Invalid direction."); {%(fail)s} break; } int wdim, odim; wdim = unshared ? 6 : 4; odim = 4; //Can be set to 6 later for unshared backprop wrt weights // Obtain or infer kernel width and height // (we need to know it early to be able to handle auto-padding) int kH, kW, dil_kH, dil_kW; if (direction != 1) { // weight is an input variable, we can just read its shape kH = PyArray_DIMS(weights)[wdim-2]; kW = PyArray_DIMS(weights)[wdim-1]; } else { if (%(height)s != -1) { // kernel height is specified (perhaps vertical subsampling or half padding) kH = %(height)s; } else if (padH_l == -2 || padH_r == -2) { // vertical full padding, we can infer the kernel height kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1; } else { // explicit padding, we can infer the kernel height kH = (PyArray_DIMS(bottom)[2] + padH_l + padH_r - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1; } if (%(width)s != -1) { // kernel width is specified (perhaps horizontal subsampling or half padding) kW = %(width)s; } else if (padW_l == -2 || padW_r == -2) { kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } else { kW = (PyArray_DIMS(bottom)[3] + padW_l + padW_r - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } } // Implicit dilated kernel size dil_kH = (kH - 1) * dilH + 1; dil_kW = (kW - 1) * dilW + 1; // Auto-padding if requested if (padH_l == -1 || padH_r == -1) { // vertical half padding padH_l = padH_r = dil_kH / 2; } else if (padH_l == -2 || padH_r == -2) { // vertical full padding padH_l = padH_r = dil_kH - 1; } else if (padH_l < -2 || padH_r < -2) { PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH_l and padH_r must be >= -2"); %(fail)s } if (padW_l == -1 || padW_r == -1) { // horizontal half padding padW_l = padW_r = dil_kW / 2; } else if (padW_l == -2 || padW_r == -2) { // horizontal full padding padW_l = padW_r = dil_kW - 1; } else if (padW_l < -2 || padW_r < -2) { PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW_l and padW_r must be >= -2"); %(fail)s } // Infer output shape npy_intp out_dim[6]; 
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights switch(direction) { case 0: // forward pass // output is top: (batchsize, num_filters, height, width) // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1 out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0]; out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1); if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { if (unshared) { PyErr_Format(PyExc_ValueError, "CorrMM: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3]); } else { PyErr_Format(PyExc_ValueError, "CorrMM: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3]); } %(fail)s } break; case 1: // backprop wrt. weights // output is weights: (num_filters, num_channels, height, width) // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1 out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; if (unshared){ odim = 6; out_dim[1] = (npy_intp)PyArray_DIMS(top)[2]; out_dim[2] = (npy_intp)PyArray_DIMS(top)[3]; } out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups; out_dim[wdim-2] = (npy_intp)kH; // already inferred further above out_dim[wdim-1] = (npy_intp)kW; // how convenient if (unshared) { if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0 || out_dim[4] <= 0 || out_dim[5] <= 0){ PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. weights: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } else { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. 
weights: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } break; case 2: // backprop wrt. inputs // output is bottom: (batchsize, num_channels, height, width) // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups; out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r); if (unshared) { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. inputs: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } else { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. inputs: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } break; default: PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n"); %(fail)s } // Prepare output array int typenum; int failure; failure = !(*out && PyArray_NDIM(*out)==odim && PyArray_IS_C_CONTIGUOUS(*out) && PyArray_DIMS(*out)[0]==out_dim[0] && PyArray_DIMS(*out)[1]==out_dim[1] && PyArray_DIMS(*out)[2]==out_dim[2] && PyArray_DIMS(*out)[3]==out_dim[3]); if (odim == 6){ failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4] && PyArray_DIMS(*out)[5]==out_dim[5]); } if ( failure ) { Py_XDECREF(*out); if (direction != 1) { typenum = PyArray_TYPE(weights); } else { typenum = PyArray_TYPE(bottom); } //Change to PyArray_ZEROS which is faster than PyArray_EMPTY. 
*out = (PyArrayObject*)PyArray_ZEROS(odim, out_dim, typenum, 0); if (NULL == *out) { if (odim == 4) { PyErr_Format(PyExc_RuntimeError, "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld", (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]); } if (odim == 6) { PyErr_Format(PyExc_RuntimeError, "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld", (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4], (long long)out_dim[5]); } %(fail)s } } // Call corrMM code out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH_l, padH_r, padW_l, padW_r, numgroups, unshared); if (out2==NULL){ %(fail)s } assert (out2 == *out); """ % dict(bottom=bottom, weights=weights, top=top, height=height, width=width, fail=sub['fail'], params=sub['params'])
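# Quick sanity sketch (pure Python, illustrative names) of the forward shape
# rule embedded in the C code above:
# top = (bottom + pad_l + pad_r - ((k - 1) * dil + 1)) // d + 1
def corr_out_dim(bottom, k, d=1, dil=1, pad_l=0, pad_r=0):
    dil_k = (k - 1) * dil + 1  # implicit dilated kernel size
    return (bottom + pad_l + pad_r - dil_k) // d + 1

assert corr_out_dim(5, 3) == 3                     # valid mode
assert corr_out_dim(5, 3, pad_l=2, pad_r=2) == 7   # full: pad = dil_k - 1
assert corr_out_dim(5, 3, pad_l=1, pad_r=1) == 5   # half: pad = dil_k // 2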
class GpuBinarySearchSorted(GpuKernelBase, Op): """ Searchsorted on GPU """ __props__ = ('context_name', 'dtype_int64') _f16_ok = True params_type = ParamsType(context=gpu_context_type, dtype_int64=bool_t) def __init__(self, context_name=None, dtype_int64=False): self.context_name = context_name self.dtype_int64 = dtype_int64 def get_params(self, node): return self.params_type.get_params(self, context=get_context( self.context_name), dtype_int64=self.dtype_int64) def make_node(self, d, x): d = as_gpuarray_variable(d, context_name=self.context_name) x = as_gpuarray_variable(x, context_name=self.context_name) assert d.ndim == 1 assert x.ndim == 1 broadcastable = (False, ) otype = GpuArrayType(dtype='int64' if self.dtype_int64 else 'int32', broadcastable=broadcastable, context_name=self.context_name) return gof.Apply(self, [d, x], [otype()]) def infer_shape(self, node, in_shapes): _, x_shape = in_shapes return [x_shape] def grad(self, inp, grads): return [grad_not_implemented(self, i, inp[i]) for i in range(2)] def gpu_kernels(self, node, name): dtype_d = node.inputs[0].dtype type_d = gpuarray.dtype_to_ctype(dtype_d) dtype_x = node.inputs[1].dtype type_x = gpuarray.dtype_to_ctype(dtype_x) dtype_y = node.outputs[0].dtype type_y = gpuarray.dtype_to_ctype(dtype_y) work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d)) load_d = load_w(dtype_d) work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x)) load_x = load_w(dtype_x) code = """ #include "cluda.h" KERNEL void binsearchsorted(const ga_ssize stridesD0, GLOBAL_MEM %(type_d)s *d, ga_size d_off, const ga_ssize stridesX0, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_size lx, ga_ssize ld) { d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off); x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off); y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off); ga_size index = threadIdx.x + blockIdx.x * blockDim.x; if (index < lx) { ga_long a = 0; ga_long b = (ga_long)(ld - 1); %(work_d)s minval = %(load_d)s(d[a]); %(work_d)s maxval = %(load_d)s(d[b * stridesD0]); %(work_x)s val = %(load_x)s(x[index * stridesX0]); if (val > maxval) { a = (ga_long)ld; b = (ga_long)ld; } else if (val <= minval) { a = 0; b = 0; } while (b - a > 0) { ga_long h = (b + a) / 2; %(work_d)s t = %(load_d)s(d[h * stridesD0]); if (val < t) { b = h; } else { a = h + 1; } } y[index * stridesY0] = b; } }""" % dict(type_d=type_d, type_x=type_x, type_y=type_y, work_d=work_d, load_d=load_d, work_x=work_x, load_x=load_x, name=name) return [ Kernel(code=code, name="binsearchsorted", params=[ gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE ], flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y), objvar='k_binsearchsorted_' + name) ] def c_headers(self): return [ '<numpy_compat.h>', '<gpuarray_helper.h>', '<gpuarray/types.h>' ] def c_header_dirs(self): return [gpuarray_helper_inc_dir()] def c_code(self, node, name, inp, out, sub): #TODO: fix error msg d, x = inp y, = out fail = sub['fail'] params = sub['params'] typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) kname = self.gpu_kernels(node, name)[0].objvar s = """ int err; size_t dimd = ((size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(d)s))[0]; size_t dimx = ((size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(x)s))[0]; size_t ls = 1024; size_t gs = (dimx / 1024) + 1; size_t out_dims[1] = {dimx}; size_t itemsize_d = 1; 
size_t itemsize_x = 1; size_t itemsize_y = 1; ssize_t stridesD0 = 1; ssize_t stridesX0 = 1; ssize_t stridesY0 = 1; if (%(y)s == NULL || %(y)s->ga.nd != 1 || %(y)s->ga.dimensions[0] != dimx) { Py_CLEAR(%(y)s); %(y)s = pygpu_zeros(1, out_dims, %(typecode)s, GA_C_ORDER, %(params)s->context, Py_None); } if (%(y)s == NULL) { %(fail)s } itemsize_d = GpuArray_ITEMSIZE(&%(d)s->ga); itemsize_x = GpuArray_ITEMSIZE(&%(x)s->ga); itemsize_y = GpuArray_ITEMSIZE(&%(y)s->ga); stridesD0 = PyGpuArray_STRIDES(%(d)s)[0] / itemsize_d; stridesX0 = PyGpuArray_STRIDES(%(x)s)[0] / itemsize_x; stridesY0 = PyGpuArray_STRIDES(%(y)s)[0] / itemsize_y; err = binsearchsorted_call(1, &gs, &ls, 0, stridesD0, %(d)s->ga.data, %(d)s->ga.offset, stridesX0, %(x)s->ga.data, %(x)s->ga.offset, stridesY0, %(y)s->ga.data, %(y)s->ga.offset, dimx, (ssize_t)dimd); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: binsearchsorted: %%s. n=%%lu, m=%%lu.", GpuKernel_error(&%(kname)s, err), (unsigned long)dimx, (unsigned long)dimd); %(fail)s; } """ % locals() return s def c_code_cache_version(self): return (1, )
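# Pure-Python mirror of the kernel's bisection loop above (illustrative; `d`
# must be sorted ascending, as the Op assumes).
import numpy as np

def binsearch(d, val):
    a, b = 0, len(d) - 1
    if val > d[b]:
        return len(d)
    if val <= d[0]:
        return 0
    while b - a > 0:
        h = (a + b) // 2
        if val < d[h]:
            b = h
        else:
            a = h + 1
    return b

d = np.array([1, 3, 5, 7])
assert [binsearch(d, v) for v in (0, 2, 4, 8)] == [0, 1, 2, 4]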
class QuadraticOpFunc(Op): __props__ = ('a', 'b', 'c') params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type) def __init__(self, a, b, c): self.a = a self.b = b self.c = c def make_node(self, x): x = tensor.as_tensor_variable(x) return Apply(self, [x], [x.type()]) def perform(self, node, inputs, output_storage, coefficients): x = inputs[0] y = output_storage[0] y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c def c_code_cache_version(self): return (1, 5) def c_support_code_apply(self, node, name): float_type = node.inputs[0].type.dtype_specs()[1] return """ /* Computes: x = a*x*x + b*x + c for x in tensor. */ int quadratic_%(name)s(PyArrayObject* tensor, %(float_type)s a, %(float_type)s b, %(float_type)s c) { NpyIter* iterator = NpyIter_New(tensor, NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_KEEPORDER, NPY_NO_CASTING, NULL); if(iterator == NULL) { PyErr_SetString(PyExc_RuntimeError, "Unable to iterate over a tensor for an elemwise operation."); return -1; } NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL); char** data_ptr = NpyIter_GetDataPtrArray(iterator); npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator); npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator); do { char* data = *data_ptr; npy_intp stride = *stride_ptr; npy_intp count = *innersize_ptr; while(count) { %(float_type)s x = *((%(float_type)s*)data); *((%(float_type)s*)data) = a*x*x + b*x + c; data += stride; --count; } } while(get_next(iterator)); NpyIter_Deallocate(iterator); return 0; } """ % { 'name': name, 'float_type': float_type } def c_code(self, node, name, inputs, outputs, sub): return """ %(float_type)s a = (%(float_type)s) (*(npy_float64*) PyArray_GETPTR1(%(coeff)s->a, 0)); // 0-D TensorType. %(float_type)s b = %(coeff)s->b; // Scalar. %(float_type)s c = (%(float_type)s) PyFloat_AsDouble(%(coeff)s->c); // Generic. Py_XDECREF(%(Y)s); %(Y)s = (PyArrayObject*)PyArray_EMPTY(PyArray_NDIM(%(X)s), PyArray_DIMS(%(X)s), PyArray_TYPE(%(X)s), PyArray_IS_F_CONTIGUOUS(%(X)s)); if (PyArray_CopyInto(%(Y)s, %(X)s) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to copy input into output."); %(fail)s }; if (quadratic_%(name)s(%(Y)s, a, b, c) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to compute quadratic function."); %(fail)s } """ % dict(name=name, coeff=sub['params'], fail=sub['fail'], X=inputs[0], Y=outputs[0], float_type=node.inputs[0].type.c_element_type())
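# Hypothetical usage of the Op above (assumes the tensor_type_0d, scalar_type
# and generic_type definitions from this test module, plus a working C
# compiler; the values chosen here are ours):
import numpy as np
import theano
import theano.tensor as tensor

quadratic = QuadraticOpFunc(2.0, 3.0, 4.5)  # a, b, c coefficients
x = tensor.vector()
f = theano.function([x], quadratic(x))
# f([0., 1.]) should give a*x**2 + b*x + c == [4.5, 9.5]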
def test_params_type_filtering(self): shape_tensor5 = (1, 2, 2, 3, 2) size_tensor5 = shape_tensor5[0] * shape_tensor5[1] * shape_tensor5[2] * shape_tensor5[3] * shape_tensor5[4] random_tensor = numpy.random.normal(size=size_tensor5).reshape(shape_tensor5) w = ParamsType(a1=TensorType('int32', (False, False)), a2=TensorType('float64', (False, False, False, False, False)), a3=Generic()) # With a value that does not match the params type. o = Params(w, a1=numpy.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int64'), a2=random_tensor.astype('float32'), a3=2000) # should fail (o.a1 is not int32, o.a2 is not float64) self.assertRaises(TypeError, w.filter, o, True) # should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed) self.assertRaises(TypeError, w.filter, o, False, False) # Should pass. w.filter(o, strict=False, allow_downcast=True) # With a value that matches the params type. o1 = Params(w, a1=numpy.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'), a2=random_tensor.astype('float64'), a3=2000) # All should pass. w.filter(o1, strict=True) w.filter(o1, strict=False, allow_downcast=False) w.filter(o1, strict=False, allow_downcast=True) # Check values_eq and values_eq_approx. o2 = Params(w, a1=numpy.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'), a2=random_tensor.astype('float64'), a3=2000) assert w.values_eq(o1, o2) assert w.values_eq_approx(o1, o2) # Check values_eq_approx. # NB: I don't know exactly which kinds of differences are rejected by values_eq but accepted by values_eq_approx. # So, I just play a little with float values. o3 = Params(w, a1=numpy.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'), a2=(random_tensor.astype('float32') * 10 / 2.2 * 2.19999999999 / 10).astype('float64'), a3=2000.0 - 0.00000000000000001) assert w.values_eq_approx(o1, o3)
class GpuSparseBlockGemv(COp): """ GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more information. This should not be directly called since the interface is subject to change without notice. Use the sandbox.blocksparse.sparse_block_dot() function for a stable interface. """ __props__ = ('inplace', ) params_type = ParamsType(inplace=bool_t, context=gpu_context_type) # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False. def __init__(self, inplace=False): COp.__init__(self, "c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)") self.inplace = inplace if self.inplace: self.destroy_map = {0: [0]} def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_header_dirs(self): return [gpuarray_helper_inc_dir()] def c_headers(self): return [ '<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>', '<gpuarray_helper.h>' ] def make_node(self, o, W, h, inputIdx, outputIdx): ctx = infer_context_name(o, W, h) o = as_gpuarray_variable(o, ctx) W = as_gpuarray_variable(W, ctx) h = as_gpuarray_variable(h, ctx) inputIdx = as_tensor_variable(inputIdx) outputIdx = as_tensor_variable(outputIdx) assert o.ndim == 3 assert W.ndim == 4 assert h.ndim == 3 assert inputIdx.ndim == 2 assert outputIdx.ndim == 2 assert inputIdx.type.dtype in discrete_dtypes assert outputIdx.type.dtype in discrete_dtypes return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()]) def infer_shape(self, node, input_shapes): return [input_shapes[0]] def grad(self, inputs, grads): o, W, h, inputIdx, outputIdx = inputs go = grads[0] Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx) hgrad = gpu_sparse_block_gemv(h.zeros_like(), W.dimshuffle( (1, 0, 3, 2)), go, outputIdx, inputIdx) return [ go, Wgrad, hgrad, grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"), grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense") ]
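# Dense NumPy reference (illustrative, following the SparseBlockGemv docstring
# referenced above) of what the kernel computes:
# o[b, j] += sum_i h[b, i] . W[inputIdx[b, i], outputIdx[b, j]]
import numpy as np

def sparse_block_gemv_ref(o, W, h, inputIdx, outputIdx):
    out = o.copy()
    for b in range(o.shape[0]):          # batch
        for j in range(o.shape[1]):      # output windows
            for i in range(h.shape[1]):  # input windows
                out[b, j] += np.dot(h[b, i], W[inputIdx[b, i], outputIdx[b, j]])
    return out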
class GpuMagmaSVD(GpuMagmaBase): """Computes the svd of a matrix :math:`A` using magma library. .. warning:: Because of implementation constraints, this Op returns outputs in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd` to get them in expected order ``U, S, VT``. """ __props__ = ("full_matrices", "compute_uv") _cop_num_inputs = 1 _cop_num_outputs = 3 check_input = False params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type) def __init__(self, full_matrices=True, compute_uv=True): self.full_matrices = full_matrices self.compute_uv = compute_uv ExternalCOp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)") def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) A = gpu_contiguous(A) if A.ndim != 2: raise LinAlgError("Matrix rank error") if A.dtype != "float32": raise TypeError("only `float32` is supported for now") if self.compute_uv: return theano.Apply( self, [A], # return S, U, VT [ GpuArrayType( A.dtype, broadcastable=[False], context_name=ctx_name )(), A.type(), A.type(), ], ) else: return theano.Apply( self, [A], # return only S [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()], ) def prepare_node(self, node, storage_map, compute_map, impl): super().prepare_node(node, storage_map, compute_map, impl) # Check node to prevent eventual errors with old pickled nodes. if self.compute_uv: A, B, C = node.outputs # We expect order: S (vector), U (matrix), VT (matrix) assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, ( "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order." ) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def infer_shape(self, fgraph, node, shapes): (x_shape,) = shapes M, N = x_shape K = tensor.minimum(M, N) s_shape = (K,) if self.compute_uv: u_shape = (M, M) if self.full_matrices else (M, K) vt_shape = (N, N) if self.full_matrices else (K, N) return [s_shape, u_shape, vt_shape] else: return [s_shape]
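# Shape bookkeeping mirrored from infer_shape above (pure Python, illustrative);
# note that S comes first, matching this Op's (S, U, VT) output order.
def svd_shapes(M, N, full_matrices=True, compute_uv=True):
    K = min(M, N)
    s_shape = (K,)
    if not compute_uv:
        return [s_shape]
    u_shape = (M, M) if full_matrices else (M, K)
    vt_shape = (N, N) if full_matrices else (K, N)
    return [s_shape, u_shape, vt_shape]

assert svd_shapes(4, 3) == [(3,), (4, 4), (3, 3)]
assert svd_shapes(4, 3, full_matrices=False) == [(3,), (4, 3), (3, 3)]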
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): """ Images2Neibs for the GPU. """ params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, ten4, neib_shape, neib_step=None): ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4)) neib_shape = T.as_tensor_variable(neib_shape) if neib_step is None: neib_step = neib_shape else: neib_step = T.as_tensor_variable(neib_step) assert ten4.ndim == 4 assert neib_shape.ndim == 1 assert neib_step.ndim == 1 assert neib_shape.dtype in T.integer_dtypes assert neib_step.dtype in T.integer_dtypes return Apply(self, [ten4, neib_shape, neib_step], [GpuArrayType(broadcastable=(False, False), dtype=ten4.type.dtype, context_name=ten4.type.context_name)()]) def c_code_cache_version(self): return (13,) def c_headers(self): return ['<numpy_compat.h>', '<gpuarray/types.h>'] def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here. mode_constants = self.BORDER_MODE.c_support_code() kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """ // a version that uses less registers but doesn't work in all cases. %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); ga_int i = LID_1; // loop over c { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } ga_int j = LID_0; // loop over d { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED){ ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; 
if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } }""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants) params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """ %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (ga_int i = LID_1; i < c; i+=LDIM_1) { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } // loop over d for (ga_int j = LID_0; j < d; j+=LDIM_0) { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED) { ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } } """ % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants) params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels def c_support_code(self): return """ template <typename T> static T ceil_intdiv(T a, T b) { return (a/b) + ((a % b) ? 
1: 0); } """ def c_code(self, node, name, inp, out, sub): err_check = """ if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: *fptr: %%s.", GpuKernel_error(fptr, err)); %(fail)s; } """ % dict(fail=sub['fail']) sync = "" if config.gpuarray.sync: sync = """ err = GpuArray_sync(&%(z)s->ga); %(err_check)s """ % dict(z=out[0], err_check=err_check) # NB: To reduce C code variability: # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node) # For typecode_z, I use ten4->ga.typecode (for same reason as above) return """ int grid_c = -1; int grid_d = -1; size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga); size_t itemsize_z = itemsize_ten4; int typecode_z = %(ten4)s->ga.typecode; { if (PyGpuArray_NDIM(%(ten4)s) != 4) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: ten4 wrong rank"); %(fail)s; } if (PyArray_NDIM(%(neib_shape)s) != 1) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: neib_shape wrong rank"); %(fail)s; } if (PyArray_DIMS(%(neib_shape)s)[0] != 2) { PyErr_Format(PyExc_ValueError, "GpuImages2Neibs: neib_shape has to contain two" " elements"); %(fail)s; } const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); if (step_x <=0 || step_y <=0) { PyErr_Format(PyExc_ValueError, "neib_step wrong step; values <= 0. Got %%lld %%lld.", (long long) step_x, (long long) step_y); %(fail)s; } if (c <=0 || d <=0) { PyErr_Format(PyExc_ValueError, "neib_shape values <= 0. Got %%lld %%lld.", (long long)c, (long long)d); %(fail)s; } if (%(params)s->mode == MODE_WRAP_CENTERED) { if (c%%2!=1 || d%%2!=1){ PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: the wrap_centered mode requires patches with odd shapes"); %(fail)s; } if ( PyGpuArray_DIMS(%(ten4)s)[2] < c || PyGpuArray_DIMS(%(ten4)s)[3] < d) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: in wrap_centered mode," " image shapes smaller than the patch shapes" " are not supported: neib_shape=(%%d,%%d)," " ten4[2:]=[%%d,%%d]", c, d, PyGpuArray_DIMS(%(ten4)s)[2], PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]), (size_t)step_x); grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]), (size_t)step_y); } else if (%(params)s->mode == MODE_VALID) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_IGNORE_BORDERS) { //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_HALF) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); } else if (%(params)s->mode == MODE_FULL) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%ld, neib_step[0]=%%ld and" " ten4.shape[2]=%%ld not consistent", (long int)c, (long int)step_x, (long int)(PyGpuArray_DIMS(%(ten4)s)[2])); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%ld, neib_step[1]=%%ld and" " ten4.shape[3]=%%ld not consistent", (long int)d, (long int)step_y, (long int)(PyGpuArray_DIMS(%(ten4)s)[3])); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y); } else { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: unknown mode %%d", %(params)s->mode); %(fail)s; } // new dimensions for z const int z_dim1 = c * d; const int z_dim0 = grid_c * grid_d *
PyGpuArray_DIMS(%(ten4)s)[1] * PyGpuArray_DIMS(%(ten4)s)[0]; if ((NULL == %(z)s) || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0) || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1)) { Py_XDECREF(%(z)s); size_t dims[2]; dims[0] = z_dim0; dims[1] = z_dim1; %(z)s = pygpu_empty(2, dims, typecode_z, GA_C_ORDER, %(params)s->context, Py_None); if (!%(z)s) { PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" " failed to alloc z output"); %(fail)s; } } } { // NESTED SCOPE const int mode = %(params)s->mode; const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0]; const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1]; const int height = PyGpuArray_DIMS(%(ten4)s)[2]; const int width = PyGpuArray_DIMS(%(ten4)s)[3]; const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); size_t threads_per_block[3] = {d, c, 1}; //get the max threads per blocks size_t max_threads_dim; int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); %(fail)s; } while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--; while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--; //Make bigger block to have better memory access pattern and //a higher core utilisation. for smaller patch size while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 && threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){ threads_per_block[2]++; } int nb_block; if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0) nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]; else nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1; size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1}; GpuKernel *fptr; if(threads_per_block[0]==d && threads_per_block[1]==c){ fptr = &k_multi_warp_less_%(name)s; }else{ fptr = &k_multi_warp_%(name)s; } /* printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]); */ size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4; size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4; size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4; size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4; size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z; size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z; void *kernel_params[] = {(void *)&mode, (void *)&nb_batch, (void *)&nb_stack, (void *)&height, (void *)&width, (void *)&c, (void *)&d, (void *)&step_x, (void *)&step_y, (void *)&grid_c, (void *)&grid_d, (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2, (void *)&stride_A3, (void *)%(ten4)s->ga.data, (void *)&%(ten4)s->ga.offset, (void *)&stride_Z0, (void *)&stride_Z1, (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset}; err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params); %(err_check)s %(sync)s } // END NESTED SCOPE """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0], dtype_neib_shape=node.inputs[1].dtype, dtype_neib_step=node.inputs[2].dtype, err_check=err_check, 
sync=sync, name=name, params=sub['params'], fail=sub['fail']) def perform(self, node, inp, out, params): # Disable the perform method from the CPU version Op.perform(self, node, inp, out, params)
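# CPU sketch (NumPy, valid mode only) of the patch layout the kernels above
# produce: rows ordered by (batch, stack, patch_row, patch_col), columns by
# the row-major raveled patch. The helper name is illustrative only.
import numpy as np

def images2neibs_valid(ten4, c, d, step_x, step_y):
    nb_batch, nb_stack, height, width = ten4.shape
    grid_c = 1 + (height - c) // step_x
    grid_d = 1 + (width - d) // step_y
    out = np.empty((nb_batch * nb_stack * grid_c * grid_d, c * d), ten4.dtype)
    row = 0
    for n in range(nb_batch):
        for s in range(nb_stack):
            for a in range(grid_c):
                for b in range(grid_d):
                    patch = ten4[n, s, a * step_x:a * step_x + c,
                                 b * step_y:b * step_y + d]
                    out[row] = patch.ravel()
                    row += 1
    return out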
class GpuMagmaSVD(COp): """Computes the svd of a matrix :math:`A` using magma library. .. warning:: Because of implementation constraints, this Op returns outputs in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd` to get them in expected order ``U, S, VT``. """ __props__ = ('full_matrices', 'compute_uv') _cop_num_inputs = 1 _cop_num_outputs = 3 check_input = False params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type) def __init__(self, full_matrices=True, compute_uv=True): self.full_matrices = full_matrices self.compute_uv = compute_uv COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)') def c_headers(self): return [ 'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h', 'gpuarray_helper.h', 'magma.h' ] def c_header_dirs(self): dirs = [os.path.dirname(__file__), pygpu.get_include()] if config.magma.include_path: dirs.append(config.magma.include_path) return dirs def c_libraries(self): return ['magma'] def c_lib_dirs(self): if config.magma.library_path: return [config.magma.library_path] return [] def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) if A.ndim != 2: raise LinAlgError("Matrix rank error") assert A.dtype == 'float32' if self.compute_uv: return theano.Apply( self, [A], # return S, U, VT [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)(), A.type(), A.type() ]) else: return theano.Apply( self, [A], # return only S [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)() ]) def prepare_node(self, node, storage_map, compute_map, impl): # Check node to prevent eventual errors with old pickled nodes. if self.compute_uv: A, B, C = node.outputs # We expect order: S (vector), U (matrix), VT (matrix) assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \ "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \ "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order." def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def infer_shape(self, node, shapes): x_shape, = shapes M, N = x_shape K = tensor.minimum(M, N) s_shape = (K, ) if self.compute_uv: u_shape = (M, M) if self.full_matrices else (M, K) vt_shape = (N, N) if self.full_matrices else (K, N) return [s_shape, u_shape, vt_shape] else: return [s_shape]
class GpuAveragePoolGrad(CGpuKernelBase): """ Implement the grad of average pooling on the gpu. """ __props__ = ('ignore_border', 'mode', 'ndim') params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type) def __init__(self, ignore_border, mode='average_inc_pad', ndim=2): self.ndim = ndim self.ignore_border = ignore_border if mode == 'average': mode = 'average_inc_pad' self.mode = mode CGpuKernelBase.__init__(self, ['pool_ave_grad.c'], 'APPLY_SPECIFIC(ave_pool_grad)') assert mode in ('sum', 'average_inc_pad', 'average_exc_pad') assert ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self): return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h'] def c_header_dirs(self): return [os.path.dirname(__file__), pygpu.get_include()] def make_node(self, inp, out_grad, ws, stride=None, pad=None): ctx_name = infer_context_name(inp, out_grad) nd = self.ndim inp = as_gpuarray_variable(inp, ctx_name) assert (inp.ndim == nd + 2) out_grad = as_gpuarray_variable(out_grad, ctx_name) assert (out_grad.ndim == nd + 2) assert (out_grad.ndim == inp.ndim) if stride is None: stride = ws if pad is None: pad = (0, ) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.mode == 'average_exc_pad': raise ValueError('Padding must be zero for modes other than average_exc_pad') ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in theano.tensor.int_dtypes: raise TypeError('Window shape parameters must be ints.') if stride.dtype not in theano.tensor.int_dtypes: raise TypeError('Stride parameters must be ints.') if pad.dtype not in theano.tensor.int_dtypes: raise TypeError('Padding parameters must be ints.') ws = theano.tensor.cast(ws, 'int64') stride = theano.tensor.cast(stride, 'int64') pad = theano.tensor.cast(pad, 'int64') return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()]) def infer_shape(self, node, in_shapes): return [in_shapes[0]] def grad(self, inp, grads): x, gz, ws, stride, pad = inp ggx, = grads return ([ theano.tensor.zeros_like(x), GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(ggx, ws, stride, pad) ] + [theano.gradient.DisconnectedType()() for i in inp[2:]]) def connection_pattern(self, node): return [[1], [1], [0], [0], [0]]