def _macros(self, node, name):
    define_template = "#define %s %s\n"
    undef_template = "#undef %s\n"
    define_macros = []
    undef_macros = []
    rdim = get_scalar_constant_value(node.inputs[2])
    vdim = get_scalar_constant_value(node.inputs[3])
    define_macros.append(define_template % ("DIM_SPECIFIC(str)",
                                            "str##_%d_%d" % (rdim, vdim)))
    undef_macros.append(undef_template % "DIM_SPECIFIC")

    consts = {"REF_DIM": str(rdim),
              "VAL_DIM": str(vdim),
              "KEY_DIM": str(rdim),
              "MIN_QUAD_PROBES": str(MIN_QUAD_PROBES),
              "GID_0": "filt_fakegpu_GID_0",
              "LID_0": "filt_fakegpu_LID_0",
              "LDIM_0": "filt_fakegpu_LDIM_0",
              "KERNEL": "",
              "GLOBAL_MEM": ""}

    for k, v in consts.items():
        define_macros.append(define_template % (k, v))
        undef_macros.append(undef_template % k)

    return ''.join(define_macros), ''.join(undef_macros)
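For illustration, with rdim = 5 and vdim = 3 the define prelude produced by this helper would read roughly as below (illustrative only: the dict-driven lines follow Python's dict iteration order, and MIN_QUAD_PROBES is a module-level constant whose value is not shown here); a matching #undef is emitted for each name.

#define DIM_SPECIFIC(str) str##_5_3
#define REF_DIM 5
#define VAL_DIM 3
#define KEY_DIM 5
#define MIN_QUAD_PROBES <MIN_QUAD_PROBES>
#define GID_0 filt_fakegpu_GID_0
#define LID_0 filt_fakegpu_LID_0
#define LDIM_0 filt_fakegpu_LDIM_0
#define KERNEL
#define GLOBAL_MEM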
def make_node(self, ref, values, ref_dim, val_dim, *_hash):
    assert values.ndim == 3
    ref = as_tensor_variable(ref.astype("float32"))
    values = as_tensor_variable(values.astype("float32"))
    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)
    if "int" not in str(ref_dim.dtype) or "int" not in str(val_dim.dtype):
        raise ValueError("ref_dim and val_dim must be integers.")

    scaled_ref = ref * float(np.sqrt(2 / 3) * (ref_dim + 1))

    if len(_hash) == 0:
        hash_struct = PermutohedralHashTable()(scaled_ref, ref_dim)
    else:
        assert len(_hash) == 6
        hash_struct = [as_tensor_variable(v) for v in _hash]

    # Should we not do this?
    bcast = [False for _ in range(3)]
    if val_dim == 1:
        bcast[0] = True
    out_type = values.type.clone(broadcastable=bcast)

    ref_dim = constant(ref_dim, dtype="int32", name="ref_dim")
    val_dim = constant(val_dim, dtype="int32", name="val_dim")

    inputs = [ref, values, ref_dim, val_dim] + hash_struct
    return Apply(self, inputs, [out_type()])
def grad(self, inputs, ograds):
    ref, values, ref_dim, val_dim = inputs[:4]
    hash_struct = inputs[4:]
    ograd = ograds[0]

    ref_dim = get_scalar_constant_value(ref_dim)
    val_dim = get_scalar_constant_value(val_dim)

    def _conv(x):
        return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

    # Since the kernels are separable and symmetric, the gradient w.r.t.
    # input is just the same filtering applied to the output grads.
    grad_i = _conv(ograd)

    def _gradr(r_i, vals, og, *args):
        return (og * (_conv(vals * r_i) - r_i * _conv(vals)) +
                vals * (_conv(og * r_i) - r_i * _conv(og)))

    grad_r, _ = theano.scan(fn=_gradr,
                            sequences=[ref],
                            non_sequences=[values, ograd] + hash_struct,
                            outputs_info=None)

    grad_r = grad_r.sum(axis=1, acc_dtype="float32")

    grads = [DisconnectedType()() for i in range(len(inputs))]
    grads[0] = grad_r
    grads[1] = grad_i
    return grads
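For reference, here is the identity `_gradr` implements, as a sketch assuming unnormalized Gaussian weights $w_{ij} = \exp(-\tfrac{1}{2}\lVert r_i - r_j \rVert^2)$, with $W$ the (symmetric) filtering matrix, $v$ the values, $g$ the output gradient, and $r^{(k)}$ the $k$-th reference channel. Differentiating $L = \sum_{c,j} g_c\, w_{cj}\, v_j$ with respect to $r_i^{(k)}$ gives

$$\frac{\partial L}{\partial r_i^{(k)}} = g_i\left[\big(W(v \odot r^{(k)})\big)_i - r_i^{(k)}\,(W v)_i\right] + v_i\left[\big(W(g \odot r^{(k)})\big)_i - r_i^{(k)}\,(W g)_i\right],$$

whose two brackets are exactly `_conv(vals * r_i) - r_i * _conv(vals)` and `_conv(og * r_i) - r_i * _conv(og)` above, evaluated per reference channel by the scan.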
def gpu_kernels(self, node, name):
    dim = get_scalar_constant_value(node.inputs[1])
    flags = Kernel.get_flags(node.inputs[0].dtype)

    def_macros, undef_macros = self._macros(node, name)
    hsup = (self._hash_support_code() + "\n" + self._lookup_code())

    knames = ["build_hash", "dedup", "find_valid"]
    kcodes = ["".join(open("%s%s%s.cu" %
                           (os.path.dirname(__file__), os.path.sep, kn)
                           ).readlines()) for kn in knames]
    kcodes = ["\n".join([def_macros, hsup, code, undef_macros])
              for code in kcodes]
    kcodes = ["#include \"cluda.h\"\n" + code for code in kcodes]
    kparams = ([GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, SIZE])

    return [Kernel(code=kcode, name="%s_%d" % (kname, dim),
                   params=kparams, flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)]
def _macros(self, node, name):
    define_template = "#define %s %s\n"
    undef_template = "#undef %s\n"
    define_macros = []
    undef_macros = []
    dim = get_scalar_constant_value(node.inputs[1])
    define_macros.append(define_template % ("DIM_SPECIFIC(str)",
                                            "str##_%d" % dim))
    undef_macros.append(undef_template % "DIM_SPECIFIC")

    consts = {"REF_DIM": str(dim),
              "KEY_DIM": str(dim),
              "DR": "%s.f" % str(dim),
              "INV_DR1": "(1.f / (%s.f+1.f))" % str(dim),
              "MIN_QUAD_PROBES": str(MIN_QUAD_PROBES),
              "GID_0": "hash_fakegpu_GID_0",
              "LID_0": "hash_fakegpu_LID_0",
              "LDIM_0": "hash_fakegpu_LDIM_0",
              "KERNEL": "",
              "GLOBAL_MEM": ""}

    for k, v in consts.items():
        define_macros.append(define_template % (k, v))
        undef_macros.append(undef_template % k)

    return ''.join(define_macros), ''.join(undef_macros)
def compute_bcast(self, dist_params, size):
    """Compute the broadcast array for this distribution's `TensorType`.

    Parameters
    ----------
    dist_params: list
        Distribution parameters.
    size: int or Sequence (optional)
        Numpy-like size of the output (i.e. replications).

    """
    shape = self._infer_shape(size, dist_params)

    # Let's try to do a better job than `_infer_ndim_bcast` when
    # dimension sizes are symbolic.
    bcast = []
    for s in shape:
        s_owner = getattr(s, "owner", None)

        # Get rid of the `Assert`s added by `broadcast_shape`
        if s_owner and isinstance(s_owner.op, theano.tensor.opt.Assert):
            s = s_owner.inputs[0]

        try:
            s_val = get_scalar_constant_value(s)
        except NotScalarConstantError:
            s_val = False

        bcast += [s_val == 1]

    return bcast
def infer_shape(self, node, in_shapes):
    dim = get_scalar_constant_value(node.inputs[1])
    point_shp = in_shapes[0]
    h, w = point_shp[:2]
    N = h * w
    cap = N * (dim + 1)

    return [(cap,), (cap, dim), (dim + 1, h, w), (dim + 1, h, w),
            (cap,), (1,)]
def gaussian_filter(ref_img, values, kern_std, ref_dim=None, val_dim=None,
                    *_hash):
    """Applies a high-dimensional Gaussian filter to 'values' with pairwise
    Gaussian weights based on features in 'ref_img'.

    Parameters
    ----------
    ref_img : array_like, shape (ref_dim, H, W)
        The reference image from which to derive the pairwise Gaussian
        weights (the locations for each image pixel in a high-dimensional
        space).
    values : array_like, shape (val_dim, H, W)
        The image we are going to filter.
    kern_std : array_like, shape (ref_dim, )
        Standard deviation of the Gaussian filter in each dimension.
    ref_dim : int or None
        The reference image dimensionality. Must be a known scalar constant.
        For a color bilateral filter, this is 5: x, y, r, g, b.
        If None, attempt to infer the dimensionality from the shape of
        'ref_img'.
    val_dim : int or None
        The image dimensionality (color channels, usually). Must be a known
        scalar constant. If None, attempt to infer the dimensionality from
        the shape of 'values'.
    """
    if ref_dim is None:
        ref_dim = get_scalar_constant_value(ref_img.shape[0])
    if val_dim is None:
        val_dim = get_scalar_constant_value(values.shape[0])

    scaled_ref = ref_img / kern_std[:, np.newaxis, np.newaxis]
    return GaussianFilter()(scaled_ref, values, ref_dim, val_dim, *_hash)
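A hedged usage sketch of `gaussian_filter` as a color bilateral filter, assuming the op's module is importable alongside Theano; the image size and the standard deviations are illustrative only.

import numpy as np
import theano
import theano.tensor as T

# Build a (5, H, W) reference image: pixel coordinates plus RGB features.
H, W = 64, 64
img = np.random.rand(3, H, W).astype("float32")
yy, xx = np.mgrid[0:H, 0:W].astype("float32")
ref = np.concatenate([xx[None], yy[None], img], axis=0)

ref_v = theano.shared(ref)
val_v = theano.shared(img)
# Spatial std-devs in pixels, color std-devs in intensity units (illustrative).
kern_std = T.constant(np.array([8., 8., .1, .1, .1], dtype="float32"))

out = gaussian_filter(ref_v, val_v, kern_std, ref_dim=5, val_dim=3)
filtered = theano.function([], out)()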
def local_concatenateGrad_mkl(node):
    if not mkl_available():
        return

    if not isinstance(node.op, Split):
        return

    if node.inputs[0].type.ndim != 4:
        return

    try:
        gz, axis, splits = node.inputs
        if not isinstance(axis, integer_types):
            try:
                axis = int(get_scalar_constant_value(axis))
            except NotScalarConstantError:
                return

        if isinstance(axis, integer_types):
            # MKL Concatenate only supports axis=1
            if axis != 1:
                return

        # Retrieve the inputs to Join op
        #    inp_0        inp_1          inp
        #      |            |             |
        # Splits <- MakeVector <- [Subtensor...] <- Shape <- inputs
        if not isinstance(splits.owner.op, theano.tensor.opt.MakeVector):
            return

        tensors = []
        for inp_0 in splits.owner.inputs:
            if not isinstance(inp_0.owner.op,
                              theano.tensor.subtensor.Subtensor):
                return

            inp_1 = inp_0.owner.inputs[0]
            if not isinstance(inp_1.owner.op, theano.compile.ops.Shape):
                return

            inp = inp_1.owner.inputs[0]
            tensors.append(inp)

        tensors_internal = [U2IConcatenate()(x) for x in tensors]
        new_inputs = [axis] + tensors_internal
        z_internal = mkl_concatenate.Concatenate()(*new_inputs)
        gz_internal = I2UGrad()(z_internal, gz)

        concatenateGradOut = mkl_concatenate.ConcatenateGrad()(
            gz_internal, axis, *tensors_internal)
        gx_user = [U2IGrad()(_x, _gz)
                   for _x, _gz in zip(tensors, concatenateGradOut)]

        rval = gx_user
        return rval
    except Exception as e:
        msg = ('Failed to apply local opt to Op %s. '
               'Exception message: %s\n') % (node.op, str(e))
        _logger.warning(msg)
        return
def local_0_dot_x(node):
    if not isinstance(node.op, T.Dot):
        return False

    x = node.inputs[0]
    y = node.inputs[1]
    replace = False
    try:
        if get_scalar_constant_value(x) == 0:
            replace = True
    except NotScalarConstantError:
        pass

    try:
        if get_scalar_constant_value(y) == 0:
            replace = True
    except NotScalarConstantError:
        pass

    if replace:
        constant_zero = T.constant(0, dtype=node.outputs[0].type.dtype)
        if x.ndim == 2 and y.ndim == 2:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[1], y.shape[0]))
            return [T.alloc(constant_zero, x.shape[0], y.shape[1])]
        elif x.ndim == 1 and y.ndim == 2:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[0], y.shape[0]))
            return [T.alloc(constant_zero, y.shape[1])]
        elif x.ndim == 2 and y.ndim == 1:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[1], y.shape[0]))
            return [T.alloc(constant_zero, x.shape[0])]
        elif x.ndim == 1 and y.ndim == 1:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[0], y.shape[0]))
            return [constant_zero]
        else:
            _logger.warning("Optimization Warning: "
                            "Optimization theano/opt.py:local_0_dot_x Found "
                            "that it could apply, but was not implemented "
                            "for dot product with these input types:\n"
                            "(%s, %s)",
                            x.type, y.type)
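A NumPy analogue of the identity this rewrite exploits: a dot product with an all-zero operand is an all-zero array whose shape keeps the outer dimensions and drops the contracted one, which is what the `T.alloc` replacement constructs (with `assert_` guarding the inner-dimension match at runtime).

import numpy as np

x = np.zeros((3, 4), dtype="float32")
y = np.random.rand(4, 5).astype("float32")
assert np.array_equal(np.dot(x, y), np.zeros((3, 5), dtype="float32"))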
def make_node(self, points, dim):
    assert points.ndim == 3
    points = as_tensor_variable(points.astype("float32"))

    dim = get_scalar_constant_value(dim)
    if "int" not in str(dim.dtype):
        raise ValueError("dim must be an integer.")

    dim = constant(dim, dtype="int32", name="dim")

    entries_type = TensorType("int32", broadcastable=(False,))
    keys_type = TensorType("int16", broadcastable=(False, False))
    neib_ent_type = TensorType("int32",
                               broadcastable=(False, False, False))
    bary_type = TensorType("float32",
                           broadcastable=points.type.broadcastable)

    valid_entries_type = TensorType("int32", broadcastable=(False,))
    n_valid_type = TensorType("int32", broadcastable=(False,))

    out_vars = [entries_type(name="hash_entries"),
                keys_type(name="hash_keys"),
                neib_ent_type(name="neighbor_entries"),
                bary_type(name="barycentric_coords"),
                valid_entries_type(name="valid_entries"),
                n_valid_type(name="n_valid")]

    # Two sets of entries can't be meaningfully compared without also
    # having the corresponding keys. Since we can only define per-output
    # comparisons, we have to hope that any time someone compares two
    # tables for equality, they will check all outputs.
    out_vars[0].tag.values_eq_approx = lambda e1, e2: True
    out_vars[2].tag.values_eq_approx = lambda e1, e2: True

    # The number of valid entries between two equivalent tables may be
    # different since it includes duplicates.
    out_vars[5].tag.values_eq_approx = lambda n1, n2: True

    def keys_comparison(k1, k2):
        k1 = [tuple(k) for k in np.asarray(k1)]
        k2 = [tuple(k) for k in np.asarray(k2)]
        return set(k1) == set(k2)
    out_vars[1].tag.values_eq_approx = keys_comparison

    def valid_entries_comparison(e1, e2):
        e1 = np.asarray(e1)
        e2 = np.asarray(e2)
        return len(np.unique(e1)) == len(np.unique(e2))
    out_vars[4].tag.values_eq_approx = valid_entries_comparison

    return Apply(self, [points, dim], out_vars)
def gpu_kernels(self, node, name):
    rdim = get_scalar_constant_value(node.inputs[2])
    vdim = get_scalar_constant_value(node.inputs[3])
    flags = Kernel.get_flags(node.inputs[0].dtype, node.inputs[1].dtype)

    def_macros, undef_macros = self._macros(node, name)
    hsup = (GpuHashTable._hash_support_code() + "\n" +
            GpuHashTable._lookup_code())

    knames = ["splat", "blur", "slice"]
    kcodes = ["".join(open("%s%s%s.cu" %
                           (os.path.dirname(__file__), os.path.sep, kn)
                           ).readlines()) for kn in knames]
    kcodes = ["\n".join([def_macros, hsup, code, undef_macros])
              for code in kcodes]
    kparams = ([GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE],
               [GpuArray, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE])

    return [Kernel(code=kcode, name="%s_%d_%d" % (kname, rdim, vdim),
                   params=kparams, flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)]
def scalarconsts_rest(inputs):
    """Partition a list of variables into two kinds:
    scalar constants, and the rest."""
    consts = []
    origconsts = []
    nonconsts = []
    for i in inputs:
        try:
            v = get_scalar_constant_value(i)
            consts.append(v)
            origconsts.append(i)
        except NotScalarConstantError:
            nonconsts.append(i)
    return consts, origconsts, nonconsts
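A minimal usage sketch, assuming `theano.tensor` is importable as `T` and the helper above is in scope; constants come back as extracted NumPy scalar values, while symbolic variables fall through to `nonconsts`.

import theano.tensor as T

x = T.scalar("x")
consts, origconsts, nonconsts = scalarconsts_rest([T.constant(2.0), x,
                                                   T.constant(3.0)])
# consts     -> [2.0, 3.0] as NumPy scalars (the extracted values)
# origconsts -> the two constant variables themselves
# nonconsts  -> [x]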
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            # MaxAndArgmax supports a variable axis,
            # but CAReduce supports only a constant axis.
            try:
                axis = get_scalar_constant_value(node.inputs[1])
            except NotScalarConstantError:
                return False

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]
def shape_dim_i(x, i):
    # print 'shape keys', shape_of.keys()
    # print 'args (x, i):', x, i
    try:
        return x.data.shape[i]
    except AttributeError:
        pass
    try:
        return int(get_scalar_constant_value(shape_of[x][i]))
    except NotScalarConstantError:
        pass
    try:
        return shape_of[x][i].eval()
    except Exception:
        return -1  # an unsatisfiable shape
def apply(self, fgraph):
    did_something = True
    while did_something:
        nodelist = fgraph.toposort()
        did_something = False
        for node in nodelist:
            if node.op == T._max_and_argmax:
                if len(node.outputs[1].clients) == 0:
                    try:
                        axis = get_scalar_constant_value(node.inputs[1])
                    except NotScalarConstantError:
                        return False
                    new = CAReduce(scal.maximum, axis)(node.inputs[0])
                    try:
                        fgraph.replace_all_validate(
                            ((node.outputs[0], new),),
                            reason=self.__class__.__name__)
                        did_something = True
                        break
                    except InconsistencyError:
                        pass
def make_node(self, x, repeats):
    x = basic.as_tensor_variable(x)
    repeats = basic.as_tensor_variable(repeats)

    if repeats.dtype not in tensor.integer_dtypes:
        raise TypeError("repeats.dtype must be an integer.")

    # Some dtypes are not supported by numpy's implementation of repeat.
    # Until another one is available, we should fail at graph construction
    # time, not wait for execution.
    ptr_bitwidth = theano.configdefaults.local_bitwidth()
    if ptr_bitwidth == 64:
        numpy_unsupported_dtypes = ("uint64",)
    if ptr_bitwidth == 32:
        numpy_unsupported_dtypes = ("uint32", "int64", "uint64")

    if repeats.dtype in numpy_unsupported_dtypes:
        raise TypeError(
            (
                "dtypes %s are not supported by numpy.repeat "
                "for the 'repeats' parameter, "
                % str(numpy_unsupported_dtypes)
            ),
            repeats.dtype,
        )

    if self.axis is None:
        broadcastable = [False]
    else:
        try:
            const_reps = basic.get_scalar_constant_value(repeats)
        except basic.NotScalarConstantError:
            const_reps = None
        if const_reps == 1:
            broadcastable = x.broadcastable
        else:
            broadcastable = list(x.broadcastable)
            broadcastable[self.axis] = False

    out_type = theano.tensor.TensorType(x.dtype, broadcastable)
    return theano.Apply(self, [x, repeats], [out_type()])
def isNaN_or_Inf_or_None(x):
    isNone = x is None
    try:
        isNaN = numpy.isnan(x)
        isInf = numpy.isinf(x)
        isStr = isinstance(x, string_types)
    except Exception:
        isNaN = False
        isInf = False
        isStr = False
    if not isNaN and not isInf:
        try:
            val = get_scalar_constant_value(x)
            isInf = numpy.isinf(val)
            isNaN = numpy.isnan(val)
        except Exception:
            isNaN = False
            isInf = False
    if isinstance(x, gof.Constant) and isinstance(x.data, string_types):
        isStr = True
    else:
        isStr = False
    return isNone or isNaN or isInf or isStr
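A minimal sketch of the guard's behavior, assuming the function and the names it uses (`numpy`, `gof`, `string_types`, `get_scalar_constant_value`) are importable; it flags None, NaN, and Inf, and passes ordinary finite values through.

import numpy

assert isNaN_or_Inf_or_None(None)
assert isNaN_or_Inf_or_None(numpy.nan)
assert isNaN_or_Inf_or_None(numpy.inf)
assert not isNaN_or_Inf_or_None(numpy.float64(1.0))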
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            # MaxAndArgmax supports a variable axis,
            # but CAReduce supports only a constant axis.
            if node.inputs[1].data is None:
                axis = None
            else:
                try:
                    axis = get_scalar_constant_value(node.inputs[1])
                except NotScalarConstantError:
                    axis = node.inputs[1]
                    if not isinstance(axis, T.TensorConstant):
                        return False
                    axis = axis.data

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]

        if len(node.outputs[0].clients) == 0:
            return [None, T._argmax(node.inputs[0], node.inputs[1])]
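A hedged sketch of when this rewrite fires, assuming a standard Theano install: `T.max` is built on top of `MaxAndArgmax`, so when only the max is kept the argmax output has no clients and the optimizer can substitute a plain `CAReduce(maximum)`.

import theano
import theano.tensor as T

x = T.matrix("x")
mx = T.max(x, axis=0)          # graph initially contains MaxAndArgmax
f = theano.function([x], mx)   # argmax unused -> optimized to CAReduce(maximum)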
def local_concatenate_mkl(node):
    if not mkl_available():
        return

    if not isinstance(node.op, Join):
        return

    if node.inputs[1].type.ndim != 4:
        return

    try:
        axis, tensors = node.inputs[0], node.inputs[1:]
        if not isinstance(axis, integer_types):
            try:
                axis = int(get_scalar_constant_value(axis))
            except NotScalarConstantError:
                return

        if isinstance(axis, integer_types):
            # MKL Concatenate only supports axis=1
            if axis != 1:
                return

        tensors_internal = [U2IConcatenate()(x) for x in tensors]
        new_inputs = [axis] + tensors_internal
        concatenateOut = mkl_concatenate.Concatenate()(*new_inputs)
        z_user = I2U()(concatenateOut)
        rval = z_user

        return [rval]
    except Exception as e:
        msg = ('Failed to apply local opt to Op %s. '
               'Exception message: %s\n') % (node.op, str(e))
        _logger.warning(msg)
        return
def c_code(self, node, name, inputs, outputs, sub):
    points = inputs[0]
    entries, keys, neib_ents, barycentric, valid_entries, n_valid = outputs
    dim = get_scalar_constant_value(node.inputs[1])
    fail = sub["fail"]

    code = """
npy_intp point_dims[3];
npy_intp entries_dim[1];
npy_intp keys_dims[2];
npy_intp neib_ents_dims[3];
npy_intp barycentric_dims[3];
npy_intp valid_entries_dim[1];
npy_intp n_valid_dim[1];

n_valid_dim[0] = 1;

point_dims[0] = PyArray_DIMS(%(points)s)[0];
point_dims[1] = PyArray_DIMS(%(points)s)[1];
point_dims[2] = PyArray_DIMS(%(points)s)[2];

npy_intp N = point_dims[1] * point_dims[2];
npy_intp cap = N*(point_dims[0]+1);

PyArrayObject* pcontig = NULL;
bool should_decref_pcontig = false;

if(point_dims[0] != %(dim)s) {
    PyErr_Format(PyExc_ValueError,
                 "hashtable error: incorrect input dim 0.\\nExpected %(dim)s got %%d",
                 point_dims[0]);
    %(fail)s;
}

if(PyArray_TYPE(%(points)s) != NPY_FLOAT) {
    PyErr_Format(PyExc_ValueError,
                 "hashtable error: incorrect dtype for points.");
    %(fail)s;
}

entries_dim[0] = cap;

keys_dims[0] = cap;
keys_dims[1] = point_dims[0];

neib_ents_dims[0] = point_dims[0]+1;
neib_ents_dims[1] = point_dims[1];
neib_ents_dims[2] = point_dims[2];

barycentric_dims[0] = point_dims[0]+1;
barycentric_dims[1] = point_dims[1];
barycentric_dims[2] = point_dims[2];

valid_entries_dim[0] = cap;

if(!valid_output_ptr(%(entries)s, NPY_INT, 1, entries_dim)) {
    Py_XDECREF(%(entries)s);
    %(entries)s = (PyArrayObject*)PyArray_EMPTY(1, entries_dim, NPY_INT, 0);
}
if(!valid_output_ptr(%(keys)s, NPY_SHORT, 2, keys_dims)) {
    Py_XDECREF(%(keys)s);
    %(keys)s = (PyArrayObject*)PyArray_ZEROS(2, keys_dims, NPY_SHORT, 0);
}
if(!valid_output_ptr(%(neib_ents)s, NPY_INT, 3, neib_ents_dims)) {
    Py_XDECREF(%(neib_ents)s);
    %(neib_ents)s = (PyArrayObject*)PyArray_ZEROS(3, neib_ents_dims, NPY_INT, 0);
}
if(!valid_output_ptr(%(barycentric)s, NPY_FLOAT, 3, barycentric_dims)) {
    Py_XDECREF(%(barycentric)s);
    %(barycentric)s = (PyArrayObject*)PyArray_ZEROS(3, barycentric_dims, NPY_FLOAT, 0);
}
if(!valid_output_ptr(%(valid_entries)s, NPY_INT, 1, valid_entries_dim)) {
    Py_XDECREF(%(valid_entries)s);
    %(valid_entries)s = (PyArrayObject*)PyArray_ZEROS(1, valid_entries_dim, NPY_INT, 0);
}
if(!valid_output_ptr(%(n_valid)s, NPY_INT, 1, n_valid_dim)) {
    Py_XDECREF(%(n_valid)s);
    %(n_valid)s = (PyArrayObject*)PyArray_ZEROS(1, n_valid_dim, NPY_INT, 0);
} else {
    PyArray_FillWithScalar(%(n_valid)s, PyLong_FromLong(0));
}

if (!(%(entries)s && %(keys)s && %(neib_ents)s && %(barycentric)s &&
      %(valid_entries)s)) {
    PyErr_Format(PyExc_MemoryError,
                 "error building hash table: failed to allocate output storage.");
    %(fail)s;
}

if (!PyArray_IS_C_CONTIGUOUS(%(points)s)) {
    should_decref_pcontig = true;
}
pcontig = PyArray_GETCONTIGUOUS(%(points)s);

PyArray_FillWithScalar(%(entries)s, PyLong_FromLong(-1));

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    hash_fakegpu_GID_0 = i;
    build_hash_%(dim)s((float*)PyArray_DATA(pcontig), 0,
                       (int*)PyArray_DATA(%(entries)s), 0,
                       (short*)PyArray_DATA(%(keys)s), 0,
                       (int*)PyArray_DATA(%(neib_ents)s), 0,
                       (float*)PyArray_DATA(%(barycentric)s), 0,
                       cap, N);
}

#pragma omp parallel for
for(int i=0; i<cap; ++i) {
    hash_fakegpu_GID_0 = i;
    dedup_%(dim)s((int*)PyArray_DATA(%(entries)s), 0,
                  (short*)PyArray_DATA(%(keys)s), 0,
                  cap);
}

#pragma omp parallel for
for(int i=0; i<cap; ++i) {
    hash_fakegpu_GID_0 = i;
    find_valid_%(dim)s((int*)PyArray_DATA(%(entries)s), 0,
                       (int*)PyArray_DATA(%(valid_entries)s), 0,
                       (int*)PyArray_DATA(%(n_valid)s), 0,
                       cap);
}

if (should_decref_pcontig) {
    Py_DECREF(pcontig);
}
"""
    return code % locals()
def c_code(self, node, name, inputs, outputs, sub):
    values = inputs[1]
    entries, keys, neib_ents, barycentric, valid_entries, nv = inputs[4:]
    output = outputs[0]
    rdim = get_scalar_constant_value(node.inputs[2])
    vdim = get_scalar_constant_value(node.inputs[3])
    fail = sub["fail"]
    inplace = "1" if self.inplace else "0"

    code = """
npy_intp val_dims[3];
npy_intp tmp_val_dims[2];
npy_intp output_dims[3];

val_dims[0] = PyArray_DIMS(%(values)s)[0];
val_dims[1] = PyArray_DIMS(%(values)s)[1];
val_dims[2] = PyArray_DIMS(%(values)s)[2];

size_t N = val_dims[1] * val_dims[2];
size_t cap = N*(%(rdim)s+1);
size_t ls_N, gs_N, ls_valid, gs_valid;
int nv = *((int*)PyArray_DATA(%(nv)s));

PyArrayObject* tmp_vals_1 = NULL;
PyArrayObject* tmp_vals_2 = NULL;
PyArrayObject* tmp_vptr_1 = NULL;
PyArrayObject* tmp_vptr_2 = NULL;
PyArrayObject* tmp_swap = NULL;

PyArrayObject* vcontig = NULL;
bool should_decref_vcontig = false;

if(val_dims[0] != %(vdim)s) {
    PyErr_Format(PyExc_ValueError,
                 "blur error: bad input shape 0.\\nExpected %(vdim)s, got %%d",
                 val_dims[0]);
    %(fail)s;
}

if(val_dims[1] != PyArray_DIMS(%(barycentric)s)[1] ||
   val_dims[2] != PyArray_DIMS(%(barycentric)s)[2]) {
    PyErr_Format(PyExc_ValueError,
                 "blur error: bad input h/w.\\nExpected (%%d, %%d), got (%%d, %%d)",
                 PyArray_DIMS(%(barycentric)s)[1],
                 PyArray_DIMS(%(barycentric)s)[2],
                 val_dims[1], val_dims[2]);
    %(fail)s;
}

tmp_val_dims[0] = cap;
tmp_val_dims[1] = val_dims[0];

output_dims[0] = val_dims[0];
output_dims[1] = val_dims[1];
output_dims[2] = val_dims[2];

tmp_vals_1 = (PyArrayObject*)PyArray_ZEROS(2, tmp_val_dims, NPY_FLOAT, 0);
tmp_vals_2 = (PyArrayObject*)PyArray_ZEROS(2, tmp_val_dims, NPY_FLOAT, 0);
if (!tmp_vals_1 || !tmp_vals_2) {
    PyErr_Format(PyExc_RuntimeError,
                 "error allocating temporary filtering storage.");
    %(fail)s;
}

tmp_vptr_1 = tmp_vals_1;
tmp_vptr_2 = tmp_vals_2;

if(%(inplace)s) {
    Py_XDECREF(%(output)s);
    %(output)s = %(values)s;
    Py_INCREF(%(output)s);
} else if(!valid_output_ptr(%(output)s, NPY_FLOAT, 3, output_dims)) {
    Py_XDECREF(%(output)s);
    %(output)s = (PyArrayObject*)PyArray_ZEROS(3, output_dims, NPY_FLOAT, 0);
}

if (!%(output)s) {
    PyErr_Format(PyExc_MemoryError,
                 "error performing gaussian blur: failed to allocate output storage.");
    %(fail)s;
}

if (!PyArray_IS_C_CONTIGUOUS(%(values)s)) {
    should_decref_vcontig = true;
}
vcontig = PyArray_GETCONTIGUOUS(%(values)s);

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    filt_fakegpu_GID_0 = i;
    splat_%(rdim)s_%(vdim)s((float*)PyArray_DATA(vcontig), 0,
                            (float*)PyArray_DATA(%(barycentric)s), 0,
                            (int*)PyArray_DATA(%(entries)s), 0,
                            (int*)PyArray_DATA(%(neib_ents)s), 0,
                            (float*)PyArray_DATA(tmp_vals_1),
                            N);
}

for(int ax=0; ax<%(rdim)s+1; ++ax) {
    #pragma omp parallel for
    for(int i=0; i<nv; ++i) {
        filt_fakegpu_GID_0 = i;
        blur_%(rdim)s_%(vdim)s((float*)PyArray_DATA(tmp_vptr_2),
                               (int*)PyArray_DATA(%(entries)s), 0,
                               (int*)PyArray_DATA(%(valid_entries)s), 0,
                               (short*)PyArray_DATA(%(keys)s), 0,
                               (float*)PyArray_DATA(tmp_vptr_1),
                               cap, nv, ax);
    }

    tmp_swap = tmp_vptr_1;
    tmp_vptr_1 = tmp_vptr_2;
    tmp_vptr_2 = tmp_swap;
}

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    filt_fakegpu_GID_0 = i;
    slice_%(rdim)s_%(vdim)s((float*)PyArray_DATA(%(output)s), 0,
                            (float*)PyArray_DATA(%(barycentric)s), 0,
                            (int*)PyArray_DATA(%(entries)s), 0,
                            (int*)PyArray_DATA(%(neib_ents)s), 0,
                            (float*)PyArray_DATA(tmp_vptr_2),
                            N);
}

if (should_decref_vcontig) {
    Py_DECREF(vcontig);
}

Py_XDECREF(tmp_vals_1);
Py_XDECREF(tmp_vals_2);
"""
    return code % locals()
def c_code(self, node, name, inputs, outputs, sub):
    points = inputs[0]
    entries, keys, neib_ents, barycentric, valid_entries, n_valid = outputs
    dim = get_scalar_constant_value(node.inputs[1])
    fail = sub["fail"]
    ctx = sub["params"]
    sync = bool(theano.config.gpuarray.sync)

    kname_build = "k_build_hash_%d" % dim
    kname_dedup = "k_dedup_%d" % dim
    kname_fve = "k_find_valid_%d" % dim

    code = """
int err = GA_NO_ERROR;

size_t point_dims[3];
size_t entries_dim[1];
size_t keys_dims[2];
size_t neib_ents_dims[3];
size_t barycentric_dims[3];
size_t valid_entries_dim[1];
size_t n_valid_dim[1];

n_valid_dim[0] = 1;

point_dims[0] = PyGpuArray_DIMS(%(points)s)[0];
point_dims[1] = PyGpuArray_DIMS(%(points)s)[1];
point_dims[2] = PyGpuArray_DIMS(%(points)s)[2];

size_t N = point_dims[1] * point_dims[2];
size_t cap = N*(point_dims[0]+1);
size_t ls_N, gs_N, ls_cap, gs_cap;

if(point_dims[0] != %(dim)s) {
    PyErr_Format(PyExc_ValueError,
                 "hashtable error: incorrect input dim 0.\\nExpected %(dim)s got %%d",
                 point_dims[0]);
    %(fail)s;
}

entries_dim[0] = cap;

keys_dims[0] = cap;
keys_dims[1] = point_dims[0];

neib_ents_dims[0] = point_dims[0]+1;
neib_ents_dims[1] = point_dims[1];
neib_ents_dims[2] = point_dims[2];

barycentric_dims[0] = point_dims[0]+1;
barycentric_dims[1] = point_dims[1];
barycentric_dims[2] = point_dims[2];

valid_entries_dim[0] = cap;

if(!valid_output_ptr(%(entries)s, GA_INT, 1, entries_dim)) {
    Py_XDECREF(%(entries)s);
    %(entries)s = pygpu_empty(1, entries_dim, GA_INT, GA_C_ORDER,
                              %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(keys)s, GA_SHORT, 2, keys_dims)) {
    Py_XDECREF(%(keys)s);
    %(keys)s = pygpu_zeros(2, keys_dims, GA_SHORT, GA_C_ORDER,
                           %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(neib_ents)s, GA_INT, 3, neib_ents_dims)) {
    Py_XDECREF(%(neib_ents)s);
    %(neib_ents)s = pygpu_zeros(3, neib_ents_dims, GA_INT, GA_C_ORDER,
                                %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(barycentric)s, GA_FLOAT, 3, barycentric_dims)) {
    Py_XDECREF(%(barycentric)s);
    %(barycentric)s = pygpu_zeros(3, barycentric_dims, GA_FLOAT, GA_C_ORDER,
                                  %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(valid_entries)s, GA_INT, 1, valid_entries_dim)) {
    Py_XDECREF(%(valid_entries)s);
    %(valid_entries)s = pygpu_zeros(1, valid_entries_dim, GA_INT, GA_C_ORDER,
                                    %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(n_valid)s, GA_INT, 1, n_valid_dim)) {
    Py_XDECREF(%(n_valid)s);
    %(n_valid)s = pygpu_zeros(1, n_valid_dim, GA_INT, GA_C_ORDER,
                              %(ctx)s, Py_None);
} else {
    GpuArray_memset(&%(n_valid)s->ga, 0);
}

if (!(%(entries)s && %(keys)s && %(neib_ents)s && %(barycentric)s &&
      %(valid_entries)s)) {
    PyErr_Format(PyExc_MemoryError,
                 "error building hash table: failed to allocate output storage.");
    %(fail)s;
}

GpuArray_memset(&%(entries)s->ga, -1);

gs_N = ls_N = 0;
GpuKernel_sched(&%(kname_build)s, N, &gs_N, &ls_N);
gs_N = N / ls_N;
if (ls_N*gs_N < N) { ++gs_N; }

err = build_hash_%(dim)s_call(1, &gs_N, &ls_N, 0,
                              %(points)s->ga.data,
                              %(points)s->ga.offset / sizeof(float),
                              %(entries)s->ga.data,
                              %(entries)s->ga.offset / sizeof(int),
                              %(keys)s->ga.data,
                              %(keys)s->ga.offset / sizeof(short),
                              %(neib_ents)s->ga.data,
                              %(neib_ents)s->ga.offset / sizeof(int),
                              %(barycentric)s->ga.data,
                              %(barycentric)s->ga.offset / sizeof(float),
                              cap, N);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error building hash table:\\n%%s.\\n",
                 GpuKernel_error(&%(kname_build)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);

gs_cap = ls_cap = 0;
GpuKernel_sched(&%(kname_dedup)s, cap, &gs_cap, &ls_cap);
gs_cap = cap / ls_cap;
if (ls_cap*gs_cap < cap) { ++gs_cap; }

err = dedup_%(dim)s_call(1, &gs_cap, &ls_cap, 0,
                         %(entries)s->ga.data,
                         %(entries)s->ga.offset / sizeof(int),
                         %(keys)s->ga.data,
                         %(keys)s->ga.offset / sizeof(short),
                         cap);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error cleaning hash table:\\n%%s.\\n",
                 GpuKernel_error(&%(kname_dedup)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);

err = find_valid_%(dim)s_call(1, &gs_cap, &ls_cap, 0,
                              %(entries)s->ga.data,
                              %(entries)s->ga.offset / sizeof(int),
                              %(valid_entries)s->ga.data,
                              %(valid_entries)s->ga.offset / sizeof(int),
                              %(n_valid)s->ga.data,
                              %(n_valid)s->ga.offset / sizeof(int),
                              cap);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error counting valid hash entries:\\n%%s.\\n",
                 GpuKernel_error(&%(kname_fve)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);
GpuArray_sync(&%(neib_ents)s->ga);
GpuArray_sync(&%(barycentric)s->ga);
GpuArray_sync(&%(n_valid)s->ga);
"""
    return code % locals()
def c_code(self, node, name, inputs, outputs, sub):
    values = inputs[1]
    entries, keys, neib_ents, barycentric, valid_entries, nv = inputs[4:]
    output = outputs[0]
    rdim = get_scalar_constant_value(node.inputs[2])
    vdim = get_scalar_constant_value(node.inputs[3])
    fail = sub["fail"]
    ctx = sub["params"]

    kname_splat = "k_splat_%d_%d" % (rdim, vdim)
    kname_blur = "k_blur_%d_%d" % (rdim, vdim)
    kname_slice = "k_slice_%d_%d" % (rdim, vdim)
    inplace = "1" if self.inplace else "0"

    code = """
int err = GA_NO_ERROR;

size_t val_dims[3];
size_t tmp_val_dims[2];
size_t output_dims[3];

val_dims[0] = PyGpuArray_DIMS(%(values)s)[0];
val_dims[1] = PyGpuArray_DIMS(%(values)s)[1];
val_dims[2] = PyGpuArray_DIMS(%(values)s)[2];

size_t N = val_dims[1] * val_dims[2];
size_t cap = N*(%(rdim)s+1);
size_t ls_N, gs_N, ls_valid, gs_valid;
int nv;

GpuArray_read((void*)(&nv), sizeof(int), &%(nv)s->ga);

GpuArray tmp_vals_1, tmp_vals_2;
GpuArray* tmp_vptr_1 = &tmp_vals_1;
GpuArray* tmp_vptr_2 = &tmp_vals_2;
GpuArray* tmp_swap = NULL;

if(val_dims[0] != %(vdim)s) {
    PyErr_Format(PyExc_ValueError,
                 "blur error: bad input shape 0.\\nExpected %(vdim)s, got %%d",
                 val_dims[0]);
    %(fail)s;
}

if(val_dims[1] != PyGpuArray_DIMS(%(barycentric)s)[1] ||
   val_dims[2] != PyGpuArray_DIMS(%(barycentric)s)[2]) {
    PyErr_Format(PyExc_ValueError,
                 "blur error: bad input h/w.\\nExpected (%%d, %%d), got (%%d, %%d)",
                 PyGpuArray_DIMS(%(barycentric)s)[1],
                 PyGpuArray_DIMS(%(barycentric)s)[2],
                 val_dims[1], val_dims[2]);
    %(fail)s;
}

tmp_val_dims[0] = cap;
tmp_val_dims[1] = val_dims[0];

output_dims[0] = val_dims[0];
output_dims[1] = val_dims[1];
output_dims[2] = val_dims[2];

if(%(inplace)s) {
    Py_XDECREF(%(output)s);
    %(output)s = %(values)s;
    Py_INCREF(%(output)s);
} else if(!valid_output_ptr(%(output)s, GA_FLOAT, 3, output_dims)) {
    Py_XDECREF(%(output)s);
    %(output)s = pygpu_zeros(3, output_dims, GA_FLOAT, GA_C_ORDER,
                             %(ctx)s, Py_None);
}

err = GpuArray_zeros(&tmp_vals_1, %(ctx)s->ctx, GA_FLOAT, 2, tmp_val_dims,
                     GA_C_ORDER);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error allocating memory:\\n%%s.\\n",
                 GpuArray_error(&tmp_vals_1, err));
    %(fail)s;
}

err = GpuArray_zeros(&tmp_vals_2, %(ctx)s->ctx, GA_FLOAT, 2, tmp_val_dims,
                     GA_C_ORDER);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error allocating memory:\\n%%s.\\n",
                 GpuArray_error(&tmp_vals_2, err));
    %(fail)s;
}

if (!%(output)s) {
    PyErr_Format(PyExc_MemoryError,
                 "error performing gaussian blur: failed to allocate output storage.");
    %(fail)s;
}

gs_N = ls_N = 0;
GpuKernel_sched(&%(kname_splat)s, N, &gs_N, &ls_N);
gs_N = N / ls_N;
if (ls_N*gs_N < N) { ++gs_N; }

err = splat_%(rdim)s_%(vdim)s_call(1, &gs_N, &ls_N, 0,
                                   %(values)s->ga.data,
                                   %(values)s->ga.offset / sizeof(float),
                                   %(barycentric)s->ga.data,
                                   %(barycentric)s->ga.offset / sizeof(float),
                                   %(entries)s->ga.data,
                                   %(entries)s->ga.offset / sizeof(int),
                                   %(neib_ents)s->ga.data,
                                   %(neib_ents)s->ga.offset / sizeof(int),
                                   tmp_vals_1.data,
                                   N);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error splatting:\\n%%s.\\n",
                 GpuKernel_error(&%(kname_splat)s, err));
    %(fail)s;
}

GpuArray_sync(&tmp_vals_1);

gs_valid = ls_valid = 0;
GpuKernel_sched(&%(kname_blur)s, nv, &gs_valid, &ls_valid);
gs_valid = nv / ls_valid;
if (ls_valid*gs_valid < nv) { ++gs_valid; }

for(int ax=0; ax<%(rdim)s+1; ++ax) {
    err = blur_%(rdim)s_%(vdim)s_call(1, &gs_valid, &ls_valid, 0,
                                      tmp_vptr_2->data,
                                      %(entries)s->ga.data,
                                      %(entries)s->ga.offset / sizeof(int),
                                      %(valid_entries)s->ga.data,
                                      %(valid_entries)s->ga.offset / sizeof(int),
                                      %(keys)s->ga.data,
                                      %(keys)s->ga.offset / sizeof(short),
                                      tmp_vptr_1->data,
                                      cap, nv, ax);
    if(err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error blurring:\\n%%s.\\n",
                     GpuKernel_error(&%(kname_blur)s, err));
        %(fail)s;
    }

    GpuArray_sync(tmp_vptr_2);

    tmp_swap = tmp_vptr_1;
    tmp_vptr_1 = tmp_vptr_2;
    tmp_vptr_2 = tmp_swap;
}

err = slice_%(rdim)s_%(vdim)s_call(1, &gs_N, &ls_N, 0,
                                   %(output)s->ga.data,
                                   %(output)s->ga.offset / sizeof(float),
                                   %(barycentric)s->ga.data,
                                   %(barycentric)s->ga.offset / sizeof(float),
                                   %(entries)s->ga.data,
                                   %(entries)s->ga.offset / sizeof(int),
                                   %(neib_ents)s->ga.data,
                                   %(neib_ents)s->ga.offset / sizeof(int),
                                   tmp_vptr_2->data,
                                   N);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error slicing:\\n%%s.\\n",
                 GpuKernel_error(&%(kname_slice)s, err));
    %(fail)s;
}

GpuArray_sync(&%(output)s->ga);

GpuArray_clear(&tmp_vals_1);
GpuArray_clear(&tmp_vals_2);
"""
    return code % locals()