class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using the MAGMA library."""

    __props__ = ("inplace",)
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ["c_code/magma_inv.c"], "APPLY_SPECIFIC(magma_inv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return aesara.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        return shapes
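
# `destroy_map = {0: [0]}` above declares that output 0 reuses the storage of
# input 0, which is why it is only set when `inplace=True`. A minimal CPU-only
# sketch of the same inplace/params pattern follows (hypothetical `DoubleOp`,
# assuming `Op`, `Apply`, `ParamsType`, `bool_t` and `tensor` are imported as
# in the modules above):
class DoubleOp(Op):
    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 overwrites input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, x):
        x = tensor.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def get_params(self, node):
        # Fields not given explicitly are read from attributes of `self`.
        return self.params_type.get_params(self)

    def perform(self, node, inputs, output_storage, params):
        (x,) = inputs
        out = x if params.inplace else x.copy()
        out *= 2
        output_storage[0][0] = out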
class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigendecomposition of a symmetric matrix :math:`A` using the MAGMA library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
        part of the matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
        default). If `False`, computes only the eigenvalues of the matrix.
    """

    __props__ = ("lower", "compute_v")
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t, context=gpu_context_type)

    def __init__(self, UPLO="L", compute_v=True):
        assert UPLO in ["L", "U"]
        self.lower = UPLO == "L"
        self.compute_v = compute_v
        COp.__init__(self, ["c_code/magma_eigh.c"], "APPLY_SPECIFIC(magma_eigh)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return aesara.Apply(
                self,
                [A],
                # return D, V
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                ],
            )
        else:
            return aesara.Apply(
                self,
                [A],
                # return D
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
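
# CPU reference for the output contract above (a sketch, not the GPU code
# path): with compute_v=True the op returns the eigenvalue vector D and the
# eigenvector matrix V, matching numpy.linalg.eigh on a symmetric float32
# matrix; with compute_v=False only D is returned.
import numpy as np

A = np.array([[2.0, 1.0], [1.0, 2.0]], dtype="float32")
D, V = np.linalg.eigh(A, UPLO="L")  # D: shape (2,), V: shape (2, 2)
assert np.allclose(V @ np.diag(D) @ V.T, A, atol=1e-5)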
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the QR decomposition of a matrix :math:`A` using the MAGMA library.

    Parameters
    ----------
    complete : If `False`, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs in the
        order ``R, Q``. Use :func:`aesara.gpuarray.linalg.gpu_qr` to get them
        in the expected order ``Q, R``.
    """

    __props__ = ("complete",)
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        COp.__init__(self, ["c_code/magma_qr.c"], "APPLY_SPECIFIC(magma_qr)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return aesara.Apply(
                self,
                [A],
                # return R, Q
                [A.type(), A.type()],
            )
        else:
            return aesara.Apply(
                self,
                [A],
                # return R
                [A.type()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
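
# CPU reference for the output-order caveat above (a sketch): NumPy's qr
# returns (Q, R), whereas this Op yields (R, Q); gpu_qr() is the wrapper that
# restores the conventional (Q, R) order.
import numpy as np

A = np.random.rand(4, 3).astype("float32")
Q, R = np.linalg.qr(A)  # NumPy order: Q first, then R
assert Q.shape == (4, 3) and R.shape == (3, 3)
assert np.allclose(Q @ R, A, atol=1e-5)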
def test_hash_and_eq_params_type(self):
    w1 = ParamsType(
        a1=TensorType("int64", (False, False)),
        a2=TensorType("int64", (False, True, False, False, True)),
        a3=Generic(),
    )
    w2 = ParamsType(
        a1=TensorType("int64", (False, False)),
        a2=TensorType("int64", (False, True, False, False, True)),
        a3=Generic(),
    )
    assert w1 == w2
    assert not (w1 != w2)
    assert hash(w1) == hash(w2)
    assert w1.name == w2.name

    # Changing attributes names only.
    w2 = ParamsType(
        a1=TensorType("int64", (False, False)),
        other_name=TensorType(
            "int64", (False, True, False, False, True)
        ),  # a2 -> other_name
        a3=Generic(),
    )
    assert w1 != w2

    # Changing attributes types only.
    w2 = ParamsType(
        a1=TensorType("int64", (False, False)),
        a2=Generic(),  # changing class
        a3=Generic(),
    )
    assert w1 != w2

    # Changing attributes types characteristics only.
    w2 = ParamsType(
        a1=TensorType("int64", (False, True)),  # changing broadcasting
        a2=TensorType("int64", (False, True, False, False, True)),
        a3=Generic(),
    )
    assert w1 != w2
class GpuSparseBlockOuter(COp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is subject to
    change without notice. It is involved in the gradient of
    GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        COp.__init__(self, ["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        ctx = infer_context_name(o, x, y)
        one = tensor.constant(np.asarray(1.0, dtype="float32"))
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, node, input_shapes):
        return [input_shapes[0]]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]
class mrg_uniform_base(Op):
    # TODO: need a description for this class and its parameters.
    __props__ = ("output_type", "inplace")
    params_type = ParamsType(
        inplace=bool_t,
        # The following params come from self.output_type.
        # NB: As the output object may not be allocated in C code,
        # we cannot be sure to get these properties from the output.
        # So it is safer to get them as params from self.output_type.
        ndim=int_t,
        otypenum=int_t,
        otype_is_float32=bool_t,
    )

    def __init__(self, output_type, inplace=False):
        Op.__init__(self)
        self.output_type = output_type
        self.inplace = inplace
        if inplace:
            self.destroy_map = {0: [0]}
        self.warned_numpy_version = False

    # These attributes (used as params) are created as properties
    # to make them available even for old pickled objects, e.g.
    # when testing the old interface or when using FAST_COMPILE mode.
    ndim = property(lambda self: self.output_type.ndim)
    otypenum = property(lambda self: np.dtype(self.output_type.dtype).num)
    otype_is_float32 = property(lambda self: self.output_type.dtype == "float32")

    def __str__(self):
        if self.inplace:
            s = "inplace"
        else:
            s = "no_inplace"
        return self.__class__.__name__ + "{{{},{}}}".format(self.output_type, s)

    def grad(self, inputs, ograd):
        return [
            gradient.grad_undefined(
                self, k, inp, "No gradient defined through random sampling op"
            )
            for k, inp in enumerate(inputs)
        ]

    def R_op(self, inputs, eval_points):
        return [None for i in eval_points]
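
# For reference, the `otypenum` field above is NumPy's C-level type number for
# the output dtype, i.e. what np.dtype(...).num returns, so the generated C
# code can compare it against the NPY_* type constants.
import numpy as np

assert np.dtype("float32").num != np.dtype("float64").num  # distinct type codes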
class GpuEye(CGpuKernelBase, Op):
    """
    Eye for GPU.
    """

    __props__ = ("dtype", "context_name")
    params_type = ParamsType(typecode=int_t, context=gpu_context_type)

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name
        CGpuKernelBase.__init__(
            self, ["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)"
        )

    def get_params(self, node):
        pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
        return self.params_type.get_params(
            typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
            context=get_context(self.context_name),
        )

    def c_headers(self):
        return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"]

    def make_node(self, n, m):
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        assert n.ndim == 0
        assert m.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )
        return Apply(self, [n, m], [otype()])

    def infer_shape(self, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        return [grad_undefined(self, i, inp[i]) for i in range(2)]
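
# Note the two get_params() conventions used in this file: most ops above pass
# `self` so missing fields are read from attributes, while GpuEye passes every
# field explicitly as a keyword. A small sketch of both calls (names assumed
# imported as above; `Holder` is hypothetical):
wp = ParamsType(typecode=int_t, flag=bool_t)

class Holder:
    typecode = 3
    flag = True

params_from_attrs = wp.get_params(Holder())                 # read from the object
params_from_kwargs = wp.get_params(typecode=3, flag=True)   # passed explicitly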
def test_params_type_with_enums(self):
    # Test that we fail if we create a params type with common enum names
    # inside different enum types.
    try:
        ParamsType(enum1=EnumList("A", "B", "C"), enum2=EnumList("A", "B", "F"))
    except AttributeError:
        pass
    else:
        raise Exception(
            "ParamsType should fail with common enum names inside different enum types."
        )

    # Test that we fail if we create a params type with common names in both
    # aliases and constants.
    try:
        ParamsType(
            enum1=EnumList(("A", "a"), ("B", "b")),
            enum2=EnumList(("ONE", "a"), ("TWO", "two")),
        )
    except AttributeError:
        ParamsType(
            enum1=EnumList(("A", "a"), ("B", "b")),
            enum2=EnumList(("ONE", "one"), ("TWO", "two")),
        )
    else:
        raise Exception(
            "ParamsType should fail when there are aliases with same names as some constants."
        )

    # Test that we can access enum values through wrapper directly.
    w = ParamsType(
        enum1=EnumList("A", ("B", "beta"), "C"),
        enum2=EnumList(("D", "delta"), "E", "F"),
    )
    assert w.A == 0 and w.B == 1 and w.C == 2
    assert w.D == 0 and w.E == 1 and w.F == 2

    # Test constants access through aliases.
    assert w.enum_from_alias("beta") == w.B
    assert w.enum_from_alias("delta") == w.D
    assert (
        w.enum_from_alias("C") == w.C
    )  # C is not an alias, so it should return a constant named C.

    # Test that other regular wrapper attributes are still available.
    assert len(w.fields) == len(w.types) == w.length
    assert w.name
class QuadraticCOpFunc(COp):
    __props__ = ("a", "b", "c")
    params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type)

    def __init__(self, a, b, c):
        super().__init__(
            "c_code/test_quadratic_function.c", "APPLY_SPECIFIC(compute_quadratic)"
        )
        self.a = a
        self.b = b
        self.c = c

    def make_node(self, x):
        x = tensor.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage, coefficients):
        x = inputs[0]
        y = output_storage[0]
        y[0] = coefficients.a * (x ** 2) + coefficients.b * x + coefficients.c
class GpuSparseBlockGemv(COp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject to
    change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.

    def __init__(self, inplace=False):
        COp.__init__(self, "c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes
        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, node, input_shapes):
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]
        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx)
        hgrad = gpu_sparse_block_gemv(
            h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
        )
        return [
            go,
            Wgrad,
            hgrad,
            grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
        ]
def test_params_type_filtering(self):
    shape_tensor5 = (1, 2, 2, 3, 2)
    size_tensor5 = (
        shape_tensor5[0]
        * shape_tensor5[1]
        * shape_tensor5[2]
        * shape_tensor5[3]
        * shape_tensor5[4]
    )
    random_tensor = np.random.normal(size=size_tensor5).reshape(shape_tensor5)

    w = ParamsType(
        a1=TensorType("int32", (False, False)),
        a2=TensorType("float64", (False, False, False, False, False)),
        a3=Generic(),
    )

    # With a value that does not match the params type.
    o = Params(
        w,
        a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int64"),
        a2=random_tensor.astype("float32"),
        a3=2000,
    )
    # Should fail (o.a1 is not int32, o.a2 is not float64).
    with pytest.raises(TypeError):
        w.filter(o, True)
    # Should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed).
    with pytest.raises(TypeError):
        w.filter(o, False, False)
    # Should pass.
    w.filter(o, strict=False, allow_downcast=True)

    # With a value that matches the params type.
    o1 = Params(
        w,
        a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"),
        a2=random_tensor.astype("float64"),
        a3=2000,
    )
    # All should pass.
    w.filter(o1, strict=True)
    w.filter(o1, strict=False, allow_downcast=False)
    w.filter(o1, strict=False, allow_downcast=True)

    # Check values_eq and values_eq_approx.
    o2 = Params(
        w,
        a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"),
        a2=random_tensor.astype("float64"),
        a3=2000,
    )
    assert w.values_eq(o1, o2)
    assert w.values_eq_approx(o1, o2)

    # Check values_eq_approx on slightly perturbed values.
    # NB: I don't know exactly which kind of differences is rejected by
    # values_eq but accepted by values_eq_approx, so I just play a little
    # with float values.
    o3 = Params(
        w,
        a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"),
        a2=(random_tensor.astype("float32") * 10 / 2.2 * 2.19999999999 / 10).astype(
            "float64"
        ),
        a3=2000.0 - 0.00000000000000001,
    )
    assert w.values_eq_approx(o1, o3)
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.
    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)"
        )
        assert mode == "max"
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert eval_point.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])

    def infer_shape(self, node, in_shapes):
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]
class QuadraticOpFunc(Op): __props__ = ("a", "b", "c") params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type) def __init__(self, a, b, c): self.a = a self.b = b self.c = c def make_node(self, x): x = tensor.as_tensor_variable(x) return Apply(self, [x], [x.type()]) def perform(self, node, inputs, output_storage, coefficients): x = inputs[0] y = output_storage[0] y[0] = coefficients.a * (x ** 2) + coefficients.b * x + coefficients.c def c_code_cache_version(self): return (1, 5) def c_support_code_apply(self, node, name): float_type = node.inputs[0].type.dtype_specs()[1] return """ /* Computes: x = a*x*x + b*x + c for x in tensor. */ int quadratic_%(name)s(PyArrayObject* tensor, %(float_type)s a, %(float_type)s b, %(float_type)s c) { NpyIter* iterator = NpyIter_New(tensor, NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_KEEPORDER, NPY_NO_CASTING, NULL); if(iterator == NULL) { PyErr_SetString(PyExc_RuntimeError, "Unable to iterate over a tensor for an elemwise operation."); return -1; } NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL); char** data_ptr = NpyIter_GetDataPtrArray(iterator); npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator); npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator); do { char* data = *data_ptr; npy_intp stride = *stride_ptr; npy_intp count = *innersize_ptr; while(count) { %(float_type)s x = *((%(float_type)s*)data); *((%(float_type)s*)data) = a*x*x + b*x + c; data += stride; --count; } } while(get_next(iterator)); NpyIter_Deallocate(iterator); return 0; } """ % { "name": name, "float_type": float_type, } def c_code(self, node, name, inputs, outputs, sub): return """ %(float_type)s a = (%(float_type)s) (*(npy_float64*) PyArray_GETPTR1(%(coeff)s->a, 0)); // 0-D TensorType. %(float_type)s b = %(coeff)s->b; // Scalar. %(float_type)s c = (%(float_type)s) PyFloat_AsDouble(%(coeff)s->c); // Generic. Py_XDECREF(%(Y)s); %(Y)s = (PyArrayObject*)PyArray_EMPTY(PyArray_NDIM(%(X)s), PyArray_DIMS(%(X)s), PyArray_TYPE(%(X)s), PyArray_IS_F_CONTIGUOUS(%(X)s)); if (PyArray_CopyInto(%(Y)s, %(X)s) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to copy input into output."); %(fail)s }; if (quadratic_%(name)s(%(Y)s, a, b, c) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to compute quadratic function."); %(fail)s } """ % dict( name=name, coeff=sub["params"], fail=sub["fail"], X=inputs[0], Y=outputs[0], float_type=node.inputs[0].type.c_element_type(), )
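
# Hedged usage sketch for QuadraticOpFunc above: `a` must match tensor_type_0d
# (a 0-d array), `b` the scalar field, and `c` the Generic field, as wired in
# params_type; the dtype and coefficient values below are only illustrative.
x = tensor.matrix("x", dtype="float64")
f = aesara.function([x], QuadraticOpFunc(np.asarray(2.0), 3.0, 1.0)(x))
print(f(np.ones((2, 2))))  # every entry becomes 2*1**2 + 3*1 + 1 = 6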
def test_hash_and_eq_params(self):
    wp1 = ParamsType(
        a=Generic(),
        array=TensorType("int64", (False,)),
        floatting=Scalar("float64"),
        npy_scalar=TensorType("float64", tuple()),
    )
    wp2 = ParamsType(
        a=Generic(),
        array=TensorType("int64", (False,)),
        floatting=Scalar("float64"),
        npy_scalar=TensorType("float64", tuple()),
    )
    w1 = Params(
        wp1,
        a=1,
        array=np.asarray([1, 2, 4, 5, 7]),
        floatting=-4.5,
        npy_scalar=np.asarray(12),
    )
    w2 = Params(
        wp2,
        a=1,
        array=np.asarray([1, 2, 4, 5, 7]),
        floatting=-4.5,
        npy_scalar=np.asarray(12),
    )
    assert w1 == w2
    assert not (w1 != w2)
    assert hash(w1) == hash(w2)

    # Changing attributes names only (a -> other_name).
    wp2_other = ParamsType(
        other_name=Generic(),
        array=TensorType("int64", (False,)),
        floatting=Scalar("float64"),
        npy_scalar=TensorType("float64", tuple()),
    )
    w2 = Params(
        wp2_other,
        other_name=1,
        array=np.asarray([1, 2, 4, 5, 7]),
        floatting=-4.5,
        npy_scalar=np.asarray(12),
    )
    assert w1 != w2

    # Changing attributes values only (now a=2).
    w2 = Params(
        wp2,
        a=2,
        array=np.asarray([1, 2, 4, 5, 7]),
        floatting=-4.5,
        npy_scalar=np.asarray(12),
    )
    assert w1 != w2

    # Changing NumPy array values (5 -> -5).
    w2 = Params(
        wp2,
        a=1,
        array=np.asarray([1, 2, 4, -5, 7]),
        floatting=-4.5,
        npy_scalar=np.asarray(12),
    )
    assert w1 != w2
class GpuPool(CGpuKernelBase):
    """
    Implement the max and average pooling on the GPU.
    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(
        ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type
    )

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, node, in_shapes):
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad,) = grads
        grad = gpu_contiguous(grad)

        disc = [aesara.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
                img, out, grad, ws, stride, pad
            )
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(
                ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
            )(img, grad, ws, stride, pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # The R-operator for average or sum pooling is simply the pooling
            # itself, evaluated at the evaluation point.
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points. That means there is no
        # differentiable path through that input. If this implies that some
        # outputs cannot be computed, return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(
                x, z, eval_points[0], ws, stride, pad
            )
        ]
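
# Worked example of the pooled output shape that Pool.out_shape computes per
# pooled dimension in the common ignore_border=True case (the standard
# formula; other cases differ):
#     out = (in + 2*pad - ws) // stride + 1
in_h, ws_h, stride_h, pad_h = 7, 3, 2, 0
assert (in_h + 2 * pad_h - ws_h) // stride_h + 1 == 3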
class GpuCumOp(GpuKernelBase, Op): """ Parameters ---------- axis Can not be None. If you want the array flattened, do it before. """ SUPPORTED_NDIMS = 3 __props__ = ("axis", "mode") params_type = ParamsType(axis=scalar.int32, context=gpu_context_type) def __init__(self, axis, mode="add"): assert axis is not None self.axis = int(axis) self.mode = mode def __eq__(self, other): if type(other) != type(self): return False return self.axis == other.axis and self.mode == other.mode def __hash__(self): return hash(self.axis) ^ hash(self.mode) def c_code_cache_version(self): return (7, ) def c_headers(self): return [ "<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>" ] def c_header_dirs(self): return [gpuarray_helper_inc_dir()] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, x): assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp" context_name = infer_context_name(x) x = as_gpuarray_variable(x, context_name) if x.ndim > GpuCumOp.SUPPORTED_NDIMS: raise NotImplementedError("Only cum op on 1D, 2D and\ 3D arrays are supported right now!") if self.axis >= x.ndim or self.axis < -x.ndim: raise ValueError("axis(={}) out of bounds".format(self.axis)) return Apply(self, [x], [x.type()]) def gpu_kernels(self, node, nodename): kernels = [] # cumadd kname = "k_cumadd" op = {"mul": "*", "add": "+"}[self.mode] k_var = "k_cumadd_" + nodename dtype_x = node.inputs[0].dtype flags = Kernel.get_flags(dtype_x) code = ("""#include "cluda.h" KERNEL void %(kname)s(float* input, ga_size input_offset, float* output, ga_size output_offset, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z, const int offsetY, const int offsetZ, const int beforeLastElementIdx, const int lastElementIdx){ input = (float *)(((char *)input) + input_offset); output = (float *)(((char *)output) + output_offset); int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z; int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z; int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input; int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output; int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output; output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast]; } """ % locals()) params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "intc", "intc", "intc", "intc", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) # blockCumOp kname = "k_blockCumOp" k_var = "k_blockCumOp_" + nodename params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "int32", "int32", gpuarray.GpuArray, gpuarray.SIZE, ] code = ("""#include "cluda.h" // helper functions WITHIN_KERNEL void k_reductionPhase(float* partialCumOp) { // Traverse down from leaves to root building partial sums at internal nodes in the tree. 
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) { local_barrier(); unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; if (index < blockDim.x*2) { partialCumOp[index] %(op)s= partialCumOp[index - stride]; } } } WITHIN_KERNEL void k_fetchData(float* partialCumOp, float* input, int globalThreadID, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { // blockIdx.y and blockIdx.z represents the current independent cum op int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; partialCumOp[threadIdx.x*2] = input[idx_even]; partialCumOp[threadIdx.x*2 + 1] = input[idx_odd]; } WITHIN_KERNEL void k_reversePhase(float* partialCumOp) { // Traverse back up the tree building the scan from the partial sums for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) { local_barrier(); unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; if (index + stride < blockDim.x*2) { partialCumOp[index + stride] %(op)s= partialCumOp[index]; } } } WITHIN_KERNEL void k_pushData(float* partialCumOp, float* output, int globalThreadID, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { local_barrier(); // blockIdx.y and blockIdx.z represents the current independent cum op int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; output[idx_even] = partialCumOp[threadIdx.x*2]; output[idx_odd] = partialCumOp[threadIdx.x*2 + 1]; } KERNEL void k_blockCumOp(float* input, ga_size input_offset, float* output, ga_size output_offset, size_t nbElementsPerCumOp, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z, int offsetY, int offsetZ, float* blockSum, ga_size blockSum_offset) { input = (float *)(((char *)input) + input_offset); output = (float *)(((char *)output) + output_offset); blockSum = (float *)(((char *)blockSum) + blockSum_offset); // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis. // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case. int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x; // Check if current thread has data to process. if (globalThreadID >= (nbElementsPerCumOp+1)/2) { return; } extern __shared__ float partialCumOp[]; // Load data in shared memory k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ); // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree). // The tree is sweeped from the leaves to the root and from the root to the leaves. 
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf k_reductionPhase(partialCumOp); k_reversePhase(partialCumOp); // Write the final output to global memory k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ); if (blockSum != NULL){ if (threadIdx.x == blockDim.x - 1) { blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1]; } } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) # k_finalCumOp kname = "k_finalCumOp" k_var = "k_finalCumOp_" + nodename code = ("""#include "cluda.h" KERNEL void k_finalCumOp(float* output, ga_size output_offset, float* blockSum, ga_size blockSum_offset, size_t nbElementsPerCumOp, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { output = (float *)(((char *)output) + output_offset); blockSum = (float *)(((char *)blockSum) + blockSum_offset); int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; // Check if current has data to process. if (globalThreadID >= (nbElementsPerCumOp+1)/2) return; int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ]; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; output[idx_even] %(op)s= currentBlockSum; output[idx_odd] %(op)s= currentBlockSum; } """ % locals()) params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "int32", "int32", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels def c_code(self, node, nodename, inp, out, sub): if node.inputs[0].type.context.kind != b"cuda": raise NotImplementedError("cuda only") return """ const size_t* shape = PyGpuArray_DIMS(%(x)s); bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s); int axis = %(params)s->axis; if (axis < 0) { // Convert negative axis to positive axis. 
axis += PyGpuArray_NDIM(%(x)s); } if (aesara_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) { %(fail)s; } { // Namespace for kernel calls // size_t max_threads_dim0; size_t max_grid_size1; size_t max_grid_size2; int err; err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); %(fail)s; } if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){ %(fail)s; } } """ % dict( x=inp[0], z=out[0], nodename=nodename, fail=sub["fail"], params=sub["params"], ) def c_support_code_struct(self, node, nodename): code = (""" int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) { size_t shape[3] = { 1, 1, 1 }; ssize_t inputStrides_x; ssize_t inputStrides_y; ssize_t inputStrides_z; ssize_t outputStrides_x; ssize_t outputStrides_y; ssize_t outputStrides_z; switch (PyGpuArray_NDIM(input)) { case 1: shape[0] = PyGpuArray_DIMS(input)[0]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); break; case 2: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); break; case 3: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; shape[2] = PyGpuArray_DIMS(input)[2]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float); break; default: PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis"); return -1; } if (shape[axis] <= 1) { int err = pygpu_move(output, input); return err; } // Perform cum op on array of even size. size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2); // Determine how many elements can be processed in one block. size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2; // Determine how many blocks are needed in total. size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op. size_t dimGridY; // Nb. of independent cum ops (width). size_t dimGridZ; // Nb. of independent cum ops (height). 
ssize_t tmp; switch (axis) { case 0: dimGridY = shape[1]; dimGridZ = shape[2]; break; case 1: dimGridY = shape[0]; dimGridZ = shape[2]; tmp = inputStrides_x; inputStrides_x = inputStrides_y; inputStrides_y = tmp; tmp = outputStrides_x; outputStrides_x = outputStrides_y; outputStrides_y = tmp; break; case 2: dimGridY = shape[1]; dimGridZ = shape[0]; tmp = inputStrides_x; inputStrides_x = inputStrides_z; inputStrides_z = tmp; tmp = outputStrides_x; outputStrides_x = outputStrides_z; outputStrides_z = tmp; break; default: PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis"); return -1; } const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ }; PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode, GA_C_ORDER, input->context, Py_None); if (deviceBlockSum == NULL){ return -1; } // Perform `maxGridY`*`maxGridZ` cum ops in parallel. for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){ size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY); for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){ size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ); size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block. size_t sharedBytes = (2*dimBlockX) * sizeof(float); int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed"); return -1; } if (dimGridX > 1) { // Do a cum op over the blockSum (recursive). if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){ Py_DECREF(deviceBlockSum); return -1; } // Since there are more than one block (i.e. `dimGridX > 1`) // report partial cum ops of previous blocks to subsequents ones. size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {dimBlockX, 1, 1}; int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed"); return -1; } } // If shape[axis] is odd, the last element is compute manually if (shape[axis] != nbElementsPerCumOp){ size_t dimGrid[3] = {1, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {1, 1, 1}; int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "cumadd call failed"); return -1; } } } } Py_XDECREF(deviceBlockSum); return 0; } """ % locals()) return super().c_support_code_struct(node, nodename) + code
class GpuMagmaSVD(GpuMagmaBase):
    """Computes the SVD of a matrix :math:`A` using the MAGMA library.

    .. warning::

        Because of implementation constraints, this Op returns outputs in the
        order ``S, U, VT``. Use :func:`aesara.gpuarray.linalg.gpu_svd` to get
        them in the expected order ``U, S, VT``.
    """

    __props__ = ("full_matrices", "compute_uv")
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return aesara.Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return aesara.Apply(
                self,
                [A],
                # return only S
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def prepare_node(self, node, storage_map, compute_map, impl):
        super().prepare_node(node, storage_map, compute_map, impl)
        # Check the node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect the order: S (vector), U (matrix), VT (matrix).
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, (
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) "
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        (x_shape,) = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K,)
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
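
# Worked example of the shapes produced by infer_shape above, checked against
# NumPy on CPU for M=4, N=3 (so K=3); note that the GPU op itself returns the
# outputs in the order (S, U, VT):
import numpy as np

A = np.random.rand(4, 3).astype("float32")
U, S, VT = np.linalg.svd(A, full_matrices=True)
assert S.shape == (3,) and U.shape == (4, 4) and VT.shape == (3, 3)
U, S, VT = np.linalg.svd(A, full_matrices=False)
assert U.shape == (4, 3) and S.shape == (3,) and VT.shape == (3, 3)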
def params_type(self):
    return ParamsType(i=aesara.scalar.basic.int64)
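
# The method above presumably sits under a @property decorator in its class,
# so params_type is rebuilt on access instead of being a fixed class
# attribute. A hypothetical sketch of that pattern, with the field `i` read
# back from the instance by get_params:
class CounterLikeOp:
    i = 42  # value exposed to C code through the params

    @property
    def params_type(self):
        return ParamsType(i=aesara.scalar.basic.int64)

    def get_params(self, node):
        return self.params_type.get_params(self)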
class CumOp(Op): # See function cumsum/cumprod for docstring __props__ = ("axis", "mode") check_input = False params_type = ParamsType(c_axis=int_t, mode=EnumList(("MODE_ADD", "add"), ("MODE_MUL", "mul"))) def __init__(self, axis=None, mode="add"): if mode not in ("add", "mul"): raise ValueError('{}: Unknown mode "{}"'.format( type(self).__name__, mode)) self.axis = axis self.mode = mode c_axis = property(lambda self: np.MAXDIMS if self.axis is None else self.axis) def make_node(self, x): x = basic.as_tensor_variable(x) out_type = x.type() if self.axis is None: out_type = aesara.tensor.vector(dtype=x.dtype) # Flatten elif self.axis >= x.ndim or self.axis < -x.ndim: raise ValueError("axis(={}) out of bounds".format(self.axis)) return aesara.Apply(self, [x], [out_type]) def perform(self, node, inputs, output_storage, params): x = inputs[0] z = output_storage[0] if self.mode == "add": z[0] = np.cumsum(x, axis=self.axis) else: z[0] = np.cumprod(x, axis=self.axis) def grad(self, inputs, output_gradients): (x, ) = inputs (gi, ) = output_gradients if self.axis is None: if self.mode == "add": return [cumsum(gi[::-1])[::-1].reshape(x.shape)] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [cumsum((fx * gi)[::-1])[::-1].reshape(x.shape) / x] else: raise NotImplementedError( '%s: unknown gradient for mode "%s"' % (type(self).__name__, self.mode)) reverse_slicing = [slice(None, None, None)] * gi.ndim reverse_slicing[self.axis] = slice(None, None, -1) reverse_slicing = tuple(reverse_slicing) # We need to reverse the gradients along ``self.axis``, # compute cumsum, then reverse again if self.mode == "add": return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [ cumsum( (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x ] else: raise NotImplementedError( '{}: unknown gradient for mode "{}"'.format( type(self).__name__, self.mode)) def infer_shape(self, node, shapes): if self.axis is None: return [(basic.prod(shapes[0]), )] # Flatten return shapes def c_code(self, node, name, inames, onames, sub): (x, ) = inames (z, ) = onames axis = self.axis fail = sub["fail"] params = sub["params"] code = (""" int axis = %(params)s->c_axis; if (axis == 0 && PyArray_NDIM(%(x)s) == 1) axis = NPY_MAXDIMS; npy_intp shape[1] = { PyArray_SIZE(%(x)s) }; if(axis == NPY_MAXDIMS && !(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0])) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s)); } else if(axis != NPY_MAXDIMS && !(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)))) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE(%(x)s)); } if (!%(z)s) %(fail)s; { PyObject * t = NULL; if(%(params)s->mode == MODE_ADD) t = PyArray_CumSum( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); else if(%(params)s->mode == MODE_MUL) t = PyArray_CumProd( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); if (!t){ %(fail)s; } // Because PyArray_CumSum/CumProd returns a newly created reference on t. Py_XDECREF(t); } """ % locals()) return code def c_code_cache_version(self): return (8, ) def __str__(self): return "{}{{{}, {}}}".format(self.__class__.__name__, self.axis, self.mode)
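
# Quick NumPy check of the mode="mul" gradient formula used in grad() above:
# for f(x) = sum_i g_i * cumprod(x)_i, the derivative w.r.t. x_j is the
# reversed cumulative sum of cumprod(x) * g at position j, divided by x_j.
import numpy as np

x = np.array([2.0, 3.0, 4.0])
g = np.ones(3)
fx = np.cumprod(x)
analytic = np.cumsum((fx * g)[::-1])[::-1] / x
eps = 1e-6
numeric = np.array(
    [
        ((np.cumprod(x + eps * np.eye(3)[j]) * g).sum() - (fx * g).sum()) / eps
        for j in range(3)
    ]
)
assert np.allclose(analytic, numeric, atol=1e-3)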
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): """ Images2Neibs for the GPU. """ params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, ten4, neib_shape, neib_step=None): ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4)) neib_shape = tt.as_tensor_variable(neib_shape) if neib_step is None: neib_step = neib_shape else: neib_step = tt.as_tensor_variable(neib_step) assert ten4.ndim == 4 assert neib_shape.ndim == 1 assert neib_step.ndim == 1 assert neib_shape.dtype in tt.integer_dtypes assert neib_step.dtype in tt.integer_dtypes return Apply( self, [ten4, neib_shape, neib_step], [ GpuArrayType( broadcastable=(False, False), dtype=ten4.type.dtype, context_name=ten4.type.context_name, )() ], ) def c_code_cache_version(self): return (14, ) def c_headers(self): return ["<numpy_compat.h>", "<gpuarray/types.h>"] def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here. mode_constants = self.BORDER_MODE.c_support_code() kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """#include "cluda.h" // a version that uses less registers but doesn't work in all cases. %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); ga_int i = LID_1; // loop over c { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } ga_int j = LID_0; // loop over d { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED){ ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * 
out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } }""" % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """#include "cluda.h" %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (ga_int i = LID_1; i < c; i+=LDIM_1) { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } // loop over d for (ga_int j = LID_0; j < d; j+=LDIM_0) { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED) { ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } } """ % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels def c_support_code(self): return """ template <typename T> static T ceil_intdiv(T a, T b) { return (a/b) 
+ ((a % b) ? 1: 0); } """ def c_code(self, node, name, inp, out, sub): err_check = """ if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: *fptr: %%s.", GpuKernel_error(fptr, err)); %(fail)s; } """ % dict(fail=sub["fail"]) # NB: To reduce C code variability: # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node) # For typecode_z, I use ten4->ga.typecode (for same reason as above) return """ int grid_c = -1; int grid_d = -1; size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga); size_t itemsize_z = itemsize_ten4; int typecode_z = %(ten4)s->ga.typecode; { if (PyGpuArray_NDIM(%(ten4)s) != 4) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: pvals wrong rank"); %(fail)s; } if (PyArray_NDIM(%(neib_shape)s) != 1) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: unis wrong rank"); %(fail)s; } if (PyArray_DIMS(%(neib_shape)s)[0] != 2) { PyErr_Format(PyExc_ValueError, "GpuImages2Neibs: neib_shape has to contain two" " elements"); %(fail)s; } const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); if (step_x <=0 || step_y <=0) { PyErr_Format(PyExc_ValueError, "neib_step wrong step ; values <= 0. Got %%lld %%lld.", (long long) step_x, (long long) step_y); %(fail)s; } if (c <=0 || d <=0) { PyErr_Format(PyExc_ValueError, "neib_shape values <= 0. Got %%lld %%lld.", (long long)c, (long long)d); %(fail)s; } if (%(params)s->mode == MODE_WRAP_CENTERED) { if (c%%2!=1 || d%%2!=1){ PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes"); %(fail)s; } if ( PyGpuArray_DIMS(%(ten4)s)[2] < c || PyGpuArray_DIMS(%(ten4)s)[3] < d) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: in wrap_centered mode," " don't support image shapes smaller then" " the patch shapes: neib_shape=(%%d,%%d)," " ten4[2:]=[%%d,%%d]", c, d, PyGpuArray_DIMS(%(ten4)s)[2], PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]), (size_t)step_x); grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]), (size_t)step_y); } else if (%(params)s->mode == MODE_VALID) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_IGNORE_BORDERS) { //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_HALF) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || 
((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); } else if (%(params)s->mode == MODE_FULL) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%ld, neib_step[0]=%%ld and" " ten4.shape[2]=%%ld not consistent", (long int)c, (long int)step_x, (long int)(PyGpuArray_DIMS(%(ten4)s)[2])); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%ld, neib_step[1]=%%ld and" " ten4.shape[3]=%%ld not consistent", (long int)d, (long int)step_y, (long int)(PyGpuArray_DIMS(%(ten4)s)[3])); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y); } else { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode); %(fail)s; } // new dimensions for z const int z_dim1 = c * d; const int z_dim0 = grid_c * grid_d * PyGpuArray_DIMS(%(ten4)s)[1] * PyGpuArray_DIMS(%(ten4)s)[0]; if ((NULL == %(z)s) || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0) || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1)) { Py_XDECREF(%(z)s); size_t dims[2]; dims[0] = z_dim0; dims[1] = z_dim1; %(z)s = pygpu_empty(2, dims, typecode_z, GA_C_ORDER, %(params)s->context, Py_None); if (!%(z)s) { PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" " failed to alloc z output"); %(fail)s; } } } { // NESTED SCOPE const int mode = %(params)s->mode; const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0]; const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1]; const int height = PyGpuArray_DIMS(%(ten4)s)[2]; const int width = PyGpuArray_DIMS(%(ten4)s)[3]; const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); size_t threads_per_block[3] = {d, c, 1}; //get the max threads per blocks size_t max_threads_dim; int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); %(fail)s; } while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--; while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--; //Make bigger block to have better memory access pattern and //a higher core utilisation. 
for smaller patch size while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 && threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){ threads_per_block[2]++; } int nb_block; if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0) nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]; else nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1; size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1}; GpuKernel *fptr; if(threads_per_block[0]==d && threads_per_block[1]==c){ fptr = &k_multi_warp_less_%(name)s; }else{ fptr = &k_multi_warp_%(name)s; } /* printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]); */ size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4; size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4; size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4; size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4; size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z; size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z; void *kernel_params[] = {(void *)&mode, (void *)&nb_batch, (void *)&nb_stack, (void *)&height, (void *)&width, (void *)&c, (void *)&d, (void *)&step_x, (void *)&step_y, (void *)&grid_c, (void *)&grid_d, (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2, (void *)&stride_A3, (void *)%(ten4)s->ga.data, (void *)&%(ten4)s->ga.offset, (void *)&stride_Z0, (void *)&stride_Z1, (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset}; err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params); %(err_check)s } // END NESTED SCOPE """ % dict( ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0], dtype_neib_shape=node.inputs[1].dtype, dtype_neib_step=node.inputs[2].dtype, err_check=err_check, name=name, params=sub["params"], fail=sub["fail"], ) def perform(self, node, inp, out, params): # Disable the perform method from the CPU version Op.perform(self, node, inp, out, params)
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.
    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)"
        )
        assert mode in ("sum", "average_inc_pad", "average_exc_pad")
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2
        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.mode == "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in aesara.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")
        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, node, in_shapes):
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            aesara.tensor.zeros_like(x),
            GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(
                ggx, ws, stride, pad
            ),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[2:]]

    def connection_pattern(self, node):
        return [[1], [1], [0], [0], [0]]