class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library."""

    __props__ = ("inplace",)
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        ExternalCOp.__init__(self, ["c_code/magma_inv.c"], "APPLY_SPECIFIC(magma_inv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        return shapes
class CGemv(BaseBLAS, Gemv):
    params_type = ParamsType(
        inplace=bool_t,
    )

    def __init__(self, inplace):
        super().__init__(inplace)

    def c_code(self, node, name, inp, out, sub):
        y, alpha, A, x, beta = inp
        (z,) = out
        code = gemv_c_code(
            y,
            A,
            x,
            z,
            alpha,
            beta,
            fail=sub["fail"],
            force_init_beta=check_force_gemv_init(),
            params=sub["params"],
        )
        return code

    def c_code_cache_version(self):
        return (14, blas_header_version(), check_force_gemv_init())
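A minimal graph-construction sketch, not part of the original module: CGemv computes z = beta * y + alpha * dot(A, x), and the input order (y, alpha, A, x, beta) is taken from the unpacking in c_code above. The variable names and the `at` alias for aesara.tensor are assumptions.

import aesara.tensor as at

y, x = at.vector("y"), at.vector("x")
A = at.matrix("A")
alpha, beta = at.scalar("alpha"), at.scalar("beta")
# Builds the symbolic node only; BLAS code is compiled when a function is built.
z = CGemv(inplace=False)(y, alpha, A, x, beta)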
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase): """Computes the qr decomposition of a matrix :math:`A` using magma library. Parameters ---------- complete : If False, returns only ``R``. .. warning:: Because of implementation constraints, this Op returns outputs in order ``R, Q``. Use :func:`aesara.gpuarray.linalg.gpu_qr` to get them in expected order ``Q, R``. """ __props__ = ("complete", ) _cop_num_inputs = 1 _cop_num_outputs = 2 check_input = False params_type = ParamsType(complete=bool_t, context=gpu_context_type) def __init__(self, complete=True): self.complete = complete ExternalCOp.__init__(self, ["c_code/magma_qr.c"], "APPLY_SPECIFIC(magma_qr)") def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) A = gpu_contiguous(A) if A.ndim != 2: raise LinAlgError("Matrix rank error") if A.dtype != "float32": raise TypeError("only `float32` is supported for now") if self.complete: return Apply( self, [A], # return R, Q [A.type(), A.type()], ) else: return Apply( self, [A], # return R [A.type()], ) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context)
class CGer(BaseBLAS, Ger):
    params_type = ParamsType(
        destructive=bool_t,
    )

    def c_code(self, node, name, inp, out, sub):
        A, a, x, y = inp
        (Z,) = out
        code = ger_c_code(A, a, x, y, Z, fail=sub["fail"], params=sub["params"])
        return code

    def c_code_cache_version(self):
        return (11, blas_header_version())
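A similar hedged sketch for CGer (names assumed): it builds the symbolic rank-1 update A + a * outer(x, y), with inputs in the order unpacked in c_code above.

import aesara.tensor as at

A = at.matrix("A")
a = at.scalar("a")
x, y = at.vector("x"), at.vector("y")
# destructive=False leaves A untouched; the update is returned as a new array.
Z = CGer(destructive=False)(A, a, x, y)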
def test_hash_and_eq_params_type(self): w1 = ParamsType( a1=TensorType("int64", (False, False)), a2=TensorType("int64", (False, True, False, False, True)), a3=Generic(), ) w2 = ParamsType( a1=TensorType("int64", (False, False)), a2=TensorType("int64", (False, True, False, False, True)), a3=Generic(), ) assert w1 == w2 assert not (w1 != w2) assert hash(w1) == hash(w2) assert w1.name == w2.name # Changing attributes names only. w2 = ParamsType( a1=TensorType("int64", (False, False)), other_name=TensorType( "int64", (False, True, False, False, True)), # a2 -> other_name a3=Generic(), ) assert w1 != w2 # Changing attributes types only. w2 = ParamsType( a1=TensorType("int64", (False, False)), a2=Generic(), # changing class a3=Generic(), ) assert w1 != w2 # Changing attributes types characteristics only. w2 = ParamsType( a1=TensorType("int64", (False, True)), # changing broadcasting a2=TensorType("int64", (False, True, False, False, True)), a3=Generic(), ) assert w1 != w2
class GpuEye(CGpuKernelBase): """Eye for GPU. This is an implementation to test that `CGpuKernelBase` works and also to use as an example in the docs. It is not used for user graphs. """ __props__ = ("dtype", "context_name") params_type = ParamsType(typecode=int_t, context=gpu_context_type) def __init__(self, dtype=None, context_name=None): if dtype is None: dtype = config.floatX self.dtype = dtype self.context_name = context_name super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)") def get_params(self, node): pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray") return self.params_type.get_params( typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype), context=get_context(self.context_name), ) def c_headers(self, **kwargs): return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"] def make_node(self, n, m): n = aet.as_tensor_variable(n) m = aet.as_tensor_variable(m) assert n.ndim == 0 assert m.ndim == 0 otype = GpuArrayType( dtype=self.dtype, broadcastable=(False, False), context_name=self.context_name, ) return Apply(self, [n, m], [otype()]) def infer_shape(self, fgraph, node, in_shapes): out_shape = [node.inputs[0], node.inputs[1]] return [out_shape] def grad(self, inp, grads): return [grad_undefined(self, i, inp[i]) for i in range(2)]
class GpuSparseBlockOuter(_NoPythonExternalCOp): """ GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more information. This op should not be called directly since its interface is subject to change without notice. It is involved in the gradient of GpuSparseBlockGemv. The gradient is not implemented. """ __props__ = ("inplace", ) params_type = ParamsType(inplace=bool_t, context=gpu_context_type) def __init__(self, inplace=False): super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)") self.inplace = inplace if self.inplace: self.destroy_map = {0: [0]} def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, o, x, y, xIdx, yIdx, alpha=None): ctx = infer_context_name(o, x, y) one = aet.constant(np.asarray(1.0, dtype="float32")) o = as_gpuarray_variable(o, ctx) x = as_gpuarray_variable(x, ctx) y = as_gpuarray_variable(y, ctx) xIdx = as_tensor_variable(xIdx) yIdx = as_tensor_variable(yIdx) if alpha is None: alpha = one return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()]) def infer_shape(self, fgraph, node, input_shapes): return [input_shapes[0]] def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir()] def c_headers(self, **kwargs): return [ "<gpuarray/buffer_blas.h>", "<gpuarray/buffer.h>", "<gpuarray_helper.h>", ]
class mrg_uniform_base(Op):
    # TODO : need description for class, parameter
    __props__ = ("output_type", "inplace")
    params_type = ParamsType(
        inplace=bool_t,
        # following params will come from self.output_type.
        # NB: As output object may not be allocated in C code,
        # we can not be sure to get these properties from output.
        # So, we should better get them as params from self.output_type.
        ndim=int_t,
        otypenum=int_t,
        otype_is_float32=bool_t,
    )

    def __init__(self, output_type, inplace=False):
        Op.__init__(self)
        self.output_type = output_type
        self.inplace = inplace
        if inplace:
            self.destroy_map = {0: [0]}
        self.warned_numpy_version = False

    # These attributes (used as params) are created as properties
    # to make them available even for old pickled objects, e.g.
    # when testing old interface or when using FAST_COMPILE mode.
    ndim = property(lambda self: self.output_type.ndim)
    otypenum = property(lambda self: np.dtype(self.output_type.dtype).num)
    otype_is_float32 = property(lambda self: self.output_type.dtype == "float32")

    def __str__(self):
        if self.inplace:
            s = "inplace"
        else:
            s = "no_inplace"
        return self.__class__.__name__ + f"{{{self.output_type},{s}}}"

    def grad(self, inputs, ograd):
        return [
            gradient.grad_undefined(
                self, k, inp, "No gradient defined through random sampling op"
            )
            for k, inp in enumerate(inputs)
        ]

    def R_op(self, inputs, eval_points):
        return [None for i in eval_points]
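A small sketch (assuming TensorType and np are importable, as in the tests elsewhere in this file) of how the property-based params simply mirror the output type:

out_type = TensorType("float32", (False,))  # a 1-d float32 output type
op = mrg_uniform_base(out_type)
# The derived params reflect self.output_type, so they stay available
# even for op instances unpickled from older code.
assert op.ndim == 1 and op.otype_is_float32
assert op.otypenum == np.dtype("float32").num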
def test_params_type_with_enums(self): # Test that we fail if we create a params type with common enum names inside different enum types. try: ParamsType(enum1=EnumList("A", "B", "C"), enum2=EnumList("A", "B", "F")) except AttributeError: pass else: raise Exception( "ParamsType should fail with common enum names inside different enum types." ) # Test that we fail if we create a params type with common names in both aliases and constants. try: ParamsType( enum1=EnumList(("A", "a"), ("B", "b")), enum2=EnumList(("ONE", "a"), ("TWO", "two")), ) except AttributeError: ParamsType( enum1=EnumList(("A", "a"), ("B", "b")), enum2=EnumList(("ONE", "one"), ("TWO", "two")), ) else: raise Exception( "ParamsType should fail when there are aliases with same names as some constants." ) # Test that we can access enum values through wrapper directly. w = ParamsType( enum1=EnumList("A", ("B", "beta"), "C"), enum2=EnumList(("D", "delta"), "E", "F"), ) assert w.A == 0 and w.B == 1 and w.C == 2 assert w.D == 0 and w.E == 1 and w.F == 2 # Test constants access through aliases. assert w.enum_from_alias("beta") == w.B assert w.enum_from_alias("delta") == w.D assert (w.enum_from_alias("C") == w.C ) # C is not an alias, so it should return a constant named C. # Test that other regular wrapper attributes are still available. assert len(w.fields) == len(w.types) == w.length assert w.name
class QuadraticCOpFunc(ExternalCOp):
    __props__ = ("a", "b", "c")
    params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type)

    def __init__(self, a, b, c):
        super().__init__(
            "c_code/test_quadratic_function.c", "APPLY_SPECIFIC(compute_quadratic)"
        )
        self.a = a
        self.b = b
        self.c = c

    def make_node(self, x):
        x = at.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage, coefficients):
        x = inputs[0]
        y = output_storage[0]
        y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c
class Reshape(COp): """Perform a reshape operation of the input x to the new shape shp. The number of dimensions to which to reshape to (ndim) must be known at graph build time. """ view_map = {0: [0]} # output 0 is potentially aliased to inputs [0] _f16_ok = True check_input = False __props__ = ("ndim", ) params_type = ParamsType(ndim=int32) # name does not participate because it doesn't affect computations def __init__(self, ndim, name=None): self.ndim = int(ndim) if ndim < 0: raise ValueError( "The output dimensions after reshape must be 0 or greater") assert name is None, "name attribute for Reshape has been deprecated" def __str__(self): return f"{self.__class__.__name__}{{{self.ndim}}}" def make_node(self, x, shp): x = aet.as_tensor_variable(x) shp_orig = shp shp = aet.as_tensor_variable(shp, ndim=1) if not (shp.dtype in int_dtypes or (isinstance(shp, TensorConstant) and shp.data.size == 0)): # It raises an error if shp is not of integer type, # except when shp is constant and empty # (in this case, shp.dtype does not matter anymore). raise TypeError("Shape must be integers", shp, shp.dtype) assert shp.ndim == 1 if isinstance(shp, TensorConstant): bcast = [s == 1 for s in shp.data] return Apply(self, [x, shp], [tensor(x.type.dtype, bcast)]) else: bcasts = [False] * self.ndim shp_list = shp_orig if hasattr(shp_orig, "ndim") and shp_orig.ndim == 0: shp_list = [shp_orig] for index in range(self.ndim): y = shp_list[index] y = aet.as_tensor_variable(y) # Try to see if we can infer that y has a constant value of 1. # If so, that dimension should be broadcastable. try: bcasts[index] = (hasattr(y, "get_scalar_constant_value") and y.get_scalar_constant_value() == 1) except NotScalarConstantError: pass return Apply(self, [x, shp], [tensor(x.type.dtype, bcasts)]) def perform(self, node, inp, out_, params): x, shp = inp (out, ) = out_ if len(shp) != self.ndim: raise ValueError( ("shape argument to Reshape.perform has incorrect" f" length {len(shp)}" f", should be {self.ndim}"), shp, ) try: out[0] = np.reshape(x, shp) except Exception: raise ValueError( f"Cannot reshape input of shape {x.shape} to shape {shp}") def connection_pattern(self, node): return [[True], [False]] def grad(self, inp, grads): x, shp = inp (g_out, ) = grads return [reshape(g_out, shape(x), ndim=x.ndim), DisconnectedType()()] def R_op(self, inputs, eval_points): if eval_points[0] is None: return [None] return self(eval_points[0], *inputs[1:], **dict(return_list=True)) def infer_shape(self, fgraph, node, ishapes): from aesara.tensor.math import eq, maximum, mul # inputs[1] can contain at most one value of '-1', meaning the actual # shape of the output will be automatically computed by reshape, so # that the total number of elements stays the same. # TODO: Maybe put that formula here? # It's not trivial, because we would have to check if the product of # all the non-minus-one shapes is a divisor of the product of the # original shapes. # The following expression leads to cycles in feature_shape, # because it tries to replace the Shape_i node by the switch # statement, which depends on Shape_i. # return [tuple([switch(eq(node.inputs[1][i], -1), # Shape_i(i)(node.outputs[0]), # node.inputs[1][i]) # for i in range(self.ndim)] # )] # Here, we only simplify if the shape (node.inputs[1]) is a constant, # ideally it would suffice to check that it is always non-negative. # If current variable is a scalar and its dimensionality should # change to self.ndim, then use size 1 for all new dimensions. 
if len(ishapes[0]) == 0: return [(1, ) * self.ndim] requ = node.inputs[1] input_size = mul(*ishapes[0]) if isinstance(requ, TensorConstant): requ = list(requ.data) requ_part = [ele for ele in requ if ele != -1] crit = len(requ) - len(requ_part) if crit == 1 and len(requ_part) > 0: # If there are both 0 and -1 in requ_size, it is impossible # to determine a right output, but we can at least prevent # a division by 0. We do not want to keep a negative # size here as it could lead to further weird errors # after other optimizations. requ_size = mul(*requ_part) missing = input_size // (1 if requ_size == 0 else requ_size) for i, ele in enumerate(requ): if ele == -1: requ[i] = missing elif crit == 1: # we reshape to -1 requ = [input_size] if ishapes[0] else [1] elif crit > 1: raise ValueError("shape argument to Reshape.perform" " must have at most one entry equal to -1") return [requ] else: requ = [requ[i] for i in range(self.ndim)] # since new_dims can have negative value (-1), the # multiplication of all values should be negated # to give a positive value. # To avoid optimization complexity, we avoid checking # for the case when there are two or more '-1' values. if self.ndim: requ_size = -mul(*requ) # If there are both 0 and -1 in requ_size, it is impossible # to determine a right output, but we can at least prevent # a division by 0. We do not want to keep a negative # size here as it could lead to further weird errors # after other optimizations. rest_size = input_size // maximum(requ_size, 1) return [ tuple([ aet.switch(eq(requ[i], -1), rest_size, requ[i]) for i in range(self.ndim) ]) ] def c_code_cache_version(self): return (8, ) def c_code(self, node, name, inputs, outputs, sub): if isinstance(node.inputs[0], TensorVariable): x, shp = inputs (z, ) = outputs sdtype = node.inputs[1].type.dtype_specs()[1] fail = sub["fail"] params = sub["params"] return (""" assert (PyArray_NDIM(%(shp)s) == 1); npy_intp new_dims[%(params)s->ndim]; PyArray_Dims newshape; newshape.ptr = new_dims; newshape.len = %(params)s->ndim; for (int ii = 0; ii < %(params)s->ndim; ++ii) { // -- We do not want an explicit cast here. the shp can be any // -- int* dtype. The compiler will explicitly upcast it, but // -- will err if this will downcast. This could happen if the // -- user pass an int64 dtype, but npy_intp endup being int32. new_dims[ii] = ((%(sdtype)s*)( PyArray_BYTES(%(shp)s) + ii * PyArray_STRIDES(%(shp)s)[0]))[0]; } Py_XDECREF(%(z)s); %(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape, NPY_CORDER); if (!%(z)s) { //The error message should have been set by PyArray_Newshape %(fail)s; } """ % locals()) else: raise NotImplementedError()
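A short usage sketch of the -1 handling described in infer_shape above (it assumes aesara, the aet alias for aesara.tensor, and np, as used in the surrounding code; values are illustrative):

x = aet.matrix("x")
y = aet.reshape(x, (-1, 2), ndim=2)  # at most one entry may be -1 and is inferred
f = aesara.function([x], y)
data = np.arange(6, dtype=aesara.config.floatX).reshape(2, 3)
# 6 elements reshaped to (-1, 2): the missing dimension resolves to 6 // 2 == 3.
assert f(data).shape == (3, 2)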
def test_hash_and_eq_params(self): wp1 = ParamsType( a=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) wp2 = ParamsType( a=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) w1 = Params( wp1, a=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) w2 = Params( wp2, a=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 == w2 assert not (w1 != w2) assert hash(w1) == hash(w2) # Changing attributes names only (a -> other_name). wp2_other = ParamsType( other_name=Generic(), array=TensorType("int64", (False, )), floatting=Scalar("float64"), npy_scalar=TensorType("float64", tuple()), ) w2 = Params( wp2_other, other_name=1, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2 # Changing attributes values only (now a=2). w2 = Params( wp2, a=2, array=np.asarray([1, 2, 4, 5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2 # Changing NumPy array values (5 -> -5). w2 = Params( wp2, a=1, array=np.asarray([1, 2, 4, -5, 7]), floatting=-4.5, npy_scalar=np.asarray(12), ) assert w1 != w2
class QuadraticOpFunc(COp): __props__ = ("a", "b", "c") params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type) def __init__(self, a, b, c): self.a = a self.b = b self.c = c def make_node(self, x): x = at.as_tensor_variable(x) return Apply(self, [x], [x.type()]) def perform(self, node, inputs, output_storage, coefficients): x = inputs[0] y = output_storage[0] y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c def c_code_cache_version(self): return (1, 5) def c_support_code_apply(self, node, name): float_type = node.inputs[0].type.dtype_specs()[1] return """ /* Computes: x = a*x*x + b*x + c for x in tensor. */ int quadratic_%(name)s(PyArrayObject* tensor, %(float_type)s a, %(float_type)s b, %(float_type)s c) { NpyIter* iterator = NpyIter_New(tensor, NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK, NPY_KEEPORDER, NPY_NO_CASTING, NULL); if(iterator == NULL) { PyErr_SetString(PyExc_RuntimeError, "Unable to iterate over a tensor for an elemwise operation."); return -1; } NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL); char** data_ptr = NpyIter_GetDataPtrArray(iterator); npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator); npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator); do { char* data = *data_ptr; npy_intp stride = *stride_ptr; npy_intp count = *innersize_ptr; while(count) { %(float_type)s x = *((%(float_type)s*)data); *((%(float_type)s*)data) = a*x*x + b*x + c; data += stride; --count; } } while(get_next(iterator)); NpyIter_Deallocate(iterator); return 0; } """ % { "name": name, "float_type": float_type, } def c_code(self, node, name, inputs, outputs, sub): return """ %(float_type)s a = (%(float_type)s) (*(npy_float64*) PyArray_GETPTR1(%(coeff)s->a, 0)); // 0-D TensorType. %(float_type)s b = %(coeff)s->b; // Scalar. %(float_type)s c = (%(float_type)s) PyFloat_AsDouble(%(coeff)s->c); // Generic. Py_XDECREF(%(Y)s); %(Y)s = (PyArrayObject*)PyArray_EMPTY(PyArray_NDIM(%(X)s), PyArray_DIMS(%(X)s), PyArray_TYPE(%(X)s), PyArray_IS_F_CONTIGUOUS(%(X)s)); if (PyArray_CopyInto(%(Y)s, %(X)s) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to copy input into output."); %(fail)s }; if (quadratic_%(name)s(%(Y)s, a, b, c) != 0) { PyErr_SetString(PyExc_RuntimeError, "Unable to compute quadratic function."); %(fail)s } """ % dict( name=name, coeff=sub["params"], fail=sub["fail"], X=inputs[0], Y=outputs[0], float_type=node.inputs[0].type.c_element_type(), )
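A hedged end-to-end sketch (the coefficient values and variable names are invented): the __props__ are bundled by params_type and arrive in perform/c_code as a single coefficients object.

x = at.vector("x")
f = aesara.function([x], QuadraticOpFunc(2.0, 3.0, 4.0)(x))
# 2*x**2 + 3*x + 4 evaluated at [0, 1, 2] gives [4, 9, 18].
assert np.allclose(f(np.array([0.0, 1.0, 2.0], dtype=x.dtype)), [4.0, 9.0, 18.0])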
class GpuMagmaSVD(GpuMagmaBase): """Computes the svd of a matrix :math:`A` using magma library. .. warning:: Because of implementation constraints, this Op returns outputs in order ``S, U, VT``. Use :func:`aesara.gpuarray.linalg.gpu_svd` to get them in expected order ``U, S, VT``. """ __props__ = ("full_matrices", "compute_uv") _cop_num_inputs = 1 _cop_num_outputs = 3 check_input = False params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type) def __init__(self, full_matrices=True, compute_uv=True): self.full_matrices = full_matrices self.compute_uv = compute_uv ExternalCOp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)") def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) A = gpu_contiguous(A) if A.ndim != 2: raise LinAlgError("Matrix rank error") if A.dtype != "float32": raise TypeError("only `float32` is supported for now") if self.compute_uv: return Apply( self, [A], # return S, U, VT [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)(), A.type(), A.type(), ], ) else: return Apply( self, [A], # return only S [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)() ], ) def prepare_node(self, node, storage_map, compute_map, impl): super().prepare_node(node, storage_map, compute_map, impl) # Check node to prevent eventual errors with old pickled nodes. if self.compute_uv: A, B, C = node.outputs # We expect order: S (vector), U (matrix), VT (matrix) assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, ( "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order." ) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def infer_shape(self, fgraph, node, shapes): (x_shape, ) = shapes M, N = x_shape K = tm.minimum(M, N) s_shape = (K, ) if self.compute_uv: u_shape = (M, M) if self.full_matrices else (M, K) vt_shape = (N, N) if self.full_matrices else (K, N) return [s_shape, u_shape, vt_shape] else: return [s_shape]
class GpuMagmaEigh(GpuMagmaBase): """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma library. Parameters ---------- UPLO : Specifies whether the calculation is done with the lower triangular part of matrix (`L`, default) or the upper triangular part (`U`). compute_v : If `True`, computes eigenvalues and eigenvectors (`True`, default). If `False`, computes only eigenvalues of matrix. """ __props__ = ("lower", "compute_v") _cop_num_inputs = 1 _cop_num_outputs = 2 check_input = False params_type = ParamsType(lower=bool_t, compute_v=bool_t, context=gpu_context_type) def __init__(self, UPLO="L", compute_v=True): assert UPLO in ["L", "U"] self.lower = UPLO == "L" self.compute_v = compute_v ExternalCOp.__init__(self, ["c_code/magma_eigh.c"], "APPLY_SPECIFIC(magma_eigh)") def make_node(self, A): ctx_name = infer_context_name(A) A = as_gpuarray_variable(A, ctx_name) A = gpu_contiguous(A) if A.ndim != 2: raise LinAlgError("Matrix rank error") if A.dtype != "float32": raise TypeError("only `float32` is supported for now") if self.compute_v: return Apply( self, [A], # return D, V [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)(), A.type(), ], ) else: return Apply( self, [A], # return D [ GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)() ], ) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context)
def test_params_type_filtering(self): shape_tensor5 = (1, 2, 2, 3, 2) size_tensor5 = (shape_tensor5[0] * shape_tensor5[1] * shape_tensor5[2] * shape_tensor5[3] * shape_tensor5[4]) random_tensor = np.random.normal( size=size_tensor5).reshape(shape_tensor5) w = ParamsType( a1=TensorType("int32", (False, False)), a2=TensorType("float64", (False, False, False, False, False)), a3=Generic(), ) # With a value that does not match the params type. o = Params( w, a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int64"), a2=random_tensor.astype("float32"), a3=2000, ) # should fail (o.a1 is not int32, o.a2 is not float64) with pytest.raises(TypeError): w.filter(o, True) # should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed) with pytest.raises(TypeError): w.filter(o, False, False) # Should pass. w.filter(o, strict=False, allow_downcast=True) # With a value that matches the params type. o1 = Params( w, a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"), a2=random_tensor.astype("float64"), a3=2000, ) # All should pass. w.filter(o1, strict=True) w.filter(o1, strict=False, allow_downcast=False) w.filter(o1, strict=False, allow_downcast=True) # Check values_eq and values_eq_approx. o2 = Params( w, a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"), a2=random_tensor.astype("float64"), a3=2000, ) assert w.values_eq(o1, o2) assert w.values_eq_approx(o1, o2) # Check value_eq_approx. # NB: I don't know exactly which kind of differences is rejected by values_eq but accepted by values_eq_approx. # So, I just play a little with float values. o3 = Params( w, a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype("int32"), a2=(random_tensor.astype("float32") * 10 / 2.2 * 2.19999999999 / 10).astype("float64"), a3=2000.0 - 0.00000000000000001, ) assert w.values_eq_approx(o1, o3)
class GpuMaxPoolRop(CGpuKernelBase): """ Implements the R-operator for the downsample operation. """ __props__ = ("ignore_border", "mode", "ndim") params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type) def __init__(self, ignore_border, mode="max", ndim=2): self.ndim = ndim self.ignore_border = ignore_border self.mode = mode CGpuKernelBase.__init__(self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)") assert mode == "max" assert ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self, **kwargs): return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"] def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir(), pygpu.get_include()] def make_node(self, inp, eval_point, ws, stride=None, pad=None): ctx_name = infer_context_name(inp) nd = self.ndim inp = as_gpuarray_variable(inp, ctx_name) assert inp.ndim == nd + 2 eval_point = as_gpuarray_variable(eval_point, ctx_name) assert eval_point.ndim == nd + 2 if stride is None: stride = ws if pad is None: pad = (0, ) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.ignore_border: raise ValueError("Padding works only with ignore_border=True") if isinstance(ws, (tuple, list)): if any(pad[i] >= ws[i] for i in range(nd)): raise ValueError("Padding must be smaller than strides") ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in int_dtypes: raise TypeError("Window shape parameters must be ints.") if stride.dtype not in int_dtypes: raise TypeError("Stride parameters must be ints.") if pad.dtype not in int_dtypes: raise TypeError("Padding parameters must be ints.") ws = aesara.tensor.cast(ws, "int64") stride = aesara.tensor.cast(stride, "int64") pad = aesara.tensor.cast(pad, "int64") return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()]) def infer_shape(self, fgraph, node, in_shapes): ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]] shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim) return [shp]
class GpuPool(CGpuKernelBase): """ Implement the max and average pooling on the gpu. """ __props__ = ("ignore_border", "mode", "ndim") params_type = ParamsType(ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type) def __init__(self, ignore_border, mode="max", ndim=2): self.ndim = ndim self.ignore_border = ignore_border if mode == "average": mode = "average_inc_pad" self.mode = mode CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)") assert PoolingMode_t.has_alias(self.mode) assert self.ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self, **kwargs): return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"] def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir(), pygpu.get_include()] def make_node(self, inp, ws, stride=None, pad=None): ctx_name = infer_context_name(inp) inp = as_gpuarray_variable(inp, ctx_name) nd = self.ndim assert inp.ndim == nd + 2 if stride is None: stride = ws if pad is None: pad = (0, ) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.ignore_border: raise ValueError("Padding works only with ignore_border=True") if isinstance(ws, (tuple, list)): if any(pad[i] >= ws[i] for i in range(nd)): raise ValueError("Padding must be smaller than strides") ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in int_dtypes: raise TypeError("Window shape parameters must be ints.") if stride.dtype not in int_dtypes: raise TypeError("Stride parameters must be ints.") if pad.dtype not in int_dtypes: raise TypeError("Padding parameters must be ints.") ws = aesara.tensor.cast(ws, "int64") stride = aesara.tensor.cast(stride, "int64") pad = aesara.tensor.cast(pad, "int64") return Apply(self, [inp, ws, stride, pad], [inp.type()]) def infer_shape(self, fgraph, node, in_shapes): ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]] shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim) return [shp] def grad(self, inp, grads): img, ws, stride, pad = inp (grad, ) = grads grad = gpu_contiguous(grad) disc = [aesara.gradient.DisconnectedType()() for i in inp[1:]] if self.mode == "max": out = self(img, ws, stride, pad) g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(img, out, grad, ws, stride, pad) return [g_out] + disc else: g_out = GpuAveragePoolGrad(ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode)(img, grad, ws, stride, pad) return [g_out] + disc def connection_pattern(self, node): return [[1], [0], [0], [0]] def R_op(self, inputs, eval_points): if self.mode != "max": # Rop for average or sum is simply pooling evaluated at eval point eval_inputs = [eval_points[0]] + inputs[1:] return [self(*eval_inputs)] # R_op can receive None as eval_points. # That mean there is no diferientiable path through that input # If this imply that you cannot compute some outputs, # return None for those. if eval_points[0] is None: return [None] z = self(*inputs) x, ws, stride, pad = inputs return [ GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(x, z, eval_points[0], ws, stride, pad) ]
class CumOp(COp): # See function cumsum/cumprod for docstring __props__ = ("axis", "mode") check_input = False params_type = ParamsType(c_axis=int_t, mode=EnumList(("MODE_ADD", "add"), ("MODE_MUL", "mul"))) def __init__(self, axis=None, mode="add"): if mode not in ("add", "mul"): raise ValueError(f'{type(self).__name__}: Unknown mode "{mode}"') self.axis = axis self.mode = mode c_axis = property(lambda self: np.MAXDIMS if self.axis is None else self.axis) def make_node(self, x): x = aet.as_tensor_variable(x) out_type = x.type() if self.axis is None: out_type = vector(dtype=x.dtype) # Flatten elif self.axis >= x.ndim or self.axis < -x.ndim: raise ValueError(f"axis(={self.axis}) out of bounds") return Apply(self, [x], [out_type]) def perform(self, node, inputs, output_storage, params): x = inputs[0] z = output_storage[0] if self.mode == "add": z[0] = np.cumsum(x, axis=self.axis) else: z[0] = np.cumprod(x, axis=self.axis) def grad(self, inputs, output_gradients): (x, ) = inputs (gi, ) = output_gradients if self.axis is None: if self.mode == "add": return [cumsum(gi[::-1])[::-1].reshape(x.shape)] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [cumsum((fx * gi)[::-1])[::-1].reshape(x.shape) / x] else: raise NotImplementedError( f'{type(self).__name__}: unknown gradient for mode "{self.mode}"' ) reverse_slicing = [slice(None, None, None)] * gi.ndim reverse_slicing[self.axis] = slice(None, None, -1) reverse_slicing = tuple(reverse_slicing) # We need to reverse the gradients along ``self.axis``, # compute cumsum, then reverse again if self.mode == "add": return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]] elif self.mode == "mul": fx = cumprod(x, axis=self.axis) return [ cumsum( (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x ] else: raise NotImplementedError( f'{type(self).__name__}: unknown gradient for mode "{self.mode}"' ) def infer_shape(self, fgraph, node, shapes): if self.axis is None: return [(prod(shapes[0]), )] # Flatten return shapes def c_code(self, node, name, inames, onames, sub): (x, ) = inames (z, ) = onames axis = self.axis fail = sub["fail"] params = sub["params"] code = (""" int axis = %(params)s->c_axis; if (axis == 0 && PyArray_NDIM(%(x)s) == 1) axis = NPY_MAXDIMS; npy_intp shape[1] = { PyArray_SIZE(%(x)s) }; if(axis == NPY_MAXDIMS && !(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0])) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s)); } else if(axis != NPY_MAXDIMS && !(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)))) { Py_XDECREF(%(z)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE(%(x)s)); } if (!%(z)s) %(fail)s; { PyObject * t = NULL; if(%(params)s->mode == MODE_ADD) t = PyArray_CumSum( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); else if(%(params)s->mode == MODE_MUL) t = PyArray_CumProd( %(x)s, axis, PyArray_TYPE(%(x)s), %(z)s); if (!t){ %(fail)s; } // Because PyArray_CumSum/CumProd returns a newly created reference on t. Py_XDECREF(t); } """ % locals()) return code def c_code_cache_version(self): return (8, ) def __str__(self): return f"{self.__class__.__name__}{{{self.axis}, {self.mode}}}"
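A brief sketch (input values assumed) showing the axis/mode params in action; cumsum and cumprod are the public wrappers around this Op.

x = aet.matrix("x")
f = aesara.function([x], CumOp(axis=1, mode="add")(x))
data = np.array([[1, 2, 3], [4, 5, 6]], dtype=x.dtype)
# Cumulative sums along axis 1: [[1, 3, 6], [4, 9, 15]]
assert np.array_equal(f(data), np.cumsum(data, axis=1))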
def params_type(self):
    return ParamsType(i=aesara.scalar.basic.int64)
class BaseCorrMM(OpenMPOp, _NoPythonOp): """ Base class for `CorrMM`, `CorrMM_gradWeights` and `CorrMM_gradInputs`. Cannot be used directly. Every sub-class must define internal attribute ``_direction`` out of __init__(). ``_direction`` must take one of following values: - "forward" to correlate bottom with weights and store results in top. - "backprop weights" to do a valid convolution of bottom with top (swapping the first two dimensions) and store results in weights. - "backprop inputs" to do a full convolution of top with weights (swapping the first two dimensions) and store results in bottom. Parameters ---------- border_mode : {'valid', 'full', 'half'} Additionally, the padding size could be directly specified by an integer, a pair of integers, or two pairs of integers. subsample Perform subsampling of the output (default: (1, 1)). filter_dilation Perform dilated correlation (default: (1,1)) num_groups Perform grouped convolutions (default: 1) unshared Perform unshared correlation (default: False) """ check_broadcast = False __props__ = ( "border_mode", "subsample", "filter_dilation", "num_groups", "unshared", ) _direction: Optional[str] = None params_type = ParamsType( direction=EnumList( ("DIRECTION_FORWARD", "forward"), # 0 ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"), # 1 ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"), ), # 2 dH=int64, dW=int64, dilH=int64, dilW=int64, padH_l=int64, padH_r=int64, padW_l=int64, padW_r=int64, num_groups=int64, unshared=int8, ) def __init__( self, border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None, ): super().__init__(openmp=openmp) if isinstance(border_mode, int): if border_mode < 0: raise ValueError("invalid border_mode {}, which must be a " "non-negative integer".format(border_mode)) border_mode = ((border_mode, border_mode), ) * 2 elif isinstance(border_mode, tuple): if len(border_mode) != 2: raise ValueError("invalid border_mode {} which must be a " "tuple of length 2".format(border_mode)) border = () for mode in border_mode: if isinstance(mode, tuple) and len(mode) == 2 and min(mode) >= 0: border += ((int(mode[0]), int(mode[1])), ) elif mode >= 0: border += ((int(mode), int(mode)), ) else: raise ValueError( "invalid border mode {}. The tuple can only contain " "integers or tuples of length 2".format(border_mode)) border_mode = border elif border_mode not in ("valid", "full", "half"): raise ValueError( "invalid border_mode {}, which must be either " '"valid", "full", "half", an integer or a tuple ' "of two integers or a pair of integers".format(border_mode)) self.border_mode = border_mode if len(subsample) != 2: raise ValueError("subsample must have two elements") if len(filter_dilation) != 2: raise ValueError("filter_dilation must have two elements") self.subsample = tuple(subsample) self.filter_dilation = tuple(filter_dilation) self.unshared = unshared if not config.blas__ldflags: # Aesara will use a NumPy C implementation of [sd]gemm_ instead. 
self.blas_type = "" else: if "openblas" in config.blas__ldflags: self.blas_type = "openblas" elif "mkl" in config.blas__ldflags: self.blas_type = "mkl" else: self.blas_type = "" if self._direction not in [ "forward", "backprop weights", "backprop inputs" ]: raise ValueError("_direction must be one of 'forward', " "'backprop weights', 'backprop inputs'") if num_groups < 1: raise ValueError("Number of groups should be greater than 0") self.num_groups = num_groups @property def pad(self): if self.border_mode == "half": return ((-1, -1), ) * 2 elif self.border_mode == "full": return ((-2, -2), ) * 2 elif isinstance(self.border_mode, tuple): return self.border_mode else: assert self.border_mode == "valid" return ((0, 0), ) * 2 # Direction should be converted to real enum value, # as it is compared to integer later in c_code_helper(). direction = property( lambda self: self.params_type.enum_from_alias(self._direction)) dH = property(lambda self: self.subsample[0]) dW = property(lambda self: self.subsample[1]) dilH = property(lambda self: self.filter_dilation[0]) dilW = property(lambda self: self.filter_dilation[1]) padH_l = property(lambda self: self.pad[0][0]) padH_r = property(lambda self: self.pad[0][1]) padW_l = property(lambda self: self.pad[1][0]) padW_r = property(lambda self: self.pad[1][1]) def __str__(self): return "{}{{{}, {}, {}, {} {}}}".format( self.__class__.__name__, self.border_mode, str(self.subsample), str(self.filter_dilation), str(self.num_groups), str(self.unshared), ) @staticmethod def as_common_dtype(in1, in2): """ Upcast input variables if necessary. """ dtype = aesara.scalar.upcast(in1.dtype, in2.dtype) return in1.astype(dtype), in2.astype(dtype) def __setstate__(self, d): self.__dict__.update(d) if not hasattr(self, "num_groups"): self.num_groups = 1 def c_support_code(self, **kwargs): ccodes = blas_headers.blas_header_text() if self.blas_type == "openblas": ccodes += blas_headers.openblas_threads_text() elif self.blas_type == "mkl": ccodes += blas_headers.mkl_threads_text() return ccodes def c_libraries(self, **kwargs): return ldflags() def c_compile_args(self, **kwargs): compile_args = ldflags(libs=False, flags=True) compile_args += super().c_compile_args(**kwargs) return compile_args def c_lib_dirs(self, **kwargs): return ldflags(libs=False, libs_dir=True) def c_header_dirs(self, **kwargs): return ldflags(libs=False, include_dir=True) def c_headers(self, **kwargs): headers = ["<stdio.h>"] headers += super().c_headers(**kwargs) return headers def c_code_cache_version(self): # raise this whenever modifying any of the support_code_files return (10, self.openmp, blas_header_version()) def c_support_code_apply(self, node, nodename): # REMEMBER TO RAISE c_code_cache_version when changing any of # these files sub = {} dtype = str(node.__dict__["inputs"][0].dtype) assert dtype in ("float32", "float64") if dtype == "float32": sub["gemm"] = "sgemm_" sub["gemv"] = "sgemv_" sub["float_type"] = "npy_float" sub["float_typenum"] = "NPY_FLOAT" sub["n_bytes"] = 4 sub["c_float_type"] = "float" else: sub["gemm"] = "dgemm_" sub["gemv"] = "dgemv_" sub["float_type"] = "npy_double" sub["float_typenum"] = "NPY_DOUBLE" sub["n_bytes"] = 8 sub["c_float_type"] = "double" if self.openmp: sub["omp_flags"] = "#pragma omp parallel for schedule(static)" sub["omp_get_max_threads"] = "omp_get_max_threads()" sub["omp_get_thread_num"] = "omp_get_thread_num()" if self.blas_type == "openblas": sub["blas_set_num_threads"] = "openblas_set_num_threads" sub["blas_get_num_threads"] = 
"openblas_get_num_threads()" elif self.blas_type == "mkl": sub["blas_set_num_threads"] = "mkl_set_num_threads" sub["blas_get_num_threads"] = "mkl_get_max_threads()" else: sub["blas_set_num_threads"] = "" sub["blas_get_num_threads"] = "0" else: sub["omp_flags"] = "" sub["omp_get_max_threads"] = "1" sub["omp_get_thread_num"] = "0" sub["blas_set_num_threads"] = "" sub["blas_get_num_threads"] = "0" final_code = "" with open( os.path.join( os.path.split(__file__)[0], os.path.join("c_code", "corr_gemm.c"))) as f: code = f.read() final_code += code return final_code % sub def c_code_helper(self, bottom, weights, top, sub, height=None, width=None): """ This generates the C code for CorrMM (direction="forward"), CorrMM_gradWeights (direction="backprop weights"), and CorrMM_gradInputs (direction="backprop inputs"). Depending on the direction, one of bottom, weights, top will receive the output, while the other two serve as inputs. :param bottom: Variable name of the input images in the forward pass, or the gradient of the input images in backprop wrt. inputs :param weights: Variable name of the filters in the forward pass, or the gradient of the filters in backprop wrt. weights :param top: Variable name of the output images / feature maps in the forward pass, or the gradient of the outputs in the backprop passes :param sub: Dictionary of substitutions useable to help generating the C code. :param height: If self.subsample[0] != 1, a variable giving the height of the filters for direction="backprop weights" or the height of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the height of the filters for direction="backprop weights". Ignored otherwise. :param width: If self.subsample[1] != 1, a variable giving the width of the filters for direction="backprop weights" or the width of the input images for direction="backprop inputs". If self.border_mode == 'half', a variable giving the width of the filters for direction="backprop weights". Ignored otherwise. """ # When subsampling, we cannot unambiguously infer the height and width # of bottom and weights from top, so we require them to be given. # Similarly, when border_mode="half", we cannot infer the weight size. 
if height: height = f"(*(npy_int64 *)(PyArray_DATA({height})))" else: if ((self.direction != 0) and (self.dH != 1)) or ((self.direction == 1) and (self.padH_l == -1 or self.padH_r == -1)): raise ValueError( "height must be given for backprop with vertical sampling or border_mode='half'" ) height = "-1" if width: width = f"(*(npy_int64 *)(PyArray_DATA({width})))" else: if ((self.direction != 0) and (self.dW != 1)) or ((self.direction == 1) and (self.padW_l == -1 or self.padW_r == -1)): raise ValueError( "width must be given for backprop with horizontal sampling or border_mode='half'" ) width = "-1" return """ // Mandatory args int direction = %(params)s->direction; // forward, bprop weights, bprop inputs // Optional args int dH = %(params)s->dH; int dW = %(params)s->dW; int dilH = %(params)s->dilH; int dilW = %(params)s->dilW; int padH_l = %(params)s->padH_l; int padH_r = %(params)s->padH_r; int padW_l = %(params)s->padW_l; int padW_r = %(params)s->padW_r; int numgroups = %(params)s->num_groups; int unshared = %(params)s->unshared; PyArrayObject * bottom = %(bottom)s; PyArrayObject * weights = %(weights)s; PyArrayObject * top = %(top)s; PyArrayObject * out2 = NULL; PyArrayObject **out = NULL; switch(%(params)s->direction) { case DIRECTION_FORWARD: out = &%(top)s; break; case DIRECTION_BACKPROP_WEIGHTS: out = &%(weights)s; break; case DIRECTION_BACKPROP_INPUTS: out = &%(bottom)s; break; default: PyErr_SetString(PyExc_ValueError, "CPU CorrMM: Invalid direction."); {%(fail)s} break; } int wdim, odim; wdim = unshared ? 6 : 4; odim = 4; //Can be set to 6 later for unshared backprop wrt weights // Obtain or infer kernel width and height // (we need to know it early to be able to handle auto-padding) int kH, kW, dil_kH, dil_kW; if (direction != 1) { // weight is an input variable, we can just read its shape kH = PyArray_DIMS(weights)[wdim-2]; kW = PyArray_DIMS(weights)[wdim-1]; } else { if (%(height)s != -1) { // kernel height is specified (perhaps vertical subsampling or half padding) kH = %(height)s; } else if (padH_l == -2 || padH_r == -2) { // vertical full padding, we can infer the kernel height kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1; } else { // explicit padding, we can infer the kernel height kH = (PyArray_DIMS(bottom)[2] + padH_l + padH_r - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1; } if (%(width)s != -1) { // kernel width is specified (perhaps horizontal subsampling or half padding) kW = %(width)s; } else if (padW_l == -2 || padW_r == -2) { kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } else { kW = (PyArray_DIMS(bottom)[3] + padW_l + padW_r - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; } } // Implicit dilated kernel size dil_kH = (kH - 1) * dilH + 1; dil_kW = (kW - 1) * dilW + 1; // Auto-padding if requested if (padH_l == -1 || padH_r == -1) { // vertical half padding padH_l = padH_r = dil_kH / 2; } else if (padH_l == -2 || padH_r == -2) { // vertical full padding padH_l = padH_r = dil_kH - 1; } else if (padH_l < -2 || padH_r < -2) { PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH_l and padH_r must be >= -2"); %(fail)s } if (padW_l == -1 || padW_r == -1) { // horizontal half padding padW_l = padW_r = dil_kW / 2; } else if (padW_l == -2 || padW_r == -2) { // horizontal full padding padW_l = padW_r = dil_kW - 1; } else if (padW_l < -2 || padW_r < -2) { PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW_l and padW_r must be >= -2"); %(fail)s } // Infer output shape npy_intp out_dim[6]; 
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights switch(direction) { case 0: // forward pass // output is top: (batchsize, num_filters, height, width) // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1 out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0]; out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1); out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1); if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { if (unshared) { PyErr_Format(PyExc_ValueError, "CorrMM: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3]); } else { PyErr_Format(PyExc_ValueError, "CorrMM: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3]); } %(fail)s } break; case 1: // backprop wrt. weights // output is weights: (num_filters, num_channels, height, width) // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1 out_dim[0] = (npy_intp)PyArray_DIMS(top)[1]; if (unshared){ odim = 6; out_dim[1] = (npy_intp)PyArray_DIMS(top)[2]; out_dim[2] = (npy_intp)PyArray_DIMS(top)[3]; } out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups; out_dim[wdim-2] = (npy_intp)kH; // already inferred further above out_dim[wdim-1] = (npy_intp)kW; // how convenient if (unshared) { if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0 || out_dim[4] <= 0 || out_dim[5] <= 0){ PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. weights: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } else { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. 
weights: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1], (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3], (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } break; case 2: // backprop wrt. inputs // output is bottom: (batchsize, num_channels, height, width) // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad out_dim[0] = (npy_intp)PyArray_DIMS(top)[0]; out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups; out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r); out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r); if (unshared) { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. inputs: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } else { if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) { PyErr_Format(PyExc_ValueError, "CorrMM backprop wrt. inputs: impossible output shape\\n" " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" " weights shape: %%ld x %%ld x %%ld x %%ld\\n" " top shape: %%ld x %%ld x %%ld x %%ld\\n", (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2], (long int)out_dim[3], (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1], (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3], (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1], (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]); %(fail)s } } break; default: PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n"); %(fail)s } // Prepare output array int typenum; int failure; failure = !(*out && PyArray_NDIM(*out)==odim && PyArray_IS_C_CONTIGUOUS(*out) && PyArray_DIMS(*out)[0]==out_dim[0] && PyArray_DIMS(*out)[1]==out_dim[1] && PyArray_DIMS(*out)[2]==out_dim[2] && PyArray_DIMS(*out)[3]==out_dim[3]); if (odim == 6){ failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4] && PyArray_DIMS(*out)[5]==out_dim[5]); } if ( failure ) { Py_XDECREF(*out); if (direction != 1) { typenum = PyArray_TYPE(weights); } else { typenum = PyArray_TYPE(bottom); } //Change to PyArray_ZEROS which is faster than PyArray_EMPTY. 
*out = (PyArrayObject*)PyArray_ZEROS(odim, out_dim, typenum, 0); if (NULL == *out) { if (odim == 4) { PyErr_Format(PyExc_RuntimeError, "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld", (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]); } if (odim == 6) { PyErr_Format(PyExc_RuntimeError, "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld", (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4], (long long)out_dim[5]); } %(fail)s } } // Call corrMM code out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH_l, padH_r, padW_l, padW_r, numgroups, unshared); if (out2==NULL){ %(fail)s } assert (out2 == *out); """ % dict( bottom=bottom, weights=weights, top=top, height=height, width=width, fail=sub["fail"], params=sub["params"], )
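A small sketch of the direction and padding encoding described above. _ForwardCorrMM is a hypothetical subclass used only to expose the mapping; real graphs use CorrMM and its gradient ops.

class _ForwardCorrMM(BaseCorrMM):
    _direction = "forward"  # hypothetical concrete subclass, for illustration only

op = _ForwardCorrMM(border_mode="half")
# The string alias is converted to the enum value handed to the C code ...
assert op.direction == op.params_type.DIRECTION_FORWARD == 0
# ... and the 'half' border mode is encoded as -1 padding, resolved later in C.
assert op.pad == ((-1, -1), (-1, -1))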
class CheckAndRaise(COp): """An `Op` that checks conditions and raises an exception if they fail. This `Op` returns its "value" argument if its condition arguments are all ``True``; otherwise, it raises a user-specified exception. """ _f16_ok = True __props__ = ("msg", "exc_type") view_map = {0: [0]} check_input = False params_type = ParamsType(exc_type=exception_type) def __init__(self, exc_type, msg=""): if not issubclass(exc_type, Exception): raise ValueError("`exc_type` must be an Exception subclass") self.exc_type = exc_type self.msg = msg def __str__(self): return f"CheckAndRaise{{{self.exc_type}({self.msg})}}" def __eq__(self, other): if type(self) != type(other): return False if self.msg == other.msg and self.exc_type == other.exc_type: return True return False def __hash__(self): return hash((self.msg, self.exc_type)) def make_node(self, value: Variable, *conds: Tuple[Variable]): """ Parameters ========== value The value to return if `conds` all evaluate to ``True``; otherwise, `self.exc_type` is raised. conds The conditions to evaluate. """ import aesara.tensor as at if not isinstance(value, Variable): value = at.as_tensor_variable(value) conds = [at.as_tensor_variable(c) for c in conds] assert all(c.type.ndim == 0 for c in conds) return Apply( self, [value] + conds, [value.type()], ) def perform(self, node, inputs, outputs, params): (out, ) = outputs val, *conds = inputs out[0] = val if not np.all(conds): raise self.exc_type(self.msg) def grad(self, input, output_gradients): return output_gradients + [DisconnectedType()()] * (len(input) - 1) def connection_pattern(self, node): return [[1]] + [[0]] * (len(node.inputs) - 1) def c_code(self, node, name, inames, onames, props): value_name, *cond_names = inames out_name = onames[0] check = [] fail_code = props["fail"] param_struct_name = props["params"] msg = self.msg.replace('"', '\\"').replace("\n", "\\n") for idx, cond_name in enumerate(cond_names): check.append(f""" if(PyObject_IsTrue((PyObject *){cond_name}) == 0) {{ PyObject * exc_type = {param_struct_name}->exc_type; Py_INCREF(exc_type); PyErr_SetString(exc_type, "{msg}"); Py_XDECREF(exc_type); {indent(fail_code, " " * 4)} }} """) check = "\n".join(check) res = f""" {check} Py_XDECREF({out_name}); {out_name} = {value_name}; Py_INCREF({value_name}); """ return res def c_code_cache_version(self): return (1, 0) def infer_shape(self, fgraph, node, input_shapes): return [input_shapes[0]]
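A usage sketch (names and values assumed): every condition input must be a 0-d truthy value for the first argument to pass through unchanged.

x = at.scalar("x")
checked = CheckAndRaise(ValueError, "x must be positive")(x, x > 0)
f = aesara.function([x], checked)
f(2.0)     # returns array(2.0)
# f(-1.0) raises ValueError: x must be positive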
class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp): """ Images2Neibs for the GPU. """ params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type) def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, ten4, neib_shape, neib_step=None): ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4)) neib_shape = aet.as_tensor_variable(neib_shape) if neib_step is None: neib_step = neib_shape else: neib_step = aet.as_tensor_variable(neib_step) assert ten4.ndim == 4 assert neib_shape.ndim == 1 assert neib_step.ndim == 1 assert neib_shape.dtype in integer_dtypes assert neib_step.dtype in integer_dtypes return Apply( self, [ten4, neib_shape, neib_step], [ GpuArrayType( broadcastable=(False, False), dtype=ten4.type.dtype, context_name=ten4.type.context_name, )() ], ) def c_code_cache_version(self): return (14,) def c_headers(self, **kwargs): return ["<numpy_compat.h>", "<gpuarray/types.h>"] def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here. mode_constants = self.BORDER_MODE.c_support_code() kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """#include "cluda.h" // a version that uses less registers but doesn't work in all cases. %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); ga_int i = LID_1; // loop over c { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } ga_int j = LID_0; // loop over d { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED){ ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int 
z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } }""" % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var) ) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """#include "cluda.h" %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (ga_int i = LID_1; i < c; i+=LDIM_1) { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } // loop over d for (ga_int j = LID_0; j < d; j+=LDIM_0) { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED) { ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } } """ % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var) ) return kernels def c_support_code(self, **kwargs): return """ template <typename T> static T 
ceil_intdiv(T a, T b) { return (a/b) + ((a % b) ? 1: 0); } """ def c_code(self, node, name, inp, out, sub): err_check = """ if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: *fptr: %%s.", GpuKernel_error(fptr, err)); %(fail)s; } """ % dict( fail=sub["fail"] ) # NB: To reduce C code variability: # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node) # For typecode_z, I use ten4->ga.typecode (for same reason as above) return """ int grid_c = -1; int grid_d = -1; size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga); size_t itemsize_z = itemsize_ten4; int typecode_z = %(ten4)s->ga.typecode; { if (PyGpuArray_NDIM(%(ten4)s) != 4) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: pvals wrong rank"); %(fail)s; } if (PyArray_NDIM(%(neib_shape)s) != 1) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: unis wrong rank"); %(fail)s; } if (PyArray_DIMS(%(neib_shape)s)[0] != 2) { PyErr_Format(PyExc_ValueError, "GpuImages2Neibs: neib_shape has to contain two" " elements"); %(fail)s; } const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); if (step_x <=0 || step_y <=0) { PyErr_Format(PyExc_ValueError, "neib_step wrong step ; values <= 0. Got %%lld %%lld.", (long long) step_x, (long long) step_y); %(fail)s; } if (c <=0 || d <=0) { PyErr_Format(PyExc_ValueError, "neib_shape values <= 0. Got %%lld %%lld.", (long long)c, (long long)d); %(fail)s; } if (%(params)s->mode == MODE_WRAP_CENTERED) { if (c%%2!=1 || d%%2!=1){ PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes"); %(fail)s; } if ( PyGpuArray_DIMS(%(ten4)s)[2] < c || PyGpuArray_DIMS(%(ten4)s)[3] < d) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs: in wrap_centered mode," " don't support image shapes smaller then" " the patch shapes: neib_shape=(%%d,%%d)," " ten4[2:]=[%%d,%%d]", c, d, PyGpuArray_DIMS(%(ten4)s)[2], PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]), (size_t)step_x); grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]), (size_t)step_y); } else if (%(params)s->mode == MODE_VALID) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_IGNORE_BORDERS) { //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y); } else if (%(params)s->mode == MODE_HALF) { if ( 
((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[0]=%%d, neib_step[0]=%%d and" " ten4.shape[2]=%%d not consistent", c, step_x, PyGpuArray_DIMS(%(ten4)s)[2]); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:" " neib_shape[1]=%%d, neib_step[1]=%%d and" " ten4.shape[3]=%%d not consistent", d, step_y, PyGpuArray_DIMS(%(ten4)s)[3]); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y); } else if (%(params)s->mode == MODE_FULL) { if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) || ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%ld, neib_step[0]=%%ld and" " ten4.shape[2]=%%ld not consistent", (long int)c, (long int)step_x, (long int)(PyGpuArray_DIMS(%(ten4)s)[2])); %(fail)s; } if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) || ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0)) { PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%ld, neib_step[1]=%%ld and" " ten4.shape[3]=%%ld not consistent", (long int)d, (long int)step_y, (long int)(PyGpuArray_DIMS(%(ten4)s)[3])); %(fail)s; } //number of patch in height grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x); //number of patch in width grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y); } else { PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode); %(fail)s; } // new dimensions for z const int z_dim1 = c * d; const int z_dim0 = grid_c * grid_d * PyGpuArray_DIMS(%(ten4)s)[1] * PyGpuArray_DIMS(%(ten4)s)[0]; if ((NULL == %(z)s) || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0) || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1)) { Py_XDECREF(%(z)s); size_t dims[2]; dims[0] = z_dim0; dims[1] = z_dim1; %(z)s = pygpu_empty(2, dims, typecode_z, GA_C_ORDER, %(params)s->context, Py_None); if (!%(z)s) { PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" " failed to alloc z output"); %(fail)s; } } } { // NESTED SCOPE const int mode = %(params)s->mode; const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0]; const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1]; const int height = PyGpuArray_DIMS(%(ten4)s)[2]; const int width = PyGpuArray_DIMS(%(ten4)s)[3]; const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 0); const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1( %(neib_shape)s, 1); const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); size_t threads_per_block[3] = {d, c, 1}; //get the max threads per blocks size_t max_threads_dim; int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims"); %(fail)s; } while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--; while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--; //Make bigger block to have better memory access pattern and //a higher core utilisation. 
for smaller patch size while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 && threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){ threads_per_block[2]++; } int nb_block; if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0) nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]; else nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1; size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1}; GpuKernel *fptr; if(threads_per_block[0]==d && threads_per_block[1]==c){ fptr = &k_multi_warp_less_%(name)s; }else{ fptr = &k_multi_warp_%(name)s; } /* printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n", max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1], n_blocks[2]); */ size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4; size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4; size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4; size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4; size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z; size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z; void *kernel_params[] = {(void *)&mode, (void *)&nb_batch, (void *)&nb_stack, (void *)&height, (void *)&width, (void *)&c, (void *)&d, (void *)&step_x, (void *)&step_y, (void *)&grid_c, (void *)&grid_d, (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2, (void *)&stride_A3, (void *)%(ten4)s->ga.data, (void *)&%(ten4)s->ga.offset, (void *)&stride_Z0, (void *)&stride_Z1, (void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset}; err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params); %(err_check)s } // END NESTED SCOPE """ % dict( ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0], dtype_neib_shape=node.inputs[1].dtype, dtype_neib_step=node.inputs[2].dtype, err_check=err_check, name=name, params=sub["params"], fail=sub["fail"], )
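# Editor's note: the two kernels above flatten each (c, d) patch of the input
# into one row of the output.  The hypothetical helper below restates the
# index arithmetic for MODE_VALID in plain NumPy (illustration only, not the
# reference implementation used by the tests).
def _images2neibs_valid_reference(ten4, neib_shape, neib_step):
    import numpy as np

    nb_batch, nb_stack, height, width = ten4.shape
    c, d = neib_shape
    step_x, step_y = neib_step
    grid_c = 1 + (height - c) // step_x  # number of patches along the height
    grid_d = 1 + (width - d) // step_y   # number of patches along the width

    z = np.empty((nb_batch * nb_stack * grid_c * grid_d, c * d), ten4.dtype)
    for n in range(nb_batch):
        for s in range(nb_stack):
            for a in range(grid_c):
                for b in range(grid_d):
                    # Same row index as ``z_row`` in the kernels.
                    z_row = b + grid_d * (a + grid_c * (s + nb_stack * n))
                    patch = ten4[
                        n,
                        s,
                        a * step_x : a * step_x + c,
                        b * step_y : b * step_y + d,
                    ]
                    # C-order flattening matches ``z_col = j + d * i``.
                    z[z_row] = patch.reshape(c * d)
    return z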
class GpuAveragePoolGrad(CGpuKernelBase): """ Implement the grad of average pooling on the gpu. """ __props__ = ("ignore_border", "mode", "ndim") params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type) def __init__(self, ignore_border, mode="max", ndim=2): self.ndim = ndim self.ignore_border = ignore_border if mode == "average": mode = "average_inc_pad" self.mode = mode CGpuKernelBase.__init__(self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)") assert mode in ("sum", "average_inc_pad", "average_exc_pad") assert ndim in [2, 3] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_headers(self, **kwargs): return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"] def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir(), pygpu.get_include()] def make_node(self, inp, out_grad, ws, stride=None, pad=None): ctx_name = infer_context_name(inp, out_grad) nd = self.ndim inp = as_gpuarray_variable(inp, ctx_name) assert inp.ndim == nd + 2 out_grad = as_gpuarray_variable(out_grad, ctx_name) assert out_grad.ndim == nd + 2 assert out_grad.ndim == inp.ndim if stride is None: stride = ws if pad is None: pad = (0, ) * nd elif isinstance(pad, (tuple, list)): if max(pad) != 0 and not self.mode == "average_exc_pad": raise ValueError("Padding must be zero for average_exc_pad") ws = as_tensor_variable(ws) stride = as_tensor_variable(stride) pad = as_tensor_variable(pad) assert ws.ndim == stride.ndim and ws.ndim == pad.ndim assert ws.ndim == 1 if ws.dtype not in int_dtypes: raise TypeError("Window shape parameters must be ints.") if stride.dtype not in int_dtypes: raise TypeError("Stride parameters must be ints.") if pad.dtype not in int_dtypes: raise TypeError("Padding parameters must be ints.") ws = aesara.tensor.cast(ws, "int64") stride = aesara.tensor.cast(stride, "int64") pad = aesara.tensor.cast(pad, "int64") return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()]) def infer_shape(self, fgraph, node, in_shapes): return [in_shapes[0]] def grad(self, inp, grads): x, gz, ws, stride, pad = inp (ggx, ) = grads return [ aesara.tensor.zeros_like(x), GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(ggx, ws, stride, pad), ] + [aesara.gradient.DisconnectedType()() for i in inp[2:]] def connection_pattern(self, node): return [[1], [1], [0], [0], [0]]
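# Editor's sketch (hypothetical helper, not the Op's C kernel): the gradient
# of average pooling spreads each output-gradient value uniformly over the
# window it was averaged from.  Shown here for the 2-D "average_inc_pad"
# case with zero padding; overlapping windows simply accumulate.
def _average_pool_grad_2d_sketch(x_shape, gz, ws, stride=None):
    import numpy as np

    if stride is None:
        stride = ws
    h, w = x_shape[-2:]
    gx = np.zeros(x_shape, dtype=gz.dtype)
    for r in range(gz.shape[-2]):
        for col in range(gz.shape[-1]):
            r0, c0 = r * stride[0], col * stride[1]
            r1, c1 = min(r0 + ws[0], h), min(c0 + ws[1], w)
            # "inc_pad" divides by the full window size.
            gx[..., r0:r1, c0:c1] += gz[..., r, col, None, None] / (ws[0] * ws[1])
    return gx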
class GpuSparseBlockGemv(_NoPythonExternalCOp): """ GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more information. This should not be directly called since the interface is subject to change without notice. Use the sandbox.blocksparse.sparse_block_dot() function for a stable interface. """ __props__ = ("inplace", ) params_type = ParamsType(inplace=bool_t, context=gpu_context_type) # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False. def __init__(self, inplace=False): super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)") self.inplace = inplace if self.inplace: self.destroy_map = {0: [0]} def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir()] def c_headers(self, **kwargs): return [ "<gpuarray/buffer_blas.h>", "<gpuarray/buffer.h>", "<gpuarray_helper.h>", ] def make_node(self, o, W, h, inputIdx, outputIdx): ctx = infer_context_name(o, W, h) o = as_gpuarray_variable(o, ctx) W = as_gpuarray_variable(W, ctx) h = as_gpuarray_variable(h, ctx) inputIdx = as_tensor_variable(inputIdx) outputIdx = as_tensor_variable(outputIdx) assert o.ndim == 3 assert W.ndim == 4 assert h.ndim == 3 assert inputIdx.ndim == 2 assert outputIdx.ndim == 2 assert inputIdx.type.dtype in discrete_dtypes assert outputIdx.type.dtype in discrete_dtypes return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()]) def infer_shape(self, fgraph, node, input_shapes): return [input_shapes[0]] def grad(self, inputs, grads): o, W, h, inputIdx, outputIdx = inputs go = grads[0] Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx) hgrad = gpu_sparse_block_gemv(h.zeros_like(), W.dimshuffle( (1, 0, 3, 2)), go, outputIdx, inputIdx) return [ go, Wgrad, hgrad, grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"), grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"), ]
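# Editor's sketch: a NumPy restatement of the block-sparse gemv this Op
# computes, following the SparseBlockGemv semantics its docstring points to.
# The helper name and loop form are illustrative only; shapes are as checked
# in ``make_node``: o is (batch, oWin, oSize), W is (iBlocks, oBlocks, iSize,
# oSize), h is (batch, iWin, iSize), inputIdx/outputIdx are 2-D index arrays.
def _sparse_block_gemv_sketch(o, W, h, inputIdx, outputIdx):
    import numpy as np

    out = o.copy()
    for b in range(o.shape[0]):          # batch
        for j in range(o.shape[1]):      # output windows
            for i in range(h.shape[1]):  # input windows
                out[b, j] += np.dot(h[b, i], W[inputIdx[b, i], outputIdx[b, j]])
    return out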
class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp): """ Parameters ---------- axis Can not be None. If you want the array flattened, do it before. """ SUPPORTED_NDIMS = 3 __props__ = ("axis", "mode") params_type = ParamsType(axis=scalar.int32, context=gpu_context_type) def __init__(self, axis, mode="add"): assert axis is not None self.axis = int(axis) self.mode = mode def __eq__(self, other): if type(other) != type(self): return False return self.axis == other.axis and self.mode == other.mode def __hash__(self): return hash(self.axis) ^ hash(self.mode) def c_code_cache_version(self): return (7, ) def c_headers(self, **kwargs): return [ "<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>" ] def c_header_dirs(self, **kwargs): return [gpuarray_helper_inc_dir()] def get_params(self, node): return self.params_type.get_params(self, context=node.inputs[0].type.context) def make_node(self, x): assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp" context_name = infer_context_name(x) x = as_gpuarray_variable(x, context_name) if x.ndim > GpuCumOp.SUPPORTED_NDIMS: raise NotImplementedError("Only cum op on 1D, 2D and\ 3D arrays are supported right now!") if self.axis >= x.ndim or self.axis < -x.ndim: raise ValueError(f"axis(={self.axis}) out of bounds") return Apply(self, [x], [x.type()]) def gpu_kernels(self, node, nodename): kernels = [] # cumadd kname = "k_cumadd" op = {"mul": "*", "add": "+"}[self.mode] k_var = "k_cumadd_" + nodename dtype_x = node.inputs[0].dtype flags = Kernel.get_flags(dtype_x) code = ("""#include "cluda.h" KERNEL void %(kname)s(float* input, ga_size input_offset, float* output, ga_size output_offset, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z, const int offsetY, const int offsetZ, const int beforeLastElementIdx, const int lastElementIdx){ input = (float *)(((char *)input) + input_offset); output = (float *)(((char *)output) + output_offset); int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z; int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z; int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input; int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output; int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output; output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast]; } """ % locals()) params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "intc", "intc", "intc", "intc", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) # blockCumOp kname = "k_blockCumOp" k_var = "k_blockCumOp_" + nodename params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "int32", "int32", gpuarray.GpuArray, gpuarray.SIZE, ] code = ("""#include "cluda.h" // helper functions WITHIN_KERNEL void k_reductionPhase(float* partialCumOp) { // Traverse down from leaves to root building partial sums at internal nodes in the tree. 
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) { local_barrier(); unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; if (index < blockDim.x*2) { partialCumOp[index] %(op)s= partialCumOp[index - stride]; } } } WITHIN_KERNEL void k_fetchData(float* partialCumOp, float* input, int globalThreadID, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { // blockIdx.y and blockIdx.z represents the current independent cum op int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; partialCumOp[threadIdx.x*2] = input[idx_even]; partialCumOp[threadIdx.x*2 + 1] = input[idx_odd]; } WITHIN_KERNEL void k_reversePhase(float* partialCumOp) { // Traverse back up the tree building the scan from the partial sums for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) { local_barrier(); unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; if (index + stride < blockDim.x*2) { partialCumOp[index + stride] %(op)s= partialCumOp[index]; } } } WITHIN_KERNEL void k_pushData(float* partialCumOp, float* output, int globalThreadID, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { local_barrier(); // blockIdx.y and blockIdx.z represents the current independent cum op int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; output[idx_even] = partialCumOp[threadIdx.x*2]; output[idx_odd] = partialCumOp[threadIdx.x*2 + 1]; } KERNEL void k_blockCumOp(float* input, ga_size input_offset, float* output, ga_size output_offset, size_t nbElementsPerCumOp, ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z, int offsetY, int offsetZ, float* blockSum, ga_size blockSum_offset) { input = (float *)(((char *)input) + input_offset); output = (float *)(((char *)output) + output_offset); blockSum = (float *)(((char *)blockSum) + blockSum_offset); // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis. // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case. int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x; // Check if current thread has data to process. if (globalThreadID >= (nbElementsPerCumOp+1)/2) { return; } extern __shared__ float partialCumOp[]; // Load data in shared memory k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ); // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree). // The tree is sweeped from the leaves to the root and from the root to the leaves. 
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf k_reductionPhase(partialCumOp); k_reversePhase(partialCumOp); // Write the final output to global memory k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ); if (blockSum != NULL){ if (threadIdx.x == blockDim.x - 1) { blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1]; } } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) # k_finalCumOp kname = "k_finalCumOp" k_var = "k_finalCumOp_" + nodename code = ("""#include "cluda.h" KERNEL void k_finalCumOp(float* output, ga_size output_offset, float* blockSum, ga_size blockSum_offset, size_t nbElementsPerCumOp, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, int offsetY, int offsetZ) { output = (float *)(((char *)output) + output_offset); blockSum = (float *)(((char *)blockSum) + blockSum_offset); int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; // Check if current has data to process. if (globalThreadID >= (nbElementsPerCumOp+1)/2) return; int idY = blockIdx.y + offsetY; int idZ = blockIdx.z + offsetZ; const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ]; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; output[idx_even] %(op)s= currentBlockSum; output[idx_odd] %(op)s= currentBlockSum; } """ % locals()) params = [ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, "int32", "int32", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels def c_code(self, node, nodename, inp, out, sub): if node.inputs[0].type.context.kind != b"cuda": raise NotImplementedError("cuda only") return """ const size_t* shape = PyGpuArray_DIMS(%(x)s); bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s); int axis = %(params)s->axis; if (axis < 0) { // Convert negative axis to positive axis. 
axis += PyGpuArray_NDIM(%(x)s); } if (aesara_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) { %(fail)s; } { // Namespace for kernel calls // size_t max_threads_dim0; size_t max_grid_size1; size_t max_grid_size2; int err; err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1"); %(fail)s; } err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); %(fail)s; } if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){ %(fail)s; } } """ % dict( x=inp[0], z=out[0], nodename=nodename, fail=sub["fail"], params=sub["params"], ) def c_support_code_struct(self, node, nodename): code = (""" int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) { size_t shape[3] = { 1, 1, 1 }; ssize_t inputStrides_x; ssize_t inputStrides_y; ssize_t inputStrides_z; ssize_t outputStrides_x; ssize_t outputStrides_y; ssize_t outputStrides_z; switch (PyGpuArray_NDIM(input)) { case 1: shape[0] = PyGpuArray_DIMS(input)[0]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); break; case 2: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); break; case 3: shape[0] = PyGpuArray_DIMS(input)[0]; shape[1] = PyGpuArray_DIMS(input)[1]; shape[2] = PyGpuArray_DIMS(input)[2]; inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float); inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float); inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float); outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float); outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float); outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float); break; default: PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis"); return -1; } if (shape[axis] <= 1) { int err = pygpu_move(output, input); return err; } // Perform cum op on array of even size. size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2); // Determine how many elements can be processed in one block. size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2; // Determine how many blocks are needed in total. size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op. size_t dimGridY; // Nb. of independent cum ops (width). size_t dimGridZ; // Nb. of independent cum ops (height). 
ssize_t tmp; switch (axis) { case 0: dimGridY = shape[1]; dimGridZ = shape[2]; break; case 1: dimGridY = shape[0]; dimGridZ = shape[2]; tmp = inputStrides_x; inputStrides_x = inputStrides_y; inputStrides_y = tmp; tmp = outputStrides_x; outputStrides_x = outputStrides_y; outputStrides_y = tmp; break; case 2: dimGridY = shape[1]; dimGridZ = shape[0]; tmp = inputStrides_x; inputStrides_x = inputStrides_z; inputStrides_z = tmp; tmp = outputStrides_x; outputStrides_x = outputStrides_z; outputStrides_z = tmp; break; default: PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis"); return -1; } const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ }; PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode, GA_C_ORDER, input->context, Py_None); if (deviceBlockSum == NULL){ return -1; } // Perform `maxGridY`*`maxGridZ` cum ops in parallel. for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){ size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY); for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){ size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ); size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block. size_t sharedBytes = (2*dimBlockX) * sizeof(float); int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed"); return -1; } if (dimGridX > 1) { // Do a cum op over the blockSum (recursive). if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){ Py_DECREF(deviceBlockSum); return -1; } // Since there are more than one block (i.e. `dimGridX > 1`) // report partial cum ops of previous blocks to subsequents ones. size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {dimBlockX, 1, 1}; int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed"); return -1; } } // If shape[axis] is odd, the last element is compute manually if (shape[axis] != nbElementsPerCumOp){ size_t dimGrid[3] = {1, localDimGridY, localDimGridZ}; size_t dimBlock[3] = {1, 1, 1}; int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1); if (err != GA_NO_ERROR){ PyErr_SetString(PyExc_RuntimeError, "cumadd call failed"); return -1; } } } } Py_XDECREF(deviceBlockSum); return 0; } """ % locals()) return super().c_support_code_struct(node, nodename) + code
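# Editor's sketch: the kernels above implement a blocked parallel scan. Each
# block is scanned on its own (k_blockCumOp), the per-block totals are scanned
# recursively, and the scanned totals are added back to the blocks
# (k_finalCumOp).  The hypothetical helper below shows the same three phases
# for a 1-D "add" cum op in plain NumPy; the block size is arbitrary.
def _blocked_cumsum_sketch(x, block=4):
    import numpy as np

    x = np.asarray(x, dtype="float32")
    out = np.empty_like(x)
    block_sums = []
    # Phase 1: scan each block independently and record its total.
    for start in range(0, x.shape[0], block):
        seg = np.cumsum(x[start : start + block])
        out[start : start + seg.shape[0]] = seg
        block_sums.append(seg[-1])
    # Phase 2: an exclusive scan of the block totals gives each block's offset.
    offsets = np.concatenate([[0.0], np.cumsum(block_sums)[:-1]])
    # Phase 3: add each block's offset to all of its elements.
    for k, start in enumerate(range(0, x.shape[0], block)):
        out[start : start + block] += offsets[k]
    return out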