Example #1
class GpuMagmaMatrixInverse(GpuMagmaBase):
    """Computes the inverse of a matrix :math:`A` using magma library."""

    __props__ = ("inplace", )
    check_input = False
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        ExternalCOp.__init__(self, ["c_code/magma_inv.c"],
                             "APPLY_SPECIFIC(magma_inv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        return shapes
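
A note on the pattern above: ``get_params`` fills the ``inplace`` field from the Op instance and the ``context`` field from the explicit keyword, which is the general contract of ``ParamsType``: every keyword becomes a named field, filled either from an attribute of the object passed to ``get_params`` or from a keyword override, and the resulting params object exposes those fields as attributes. A minimal sketch of that mechanism, with import paths assumed for recent Aesara releases:

from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t, int32 as int_t

fields = ParamsType(inplace=bool_t, ndim=int_t)

class Holder:
    # Any object whose attribute names match the declared fields will do.
    inplace = True
    ndim = 2

params = fields.get_params(Holder())   # reads Holder.inplace and Holder.ndim
print(params.inplace, params.ndim)     # fields are exposed as attributes
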
Example #2
class CGemv(BaseBLAS, Gemv):
    params_type = ParamsType(
        inplace=bool_t,
    )

    def __init__(self, inplace):
        super().__init__(inplace)

    def c_code(self, node, name, inp, out, sub):
        y, alpha, A, x, beta = inp
        (z,) = out
        code = gemv_c_code(
            y,
            A,
            x,
            z,
            alpha,
            beta,
            fail=sub["fail"],
            force_init_beta=check_force_gemv_init(),
            params=sub["params"],
        )
        return code

    def c_code_cache_version(self):
        return (14, blas_header_version(), check_force_gemv_init())
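
For reference, the contract that ``gemv_c_code`` implements here is the standard BLAS GEMV one, ``z = alpha * dot(A, x) + beta * y``. A plain NumPy check of the expected result:

import numpy as np

A = np.array([[1.0, 2.0], [3.0, 4.0]])
x = np.array([1.0, 1.0])
y = np.array([10.0, 20.0])
alpha, beta = 2.0, 0.5

z = alpha * A.dot(x) + beta * y   # expected GEMV result: [11., 24.]
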
Example #3
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """Computes the qr decomposition of a matrix :math:`A` using magma
    library.

    Parameters
    ----------

        complete : If False, returns only ``R``.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`aesara.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.
    """

    __props__ = ("complete", )
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        ExternalCOp.__init__(self, ["c_code/magma_qr.c"],
                             "APPLY_SPECIFIC(magma_qr)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.complete:
            return Apply(
                self,
                [A],
                # return R, Q
                [A.type(), A.type()],
            )
        else:
            return Apply(
                self,
                [A],
                # return R
                [A.type()],
            )

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)
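
A rough usage sketch for this Op, going through the :func:`aesara.gpuarray.linalg.gpu_qr` wrapper mentioned in the warning so the outputs come back in ``Q, R`` order (this assumes a working pygpu/MAGMA setup and that the wrapper accepts a ``complete`` keyword; adjust to your installed version):

import aesara
import aesara.tensor as aet
from aesara.gpuarray.linalg import gpu_qr

A = aet.matrix("A", dtype="float32")
Q, R = gpu_qr(A, complete=True)        # the wrapper restores the (Q, R) order
f = aesara.function([A], [Q, R])
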
Example #4
class CGer(BaseBLAS, Ger):
    params_type = ParamsType(
        destructive=bool_t,
    )

    def c_code(self, node, name, inp, out, sub):
        A, a, x, y = inp
        (Z,) = out
        code = ger_c_code(A, a, x, y, Z, fail=sub["fail"], params=sub["params"])
        return code

    def c_code_cache_version(self):
        return (11, blas_header_version())
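
The rank-1 update behind ``ger_c_code`` follows the standard BLAS GER contract, ``Z = A + a * outer(x, y)``. In NumPy terms:

import numpy as np

A = np.zeros((2, 2))
x = np.array([1.0, 2.0])
y = np.array([3.0, 4.0])
a = 0.5

Z = A + a * np.outer(x, y)   # expected GER result: [[1.5, 2.0], [3.0, 4.0]]
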
Example #5
 def test_hash_and_eq_params_type(self):
     w1 = ParamsType(
         a1=TensorType("int64", (False, False)),
         a2=TensorType("int64", (False, True, False, False, True)),
         a3=Generic(),
     )
     w2 = ParamsType(
         a1=TensorType("int64", (False, False)),
         a2=TensorType("int64", (False, True, False, False, True)),
         a3=Generic(),
     )
     assert w1 == w2
     assert not (w1 != w2)
     assert hash(w1) == hash(w2)
     assert w1.name == w2.name
     # Changing attributes names only.
     w2 = ParamsType(
         a1=TensorType("int64", (False, False)),
         other_name=TensorType(
             "int64",
             (False, True, False, False, True)),  # a2 -> other_name
         a3=Generic(),
     )
     assert w1 != w2
     # Changing attributes types only.
     w2 = ParamsType(
         a1=TensorType("int64", (False, False)),
         a2=Generic(),  # changing class
         a3=Generic(),
     )
     assert w1 != w2
     # Changing attributes types characteristics only.
     w2 = ParamsType(
         a1=TensorType("int64", (False, True)),  # changing broadcasting
         a2=TensorType("int64", (False, True, False, False, True)),
         a3=Generic(),
     )
     assert w1 != w2
Example #6
class GpuEye(CGpuKernelBase):
    """Eye for GPU.

    This is an implementation to test that `CGpuKernelBase` works and also
    to use as an example in the docs.  It is not used for user graphs.

    """

    __props__ = ("dtype", "context_name")
    params_type = ParamsType(typecode=int_t, context=gpu_context_type)

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name
        super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")

    def get_params(self, node):
        pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")

        return self.params_type.get_params(
            typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
            context=get_context(self.context_name),
        )

    def c_headers(self, **kwargs):
        return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"]

    def make_node(self, n, m):
        n = aet.as_tensor_variable(n)
        m = aet.as_tensor_variable(m)
        assert n.ndim == 0
        assert m.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )

        return Apply(self, [n, m], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        return [grad_undefined(self, i, inp[i]) for i in range(2)]
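
A hedged usage sketch for this test Op (it needs a configured pygpu context; the two scalar inputs give the matrix dimensions):

import aesara
import aesara.tensor as aet

n = aet.iscalar("n")
m = aet.iscalar("m")
eye = GpuEye(dtype="float32")(n, m)
f = aesara.function([n, m], eye)
# f(3, 4) should return a 3 x 4 array with ones on the main diagonal.
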
Example #7
class GpuSparseBlockOuter(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice.  It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ("inplace", )
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        ctx = infer_context_name(o, x, y)
        one = aet.constant(np.asarray(1.0, dtype="float32"))
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        return [input_shapes[0]]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]
Example #8
class mrg_uniform_base(Op):
    # TODO : need description for class, parameter
    __props__ = ("output_type", "inplace")
    params_type = ParamsType(
        inplace=bool_t,
        # The following params come from self.output_type.
        # NB: The output object may not be allocated yet in the C code,
        # so we cannot rely on reading these properties from the output;
        # instead, we pass them as params taken from self.output_type.
        ndim=int_t,
        otypenum=int_t,
        otype_is_float32=bool_t,
    )

    def __init__(self, output_type, inplace=False):
        Op.__init__(self)
        self.output_type = output_type
        self.inplace = inplace
        if inplace:
            self.destroy_map = {0: [0]}
        self.warned_numpy_version = False

    # These attributes (used as params) are created as properties
    # to make them available even for old pickled objects, e.g.
    # when testing old interface or when using FAST_COMPILE mode.
    ndim = property(lambda self: self.output_type.ndim)
    otypenum = property(lambda self: np.dtype(self.output_type.dtype).num)
    otype_is_float32 = property(
        lambda self: self.output_type.dtype == "float32")

    def __str__(self):
        if self.inplace:
            s = "inplace"
        else:
            s = "no_inplace"
        return self.__class__.__name__ + f"{{{self.output_type},{s}}}"

    def grad(self, inputs, ograd):
        return [
            gradient.grad_undefined(
                self, k, inp, "No gradient defined through "
                "random sampling op") for k, inp in enumerate(inputs)
        ]

    def R_op(self, inputs, eval_points):
        return [None for i in eval_points]
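
The ``ndim``/``otypenum``/``otype_is_float32`` properties above are what let ``self.params_type.get_params(self, ...)`` find those fields by name even on instances unpickled from before the params existed. A plain-Python mirror of the idea (the ``Legacy`` class is purely illustrative):

class Legacy:
    """Stands in for an old pickled Op that only stored ``output_dtype``."""

    def __init__(self, output_dtype):
        self.output_dtype = output_dtype

    # Derived on the fly, so it is available even though it was never pickled.
    otype_is_float32 = property(lambda self: self.output_dtype == "float32")

assert Legacy("float32").otype_is_float32 is True
assert Legacy("float64").otype_is_float32 is False
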
Example #9
    def test_params_type_with_enums(self):
        # Test that we fail if we create a params type with common enum names inside different enum types.
        try:
            ParamsType(enum1=EnumList("A", "B", "C"),
                       enum2=EnumList("A", "B", "F"))
        except AttributeError:
            pass
        else:
            raise Exception(
                "ParamsType should fail with common enum names inside different enum types."
            )

        # Test that we fail if we create a params type with common names in both aliases and constants.
        try:
            ParamsType(
                enum1=EnumList(("A", "a"), ("B", "b")),
                enum2=EnumList(("ONE", "a"), ("TWO", "two")),
            )
        except AttributeError:
            ParamsType(
                enum1=EnumList(("A", "a"), ("B", "b")),
                enum2=EnumList(("ONE", "one"), ("TWO", "two")),
            )
        else:
            raise Exception(
                "ParamsType should fail when there are aliases with same names as some constants."
            )

        # Test that we can access enum values through wrapper directly.
        w = ParamsType(
            enum1=EnumList("A", ("B", "beta"), "C"),
            enum2=EnumList(("D", "delta"), "E", "F"),
        )
        assert w.A == 0 and w.B == 1 and w.C == 2
        assert w.D == 0 and w.E == 1 and w.F == 2
        # Test constants access through aliases.
        assert w.enum_from_alias("beta") == w.B
        assert w.enum_from_alias("delta") == w.D
        assert (w.enum_from_alias("C") == w.C
                )  # C is not an alias, so it should return a constant named C.
        # Test that other regular wrapper attributes are still available.
        assert len(w.fields) == len(w.types) == w.length
        assert w.name
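
This attribute-style access is exactly what Op implementations rely on: constants declared through ``EnumList`` become attributes of the ``ParamsType`` on the Python side and macros on the C side (see the ``MODE_ADD``/``MODE_MUL`` comparison in Example #19 below). A condensed sketch, with import paths assumed for recent Aesara releases:

from aesara.link.c.params_type import ParamsType
from aesara.link.c.type import EnumList

w = ParamsType(mode=EnumList(("MODE_ADD", "add"), ("MODE_MUL", "mul")))
assert w.MODE_ADD == 0 and w.MODE_MUL == 1
assert w.enum_from_alias("add") == w.MODE_ADD   # aliases map back to constants
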
Example #10
class QuadraticCOpFunc(ExternalCOp):
    __props__ = ("a", "b", "c")
    params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type)

    def __init__(self, a, b, c):
        super().__init__("c_code/test_quadratic_function.c",
                         "APPLY_SPECIFIC(compute_quadratic)")
        self.a = a
        self.b = b
        self.c = c

    def make_node(self, x):
        x = at.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage, coefficients):
        x = inputs[0]
        y = output_storage[0]
        y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c
Example #11
class Reshape(COp):
    """Perform a reshape operation of the input x to the new shape shp.
    The number of dimensions to reshape to (ndim) must be
    known at graph build time.
    """

    view_map = {0: [0]}  # output 0 is potentially aliased to inputs [0]
    _f16_ok = True

    check_input = False
    __props__ = ("ndim", )
    params_type = ParamsType(ndim=int32)

    # name does not participate because it doesn't affect computations

    def __init__(self, ndim, name=None):
        self.ndim = int(ndim)
        if ndim < 0:
            raise ValueError(
                "The output dimensions after reshape must be 0 or greater")
        assert name is None, "name attribute for Reshape has been deprecated"

    def __str__(self):
        return f"{self.__class__.__name__}{{{self.ndim}}}"

    def make_node(self, x, shp):
        x = aet.as_tensor_variable(x)
        shp_orig = shp
        shp = aet.as_tensor_variable(shp, ndim=1)
        if not (shp.dtype in int_dtypes or
                (isinstance(shp, TensorConstant) and shp.data.size == 0)):
            # It raises an error if shp is not of integer type,
            # except when shp is constant and empty
            # (in this case, shp.dtype does not matter anymore).
            raise TypeError("Shape must be integers", shp, shp.dtype)
        assert shp.ndim == 1
        if isinstance(shp, TensorConstant):
            bcast = [s == 1 for s in shp.data]
            return Apply(self, [x, shp], [tensor(x.type.dtype, bcast)])
        else:
            bcasts = [False] * self.ndim
            shp_list = shp_orig
            if hasattr(shp_orig, "ndim") and shp_orig.ndim == 0:
                shp_list = [shp_orig]
            for index in range(self.ndim):
                y = shp_list[index]
                y = aet.as_tensor_variable(y)
                # Try to see if we can infer that y has a constant value of 1.
                # If so, that dimension should be broadcastable.
                try:
                    bcasts[index] = (hasattr(y, "get_scalar_constant_value")
                                     and y.get_scalar_constant_value() == 1)
                except NotScalarConstantError:
                    pass
            return Apply(self, [x, shp], [tensor(x.type.dtype, bcasts)])

    def perform(self, node, inp, out_, params):
        x, shp = inp
        (out, ) = out_
        if len(shp) != self.ndim:
            raise ValueError(
                ("shape argument to Reshape.perform has incorrect"
                 f" length {len(shp)}"
                 f", should be {self.ndim}"),
                shp,
            )
        try:
            out[0] = np.reshape(x, shp)
        except Exception:
            raise ValueError(
                f"Cannot reshape input of shape {x.shape} to shape {shp}")

    def connection_pattern(self, node):
        return [[True], [False]]

    def grad(self, inp, grads):
        x, shp = inp
        (g_out, ) = grads
        return [reshape(g_out, shape(x), ndim=x.ndim), DisconnectedType()()]

    def R_op(self, inputs, eval_points):
        if eval_points[0] is None:
            return [None]
        return self(eval_points[0], *inputs[1:], **dict(return_list=True))

    def infer_shape(self, fgraph, node, ishapes):
        from aesara.tensor.math import eq, maximum, mul

        # inputs[1] can contain at most one value of '-1', meaning the actual
        # shape of the output will be automatically computed by reshape, so
        # that the total number of elements stays the same.
        # TODO: Maybe put that formula here?
        # It's not trivial, because we would have to check if the product of
        # all the non-minus-one shapes is a divisor of the product of the
        # original shapes.
        # The following expression leads to cycles in feature_shape,
        # because it tries to replace the Shape_i node by the switch
        # statement, which depends on Shape_i.
        # return [tuple([switch(eq(node.inputs[1][i], -1),
        #                      Shape_i(i)(node.outputs[0]),
        #                      node.inputs[1][i])
        #                    for i in range(self.ndim)]
        #    )]
        # Here, we only simplify if the shape (node.inputs[1]) is a constant,
        # ideally it would suffice to check that it is always non-negative.
        # If current variable is a scalar and its dimensionality should
        # change to self.ndim, then use size 1 for all new dimensions.
        if len(ishapes[0]) == 0:
            return [(1, ) * self.ndim]

        requ = node.inputs[1]
        input_size = mul(*ishapes[0])
        if isinstance(requ, TensorConstant):
            requ = list(requ.data)
            requ_part = [ele for ele in requ if ele != -1]
            crit = len(requ) - len(requ_part)
            if crit == 1 and len(requ_part) > 0:
                # If there are both 0 and -1 in requ_size, it is impossible
                # to determine a right output, but we can at least prevent
                # a division by 0. We do not want to keep a negative
                # size here as it could lead to further weird errors
                # after other optimizations.
                requ_size = mul(*requ_part)
                missing = input_size // (1 if requ_size == 0 else requ_size)
                for i, ele in enumerate(requ):
                    if ele == -1:
                        requ[i] = missing
            elif crit == 1:  # we reshape to -1
                requ = [input_size] if ishapes[0] else [1]
            elif crit > 1:
                raise ValueError("shape argument to Reshape.perform"
                                 " must have at most one entry equal to -1")
            return [requ]
        else:

            requ = [requ[i] for i in range(self.ndim)]
            # since new_dims can have negative value (-1), the
            # multiplication of all values should be negated
            # to give a positive value.
            # To avoid optimization complexity, we avoid checking
            # for the case when there are two or more '-1' values.
            if self.ndim:
                requ_size = -mul(*requ)
                # If there are both 0 and -1 in requ_size, it is impossible
                # to determine a right output, but we can at least prevent
                # a division by 0. We do not want to keep a negative
                # size here as it could lead to further weird errors
                # after other optimizations.
                rest_size = input_size // maximum(requ_size, 1)
            return [
                tuple([
                    aet.switch(eq(requ[i], -1), rest_size, requ[i])
                    for i in range(self.ndim)
                ])
            ]

    def c_code_cache_version(self):
        return (8, )

    def c_code(self, node, name, inputs, outputs, sub):
        if isinstance(node.inputs[0], TensorVariable):
            x, shp = inputs
            (z, ) = outputs
            sdtype = node.inputs[1].type.dtype_specs()[1]
            fail = sub["fail"]
            params = sub["params"]
            return ("""
            assert (PyArray_NDIM(%(shp)s) == 1);
            npy_intp new_dims[%(params)s->ndim];
            PyArray_Dims newshape;
            newshape.ptr = new_dims;
            newshape.len = %(params)s->ndim;
            for (int ii = 0; ii < %(params)s->ndim; ++ii)
            {
                // -- We do not want an explicit cast here. The shp can be any
                // -- int* dtype. The compiler will implicitly upcast it, but
                // -- will err if this would downcast. This could happen if the
                // -- user passes an int64 dtype, but npy_intp ends up being int32.
                new_dims[ii] = ((%(sdtype)s*)(
                        PyArray_BYTES(%(shp)s) +
                        ii * PyArray_STRIDES(%(shp)s)[0]))[0];
            }
            Py_XDECREF(%(z)s);
            %(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape, NPY_CORDER);
            if (!%(z)s)
            {
                //The error message should have been set by PyArray_Newshape
                %(fail)s;
            }
            """ % locals())
        else:
            raise NotImplementedError()
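
To make the ``-1`` handling in ``Reshape.infer_shape`` concrete, here is the constant-shape branch traced by hand for an input of shape ``(6, 4)`` reshaped to ``(3, -1, 2)``:

input_size = 6 * 4                        # product of the input shape = 24
requ = [3, -1, 2]                         # requested shape, one entry is -1
requ_part = [e for e in requ if e != -1]  # [3, 2]
requ_size = 3 * 2                         # 6
missing = input_size // requ_size         # 24 // 6 = 4
# so the inferred output shape is (3, 4, 2)
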
Example #12
 def test_hash_and_eq_params(self):
     wp1 = ParamsType(
         a=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     wp2 = ParamsType(
         a=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     w1 = Params(
         wp1,
         a=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     w2 = Params(
         wp2,
         a=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 == w2
     assert not (w1 != w2)
     assert hash(w1) == hash(w2)
     # Changing attributes names only (a -> other_name).
     wp2_other = ParamsType(
         other_name=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     w2 = Params(
         wp2_other,
         other_name=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
     # Changing attributes values only (now a=2).
     w2 = Params(
         wp2,
         a=2,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
     # Changing NumPy array values (5 -> -5).
     w2 = Params(
         wp2,
         a=1,
         array=np.asarray([1, 2, 4, -5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
Example #13
class QuadraticOpFunc(COp):
    __props__ = ("a", "b", "c")
    params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type)

    def __init__(self, a, b, c):
        self.a = a
        self.b = b
        self.c = c

    def make_node(self, x):
        x = at.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage, coefficients):
        x = inputs[0]
        y = output_storage[0]
        y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c

    def c_code_cache_version(self):
        return (1, 5)

    def c_support_code_apply(self, node, name):
        float_type = node.inputs[0].type.dtype_specs()[1]
        return """
        /* Computes: x = a*x*x + b*x + c for x in tensor. */
        int quadratic_%(name)s(PyArrayObject* tensor, %(float_type)s a, %(float_type)s b, %(float_type)s c) {
            NpyIter* iterator = NpyIter_New(tensor,
                NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
                NPY_KEEPORDER, NPY_NO_CASTING, NULL);
            if(iterator == NULL) {
                PyErr_SetString(PyExc_RuntimeError, "Unable to iterate over a tensor for an elemwise operation.");
                return -1;
            }
            NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
            char** data_ptr = NpyIter_GetDataPtrArray(iterator);
            npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
            npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
            do {
                char* data = *data_ptr;
                npy_intp stride = *stride_ptr;
                npy_intp count = *innersize_ptr;
                while(count) {
                    %(float_type)s x = *((%(float_type)s*)data);
                    *((%(float_type)s*)data) = a*x*x + b*x + c;
                    data += stride;
                    --count;
                }
            } while(get_next(iterator));
            NpyIter_Deallocate(iterator);
            return 0;
        }
        """ % {
            "name": name,
            "float_type": float_type,
        }

    def c_code(self, node, name, inputs, outputs, sub):
        return """
        %(float_type)s a = (%(float_type)s) (*(npy_float64*) PyArray_GETPTR1(%(coeff)s->a, 0)); // 0-D TensorType.
        %(float_type)s b =                                                   %(coeff)s->b;      // Scalar.
        %(float_type)s c =                 (%(float_type)s) PyFloat_AsDouble(%(coeff)s->c);     // Generic.
        Py_XDECREF(%(Y)s);
        %(Y)s = (PyArrayObject*)PyArray_EMPTY(PyArray_NDIM(%(X)s), PyArray_DIMS(%(X)s), PyArray_TYPE(%(X)s), PyArray_IS_F_CONTIGUOUS(%(X)s));
        if (PyArray_CopyInto(%(Y)s, %(X)s) != 0) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to copy input into output.");
            %(fail)s
        };
        if (quadratic_%(name)s(%(Y)s, a, b, c) != 0) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to compute quadratic function.");
            %(fail)s
        }
        """ % dict(
            name=name,
            coeff=sub["params"],
            fail=sub["fail"],
            X=inputs[0],
            Y=outputs[0],
            float_type=node.inputs[0].type.c_element_type(),
        )
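
A rough end-to-end sketch of applying this Op (``tensor_type_0d``, ``scalar_type`` and ``generic_type`` are assumed to be defined in the surrounding test module as in the class body above; the coefficient values are only illustrative):

import numpy as np
import aesara
import aesara.tensor as at

x = at.dvector("x")
quad = QuadraticOpFunc(a=np.asarray(2.0), b=3.0, c=4.0)
f = aesara.function([x], quad(x))
# f([0.0, 1.0, 2.0]) should give 2*x**2 + 3*x + 4, i.e. [4., 9., 18.]
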
Example #14
class GpuMagmaSVD(GpuMagmaBase):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`aesara.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """

    __props__ = ("full_matrices", "compute_uv")
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        ExternalCOp.__init__(self, ["c_code/magma_svd.c"],
                             "APPLY_SPECIFIC(magma_svd)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return only S
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)()
                ],
            )

    def prepare_node(self, node, storage_map, compute_map, impl):
        super().prepare_node(node, storage_map, compute_map, impl)
        # Check the node to prevent possible errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, (
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) "
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
            )

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        (x_shape, ) = shapes
        M, N = x_shape
        K = tm.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
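
Tracing ``infer_shape`` above for a concrete case, a ``5 x 3`` input with ``full_matrices=False`` and ``compute_uv=True``:

M, N = 5, 3
K = min(M, N)                  # 3
s_shape = (K,)                 # (3,)   singular values
u_shape = (M, K)               # (5, 3) because full_matrices is False
vt_shape = (K, N)              # (3, 3)
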
Example #15
class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
    library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of the matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True` (default), computes both eigenvalues and eigenvectors.
                If `False`, computes only the eigenvalues of the matrix.
    """

    __props__ = ("lower", "compute_v")
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t,
                             compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO="L", compute_v=True):
        assert UPLO in ["L", "U"]
        self.lower = UPLO == "L"
        self.compute_v = compute_v
        ExternalCOp.__init__(self, ["c_code/magma_eigh.c"],
                             "APPLY_SPECIFIC(magma_eigh)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return Apply(
                self,
                [A],
                # return D, V
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return D
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)()
                ],
            )

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)
Example #16
    def test_params_type_filtering(self):
        shape_tensor5 = (1, 2, 2, 3, 2)
        size_tensor5 = (shape_tensor5[0] * shape_tensor5[1] *
                        shape_tensor5[2] * shape_tensor5[3] * shape_tensor5[4])
        random_tensor = np.random.normal(
            size=size_tensor5).reshape(shape_tensor5)

        w = ParamsType(
            a1=TensorType("int32", (False, False)),
            a2=TensorType("float64", (False, False, False, False, False)),
            a3=Generic(),
        )

        # With a value that does not match the params type.
        o = Params(
            w,
            a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11,
                                                12]]).astype("int64"),
            a2=random_tensor.astype("float32"),
            a3=2000,
        )
        # should fail (o.a1 is not int32, o.a2 is not float64)
        with pytest.raises(TypeError):
            w.filter(o, True)
        # should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed)
        with pytest.raises(TypeError):
            w.filter(o, False, False)
        # Should pass.
        w.filter(o, strict=False, allow_downcast=True)

        # With a value that matches the params type.
        o1 = Params(
            w,
            a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11,
                                                12]]).astype("int32"),
            a2=random_tensor.astype("float64"),
            a3=2000,
        )
        # All should pass.
        w.filter(o1, strict=True)
        w.filter(o1, strict=False, allow_downcast=False)
        w.filter(o1, strict=False, allow_downcast=True)

        # Check values_eq and values_eq_approx.
        o2 = Params(
            w,
            a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11,
                                                12]]).astype("int32"),
            a2=random_tensor.astype("float64"),
            a3=2000,
        )
        assert w.values_eq(o1, o2)
        assert w.values_eq_approx(o1, o2)

        # Check values_eq_approx.
        # NB: It is unclear exactly which kinds of differences are rejected by values_eq but accepted by values_eq_approx,
        # so we just perturb the float values a little.
        o3 = Params(
            w,
            a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11,
                                                12]]).astype("int32"),
            a2=(random_tensor.astype("float32") * 10 / 2.2 * 2.19999999999 /
                10).astype("float64"),
            a3=2000.0 - 0.00000000000000001,
        )
        assert w.values_eq_approx(o1, o3)
Example #17
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool_max_rop.c"],
                                "APPLY_SPECIFIC(max_pool_rop)")
        assert mode == "max"
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert eval_point.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, eval_point, ws, stride, pad],
                     [eval_point.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride, pad,
                             self.ndim)
        return [shp]
Example #18
class GpuPool(CGpuKernelBase):
    """
    Implements max and average pooling on the GPU.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t,
                             mode=PoolingMode_t,
                             context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"],
                                "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride, pad,
                             self.ndim)
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad, ) = grads

        grad = gpu_contiguous(grad)

        disc = [aesara.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim,
                                   ignore_border=self.ignore_border)(img, out,
                                                                     grad, ws,
                                                                     stride,
                                                                     pad)
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(ndim=self.ndim,
                                       ignore_border=self.ignore_border,
                                       mode=self.mode)(img, grad, ws, stride,
                                                       pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # Rop for average or sum is simply pooling evaluated at eval point
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points.
        # That means there is no differentiable path through that input.
        # If this implies that you cannot compute some outputs,
        # return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode,
                                           self.ndim)(x, z, eval_points[0], ws,
                                                      stride, pad)
        ]
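
For orientation, the shape computed by ``Pool.out_shape`` in ``infer_shape`` follows the usual pooling arithmetic: with ``ignore_border=True`` each pooled dimension becomes roughly ``(size + pad_total - ws) // stride + 1``. A hand-traced example under that assumption:

# Input of shape (batch, channels, H, W) = (2, 3, 8, 8),
# ws = (2, 2), stride = (2, 2), pad = (0, 0), ignore_border = True
H_out = (8 + 0 - 2) // 2 + 1   # 4
W_out = (8 + 0 - 2) // 2 + 1   # 4
# expected output shape: (2, 3, 4, 4)
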
Example #19
class CumOp(COp):
    # See function cumsum/cumprod for docstring

    __props__ = ("axis", "mode")
    check_input = False
    params_type = ParamsType(c_axis=int_t,
                             mode=EnumList(("MODE_ADD", "add"),
                                           ("MODE_MUL", "mul")))

    def __init__(self, axis=None, mode="add"):
        if mode not in ("add", "mul"):
            raise ValueError(f'{type(self).__name__}: Unknown mode "{mode}"')
        self.axis = axis
        self.mode = mode

    c_axis = property(lambda self: np.MAXDIMS
                      if self.axis is None else self.axis)

    def make_node(self, x):
        x = aet.as_tensor_variable(x)
        out_type = x.type()

        if self.axis is None:
            out_type = vector(dtype=x.dtype)  # Flatten
        elif self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError(f"axis(={self.axis}) out of bounds")

        return Apply(self, [x], [out_type])

    def perform(self, node, inputs, output_storage, params):
        x = inputs[0]
        z = output_storage[0]
        if self.mode == "add":
            z[0] = np.cumsum(x, axis=self.axis)
        else:
            z[0] = np.cumprod(x, axis=self.axis)

    def grad(self, inputs, output_gradients):
        (x, ) = inputs
        (gi, ) = output_gradients

        if self.axis is None:
            if self.mode == "add":
                return [cumsum(gi[::-1])[::-1].reshape(x.shape)]
            elif self.mode == "mul":
                fx = cumprod(x, axis=self.axis)
                return [cumsum((fx * gi)[::-1])[::-1].reshape(x.shape) / x]
            else:
                raise NotImplementedError(
                    f'{type(self).__name__}: unknown gradient for mode "{self.mode}"'
                )

        reverse_slicing = [slice(None, None, None)] * gi.ndim
        reverse_slicing[self.axis] = slice(None, None, -1)
        reverse_slicing = tuple(reverse_slicing)
        # We need to reverse the gradients along ``self.axis``,
        #  compute cumsum, then reverse again
        if self.mode == "add":
            return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
        elif self.mode == "mul":
            fx = cumprod(x, axis=self.axis)
            return [
                cumsum(
                    (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x
            ]
        else:
            raise NotImplementedError(
                f'{type(self).__name__}: unknown gradient for mode "{self.mode}"'
            )

    def infer_shape(self, fgraph, node, shapes):
        if self.axis is None:
            return [(prod(shapes[0]), )]  # Flatten

        return shapes

    def c_code(self, node, name, inames, onames, sub):
        (x, ) = inames
        (z, ) = onames
        axis = self.axis
        fail = sub["fail"]
        params = sub["params"]

        code = ("""
                int axis = %(params)s->c_axis;
                if (axis == 0 && PyArray_NDIM(%(x)s) == 1)
                    axis = NPY_MAXDIMS;
                npy_intp shape[1] = { PyArray_SIZE(%(x)s) };
                if(axis == NPY_MAXDIMS && !(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0]))
                {
                    Py_XDECREF(%(z)s);
                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s));
                }

                else if(axis != NPY_MAXDIMS && !(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s))))
                {
                    Py_XDECREF(%(z)s);
                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE(%(x)s));
                }

                if (!%(z)s)
                    %(fail)s;
                {

                    PyObject * t = NULL;
                    if(%(params)s->mode == MODE_ADD)
                        t = PyArray_CumSum(
                            %(x)s, axis,
                            PyArray_TYPE(%(x)s), %(z)s);
                    else if(%(params)s->mode == MODE_MUL)
                        t = PyArray_CumProd(
                            %(x)s, axis,
                            PyArray_TYPE(%(x)s), %(z)s);

                    if (!t){
                       %(fail)s;
                    }
                    // Because PyArray_CumSum/CumProd returns a newly created reference on t.
                    Py_XDECREF(t);
                }
            """ % locals())

        return code

    def c_code_cache_version(self):
        return (8, )

    def __str__(self):
        return f"{self.__class__.__name__}{{{self.axis}, {self.mode}}}"
Example #20
 def params_type(self):
     return ParamsType(i=aesara.scalar.basic.int64)
Example #21
class BaseCorrMM(OpenMPOp, _NoPythonOp):
    """
    Base class for `CorrMM`, `CorrMM_gradWeights` and
    `CorrMM_gradInputs`. Cannot be used directly.

    Every subclass must define the internal attribute ``_direction`` outside of ``__init__()``.
    ``_direction`` must take one of the following values:

     - "forward" to correlate bottom with weights and store results in top.
     - "backprop weights" to do a valid convolution of bottom with top
       (swapping the first two dimensions) and store results in weights.
     - "backprop inputs" to do a full convolution of top with weights
       (swapping the first two dimensions) and store results in bottom.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer,
        a pair of integers, or two pairs of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1,1))
    num_groups
        Perform grouped convolutions (default: 1)
    unshared
        Perform unshared correlation (default: False)
    """

    check_broadcast = False
    __props__ = (
        "border_mode",
        "subsample",
        "filter_dilation",
        "num_groups",
        "unshared",
    )

    _direction: Optional[str] = None

    params_type = ParamsType(
        direction=EnumList(
            ("DIRECTION_FORWARD", "forward"),  # 0
            ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"),  # 1
            ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"),
        ),  # 2
        dH=int64,
        dW=int64,
        dilH=int64,
        dilW=int64,
        padH_l=int64,
        padH_r=int64,
        padW_l=int64,
        padW_r=int64,
        num_groups=int64,
        unshared=int8,
    )

    def __init__(
            self,
            border_mode="valid",
            subsample=(1, 1),
            filter_dilation=(1, 1),
            num_groups=1,
            unshared=False,
            openmp=None,
    ):
        super().__init__(openmp=openmp)
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError("invalid border_mode {}, which must be a "
                                 "non-negative integer".format(border_mode))
            border_mode = ((border_mode, border_mode), ) * 2
        elif isinstance(border_mode, tuple):
            if len(border_mode) != 2:
                raise ValueError("invalid border_mode {} which must be a "
                                 "tuple of length 2".format(border_mode))
            border = ()
            for mode in border_mode:
                if isinstance(mode,
                              tuple) and len(mode) == 2 and min(mode) >= 0:
                    border += ((int(mode[0]), int(mode[1])), )
                elif mode >= 0:
                    border += ((int(mode), int(mode)), )
                else:
                    raise ValueError(
                        "invalid border mode {}. The tuple can only contain "
                        "integers or tuples of length 2".format(border_mode))
            border_mode = border
        elif border_mode not in ("valid", "full", "half"):
            raise ValueError(
                "invalid border_mode {}, which must be either "
                '"valid", "full", "half", an integer or a tuple '
                "of two integers or a pair of integers".format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        self.unshared = unshared

        if not config.blas__ldflags:
            # Aesara will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ""
        else:
            if "openblas" in config.blas__ldflags:
                self.blas_type = "openblas"
            elif "mkl" in config.blas__ldflags:
                self.blas_type = "mkl"
            else:
                self.blas_type = ""

        if self._direction not in [
                "forward", "backprop weights", "backprop inputs"
        ]:
            raise ValueError("_direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups

    @property
    def pad(self):
        if self.border_mode == "half":
            return ((-1, -1), ) * 2
        elif self.border_mode == "full":
            return ((-2, -2), ) * 2
        elif isinstance(self.border_mode, tuple):
            return self.border_mode
        else:
            assert self.border_mode == "valid"
            return ((0, 0), ) * 2

    # Direction should be converted to the real enum value,
    # as it is compared to an integer later in c_code_helper().
    direction = property(
        lambda self: self.params_type.enum_from_alias(self._direction))

    dH = property(lambda self: self.subsample[0])
    dW = property(lambda self: self.subsample[1])

    dilH = property(lambda self: self.filter_dilation[0])
    dilW = property(lambda self: self.filter_dilation[1])

    padH_l = property(lambda self: self.pad[0][0])
    padH_r = property(lambda self: self.pad[0][1])
    padW_l = property(lambda self: self.pad[1][0])
    padW_r = property(lambda self: self.pad[1][1])

    def __str__(self):
        return "{}{{{}, {}, {}, {} {}}}".format(
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
            str(self.num_groups),
            str(self.unshared),
        )

    @staticmethod
    def as_common_dtype(in1, in2):
        """
        Upcast input variables if necessary.
        """
        dtype = aesara.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1

    def c_support_code(self, **kwargs):
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == "openblas":
            ccodes += blas_headers.openblas_threads_text()
        elif self.blas_type == "mkl":
            ccodes += blas_headers.mkl_threads_text()
        return ccodes

    def c_libraries(self, **kwargs):
        return ldflags()

    def c_compile_args(self, **kwargs):
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super().c_compile_args(**kwargs)
        return compile_args

    def c_lib_dirs(self, **kwargs):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self, **kwargs):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self, **kwargs):
        headers = ["<stdio.h>"]
        headers += super().c_headers(**kwargs)
        return headers

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (10, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.__dict__["inputs"][0].dtype)
        assert dtype in ("float32", "float64")
        if dtype == "float32":
            sub["gemm"] = "sgemm_"
            sub["gemv"] = "sgemv_"
            sub["float_type"] = "npy_float"
            sub["float_typenum"] = "NPY_FLOAT"
            sub["n_bytes"] = 4
            sub["c_float_type"] = "float"
        else:
            sub["gemm"] = "dgemm_"
            sub["gemv"] = "dgemv_"
            sub["float_type"] = "npy_double"
            sub["float_typenum"] = "NPY_DOUBLE"
            sub["n_bytes"] = 8
            sub["c_float_type"] = "double"

        if self.openmp:
            sub["omp_flags"] = "#pragma omp parallel for schedule(static)"
            sub["omp_get_max_threads"] = "omp_get_max_threads()"
            sub["omp_get_thread_num"] = "omp_get_thread_num()"

            if self.blas_type == "openblas":
                sub["blas_set_num_threads"] = "openblas_set_num_threads"
                sub["blas_get_num_threads"] = "openblas_get_num_threads()"
            elif self.blas_type == "mkl":
                sub["blas_set_num_threads"] = "mkl_set_num_threads"
                sub["blas_get_num_threads"] = "mkl_get_max_threads()"
            else:
                sub["blas_set_num_threads"] = ""
                sub["blas_get_num_threads"] = "0"
        else:
            sub["omp_flags"] = ""
            sub["omp_get_max_threads"] = "1"
            sub["omp_get_thread_num"] = "0"
            sub["blas_set_num_threads"] = ""
            sub["blas_get_num_threads"] = "0"

        final_code = ""
        with open(
                os.path.join(
                    os.path.split(__file__)[0],
                    os.path.join("c_code", "corr_gemm.c"))) as f:
            code = f.read()
            final_code += code
        return final_code % sub

    def c_code_helper(self,
                      bottom,
                      weights,
                      top,
                      sub,
                      height=None,
                      width=None):
        """
        This generates the C code for CorrMM (direction="forward"),
        CorrMM_gradWeights (direction="backprop weights"), and
        CorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param sub: Dictionary of substitutions usable to help generate the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".  Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".  Ignored otherwise.
        """

        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if height:
            height = f"(*(npy_int64 *)(PyArray_DATA({height})))"
        else:
            if ((self.direction != 0) and
                (self.dH != 1)) or ((self.direction == 1) and
                                    (self.padH_l == -1 or self.padH_r == -1)):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or border_mode='half'"
                )
            height = "-1"
        if width:
            width = f"(*(npy_int64 *)(PyArray_DATA({width})))"
        else:
            if ((self.direction != 0) and
                (self.dW != 1)) or ((self.direction == 1) and
                                    (self.padW_l == -1 or self.padW_r == -1)):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or border_mode='half'"
                )
            width = "-1"

        return """
    // Mandatory args
    int direction = %(params)s->direction;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = %(params)s->dH;
    int dW = %(params)s->dW;
    int dilH = %(params)s->dilH;
    int dilW = %(params)s->dilW;
    int padH_l = %(params)s->padH_l;
    int padH_r = %(params)s->padH_r;
    int padW_l = %(params)s->padW_l;
    int padW_r = %(params)s->padW_r;
    int numgroups = %(params)s->num_groups;
    int unshared = %(params)s->unshared;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
    PyArrayObject * top = %(top)s;
    PyArrayObject * out2 = NULL;
    PyArrayObject **out = NULL;

    switch(%(params)s->direction) {
        case DIRECTION_FORWARD:
            out = &%(top)s;
            break;
        case DIRECTION_BACKPROP_WEIGHTS:
            out = &%(weights)s;
            break;
        case DIRECTION_BACKPROP_INPUTS:
            out = &%(bottom)s;
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "CPU CorrMM: Invalid direction.");
            {%(fail)s}
            break;
    }

    int wdim, odim;
    wdim = unshared ? 6 : 4;
    odim = 4; //Can be set to 6 later for unshared backprop wrt weights

    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, dil_kH, dil_kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[wdim-2];
        kW = PyArray_DIMS(weights)[wdim-1];
    }
    else {
        if (%(height)s != -1) {
            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH_l == -2 || padH_r == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + padH_l + padH_r - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
        if (%(width)s != -1) {
            // kernel width is specified (perhaps horizontal subsampling or half padding)
            kW = %(width)s;
        }
        else if (padW_l == -2 || padW_r == -2) {
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyArray_DIMS(bottom)[3] + padW_l + padW_r - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

    // Implicit dilated kernel size
    dil_kH = (kH - 1) * dilH + 1;
    dil_kW = (kW - 1) * dilW + 1;

    // Auto-padding if requested
    if (padH_l == -1 || padH_r == -1) {  // vertical half padding
        padH_l = padH_r = dil_kH / 2;
    }
    else if (padH_l == -2 || padH_r == -2) {  // vertical full padding
        padH_l = padH_r = dil_kH - 1;
    }
    else if (padH_l < -2 || padH_r < -2) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH_l and padH_r must be >= -2");
        %(fail)s
    }
    if (padW_l == -1 || padW_r == -1) {  // horizontal half padding
        padW_l = padW_r = dil_kW / 2;
    }
    else if (padW_l == -2 || padW_r == -2) {  // horizontal full padding
        padW_l = padW_r = dil_kW - 1;
    }
    else if (padW_l < -2 || padW_r < -2) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW_l and padW_r must be >= -2");
        %(fail)s
    }

    // Infer output shape
    npy_intp out_dim[6];
    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
        {
            if (unshared) {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }
            else {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }
            %(fail)s
        }
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        if (unshared){
            odim = 6;
            out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
            out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
        }
        out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[wdim-2] = (npy_intp)kH;  // already inferred further above
        out_dim[wdim-1] = (npy_intp)kW;  // how convenient
        if (unshared) {
            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
                    || out_dim[4] <= 0 || out_dim[5] <= 0){
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        else {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r);
        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r);
        if (unshared) {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        else {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    int typenum;
    int failure;
    failure = !(*out
           && PyArray_NDIM(*out)==odim
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
           && PyArray_DIMS(*out)[3]==out_dim[3]);
    if (odim == 6){
        failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
                && PyArray_DIMS(*out)[5]==out_dim[5]);
    }
    if ( failure )
    {
        Py_XDECREF(*out);
        if (direction != 1) {
          typenum = PyArray_TYPE(weights);
        }
        else {
          typenum = PyArray_TYPE(bottom);
        }
        //Use PyArray_ZEROS here: it zero-initializes the output
        //(PyArray_EMPTY would be faster, but leaves the memory uninitialized).
        *out = (PyArrayObject*)PyArray_ZEROS(odim,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {
            if (odim == 4) {
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
            }
            if (odim == 6) {
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
                        (long long)out_dim[4], (long long)out_dim[5]);
            }
            %(fail)s
        }
    }

    // Call corrMM code
    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
                padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
    if (out2==NULL){
       %(fail)s
    }
    assert (out2 == *out);

""" % dict(
            bottom=bottom,
            weights=weights,
            top=top,
            height=height,
            width=width,
            fail=sub["fail"],
            params=sub["params"],
        )
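
The output-shape arithmetic encoded in the template above is easy to sanity-check in plain Python. Below is a minimal, illustrative sketch (the helper name and the "half"/"full" strings are ours, not part of the Op) of the forward-pass size formula and the auto-padding rules:

def corrmm_forward_size(in_size, k_size, stride=1, dil=1, pad=0):
    # Implicit dilated kernel size, as in the C template.
    dil_k = (k_size - 1) * dil + 1
    if pad == "half":        # padH_l == padH_r == dil_kH / 2
        pad_l = pad_r = dil_k // 2
    elif pad == "full":      # padH_l == padH_r == dil_kH - 1
        pad_l = pad_r = dil_k - 1
    else:                    # explicit symmetric padding
        pad_l = pad_r = int(pad)
    # top = (bottom + pad_l + pad_r - ((weight - 1) * dil + 1)) / sample + 1
    return (in_size + pad_l + pad_r - dil_k) // stride + 1

# e.g. a 32-pixel input, 5x5 kernel, stride 2, half padding -> 16
assert corrmm_forward_size(32, 5, stride=2, pad="half") == 16
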
Example #22
class CheckAndRaise(COp):
    """An `Op` that checks conditions and raises an exception if they fail.

    This `Op` returns its "value" argument if its condition arguments are all
    ``True``; otherwise, it raises a user-specified exception.

    """

    _f16_ok = True
    __props__ = ("msg", "exc_type")
    view_map = {0: [0]}

    check_input = False
    params_type = ParamsType(exc_type=exception_type)

    def __init__(self, exc_type, msg=""):

        if not issubclass(exc_type, Exception):
            raise ValueError("`exc_type` must be an Exception subclass")

        self.exc_type = exc_type
        self.msg = msg

    def __str__(self):
        return f"CheckAndRaise{{{self.exc_type}({self.msg})}}"

    def __eq__(self, other):
        if type(self) != type(other):
            return False

        if self.msg == other.msg and self.exc_type == other.exc_type:
            return True

        return False

    def __hash__(self):
        return hash((self.msg, self.exc_type))

    def make_node(self, value: Variable, *conds: Variable):
        """

        Parameters
        ==========
        value
            The value to return if `conds` all evaluate to ``True``; otherwise,
            `self.exc_type` is raised.
        conds
            The conditions to evaluate.
        """
        import aesara.tensor as at

        if not isinstance(value, Variable):
            value = at.as_tensor_variable(value)

        conds = [at.as_tensor_variable(c) for c in conds]

        assert all(c.type.ndim == 0 for c in conds)

        return Apply(
            self,
            [value] + conds,
            [value.type()],
        )

    def perform(self, node, inputs, outputs, params):
        (out, ) = outputs
        val, *conds = inputs
        out[0] = val
        if not np.all(conds):
            raise self.exc_type(self.msg)

    def grad(self, input, output_gradients):
        return output_gradients + [DisconnectedType()()] * (len(input) - 1)

    def connection_pattern(self, node):
        return [[1]] + [[0]] * (len(node.inputs) - 1)

    def c_code(self, node, name, inames, onames, props):
        value_name, *cond_names = inames
        out_name = onames[0]
        check = []
        fail_code = props["fail"]
        param_struct_name = props["params"]
        msg = self.msg.replace('"', '\\"').replace("\n", "\\n")
        for idx, cond_name in enumerate(cond_names):
            check.append(f"""
        if(PyObject_IsTrue((PyObject *){cond_name}) == 0) {{
            PyObject * exc_type = {param_struct_name}->exc_type;
            Py_INCREF(exc_type);
            PyErr_SetString(exc_type, "{msg}");
            Py_XDECREF(exc_type);
            {indent(fail_code, " " * 4)}
        }}
                """)
        check = "\n".join(check)
        res = f"""
        {check}
        Py_XDECREF({out_name});
        {out_name} = {value_name};
        Py_INCREF({value_name});
        """
        return res

    def c_code_cache_version(self):
        return (1, 0)

    def infer_shape(self, fgraph, node, input_shapes):
        return [input_shapes[0]]
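
At the graph level this Op is an identity on `value` guarded by runtime checks. A minimal pure-Python sketch of the semantics implemented by `perform` and the generated C code (illustrative only, not the Aesara API):

import numpy as np

def check_and_raise(value, *conds, exc_type=AssertionError, msg=""):
    # Return `value` unchanged if every scalar condition is truthy,
    # otherwise raise the configured exception, mirroring `perform`.
    if not np.all(conds):
        raise exc_type(msg)
    return value

x = np.arange(4.0)
y = check_and_raise(x, bool((x >= 0).all()), msg="x must be non-negative")
assert y is x
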
Example #23
class GpuImages2Neibs(GpuKernelBaseCOp, Images2Neibs, _NoPythonOp):
    """
    Images2Neibs for the GPU.

    """

    params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, ten4, neib_shape, neib_step=None):
        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = aet.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
            neib_step = aet.as_tensor_variable(neib_step)

        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in integer_dtypes
        assert neib_step.dtype in integer_dtypes

        return Apply(
            self,
            [ten4, neib_shape, neib_step],
            [
                GpuArrayType(
                    broadcastable=(False, False),
                    dtype=ten4.type.dtype,
                    context_name=ten4.type.context_name,
                )()
            ],
        )

    def c_code_cache_version(self):
        return (14,)

    def c_headers(self, **kwargs):
        return ["<numpy_compat.h>", "<gpuarray/types.h>"]

    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """#include "cluda.h"

        // a version that uses fewer registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED){
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }""" % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """#include "cluda.h"

        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (ga_int i = LID_1; i < c; i+=LDIM_1)
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                // loop over d
                                for (ga_int j = LID_0; j < d; j+=LDIM_0)
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }
        """ % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)
        )
        return kernels

    def c_support_code(self, **kwargs):
        return """
        template <typename T>
        static T ceil_intdiv(T a, T b)
        {
            return (a/b) + ((a % b) ? 1: 0);
        }
        """

    def c_code(self, node, name, inp, out, sub):
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: *fptr: %%s.",
                             GpuKernel_error(fptr, err));
                %(fail)s;
            }
        """ % dict(
            fail=sub["fail"]
        )

        # NB: To reduce C code variability:
        # For itemsize_ten4, use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize.
        # For itemsize_z, reuse itemsize_ten4, as ten4 and z have the same type properties (deduced from make_node).
        # For typecode_z, use ten4->ga.typecode (for the same reason as above).
        return """
        int grid_c = -1;
        int grid_d = -1;
        size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
        size_t itemsize_z = itemsize_ten4;
        int typecode_z = %(ten4)s->ga.typecode;

        {
            if (PyGpuArray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuImages2Neibs: neib_shape has to contain two"
                             " elements");
                %(fail)s;
            }

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if (step_x <=0 || step_y <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_step wrong step ; values <= 0. Got %%lld %%lld.",
                             (long long) step_x, (long long) step_y);
                %(fail)s;
            }

            if (c <=0 || d <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape values <= 0. Got %%lld %%lld.",
                             (long long)c, (long long)d);
                %(fail)s;
            }

            if (%(params)s->mode == MODE_WRAP_CENTERED) {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
                     PyGpuArray_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in wrap_centered mode,"
                                 " don't support image shapes smaller then"
                                 " the patch shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, PyGpuArray_DIMS(%(ten4)s)[2],
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
                                     (size_t)step_x);
                grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
                                     (size_t)step_y);


            } else if (%(params)s->mode == MODE_VALID) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_HALF) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
            } else if (%(params)s->mode == MODE_FULL) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                 ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
            } else {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
                 %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * PyGpuArray_DIMS(%(ten4)s)[1]
                                * PyGpuArray_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0)
                || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                size_t dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, typecode_z,
                                    GA_C_ORDER, %(params)s->context, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
                                    " failed to alloc z output");
                    %(fail)s;
                }
            }

        }

        { // NESTED SCOPE

            const int mode = %(params)s->mode;
            const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
            const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
            const int height = PyGpuArray_DIMS(%(ten4)s)[2];
            const int width = PyGpuArray_DIMS(%(ten4)s)[3];

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            size_t threads_per_block[3] = {d, c, 1};
            //get the max threads per blocks
            size_t max_threads_dim;
            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
            if (err != GA_NO_ERROR){
                PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
                %(fail)s;
            }
            while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--;
            while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--;

            //Make the block bigger to get a better memory access pattern and
            //higher core utilisation for small patch sizes.

            while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
                  threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
                threads_per_block[2]++;
            }
            int nb_block;
            if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
                nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
            else
                nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
            size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};

            GpuKernel *fptr;
            if(threads_per_block[0]==d && threads_per_block[1]==c){
                fptr = &k_multi_warp_less_%(name)s;
            }else{
                fptr = &k_multi_warp_%(name)s;
            }
            /*
            printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
                   max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
                   n_blocks[0], n_blocks[1], n_blocks[2]);
            */
            size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
            size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
            size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
            size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
            size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
            size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
            void *kernel_params[] = {(void *)&mode,
                                     (void *)&nb_batch,
                                     (void *)&nb_stack,
                                     (void *)&height, (void *)&width,
                                     (void *)&c, (void *)&d,
                                     (void *)&step_x, (void *)&step_y,
                                     (void *)&grid_c, (void *)&grid_d,
                                     (void *)&stride_A0,
                                     (void *)&stride_A1,
                                     (void *)&stride_A2,
                                     (void *)&stride_A3,
                                     (void *)%(ten4)s->ga.data,
                                     (void *)&%(ten4)s->ga.offset,
                                     (void *)&stride_Z0,
                                     (void *)&stride_Z1,
                                     (void *)%(z)s->ga.data,
                                     (void *)&%(z)s->ga.offset};
            err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
            %(err_check)s
        } // END NESTED SCOPE
        """ % dict(
            ten4=inp[0],
            neib_shape=inp[1],
            neib_step=inp[2],
            z=out[0],
            dtype_neib_shape=node.inputs[1].dtype,
            dtype_neib_step=node.inputs[2].dtype,
            err_check=err_check,
            name=name,
            params=sub["params"],
            fail=sub["fail"],
        )
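
The shape of `z` allocated in the C code follows directly from the patch grid. A small illustrative sketch (plain Python, hypothetical helper) of the "valid" arithmetic used above:

def images2neibs_valid_shape(ten4_shape, neib_shape, neib_step):
    b, s, h, w = ten4_shape
    c, d = neib_shape
    step_x, step_y = neib_step
    # number of patches along height and width; "valid" mode additionally
    # checks that (h - c) and (w - d) are multiples of the steps
    grid_c = 1 + (h - c) // step_x
    grid_d = 1 + (w - d) // step_y
    # z has shape (b * s * grid_c * grid_d, c * d)
    return (b * s * grid_c * grid_d, c * d)

# a (2, 3, 8, 8) stack cut into non-overlapping 4x4 patches -> (24, 16)
assert images2neibs_valid_shape((2, 3, 8, 8), (4, 4), (4, 4)) == (24, 16)
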
Example #24
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool_ave_grad.c"],
                                "APPLY_SPECIFIC(ave_pool_grad)")
        assert mode in ("sum", "average_inc_pad", "average_exc_pad")
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_headers(self, **kwargs):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.mode == "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        (ggx, ) = grads
        return [
            aesara.tensor.zeros_like(x),
            GpuPool(ignore_border=self.ignore_border,
                    ndim=self.ndim,
                    mode=self.mode)(ggx, ws, stride, pad),
        ] + [aesara.gradient.DisconnectedType()() for i in inp[2:]]

    def connection_pattern(self, node):
        return [[1], [1], [0], [0], [0]]
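
The actual kernel lives in c_code/pool_ave_grad.c, but the computation is simple to state. Below is a minimal NumPy sketch of the average-pooling gradient for the easiest 2D case (no padding, window equal to stride, "average_inc_pad"); the helper is illustrative only:

import numpy as np

def ave_pool_grad_2d(x_shape, gz, ws):
    # Spread each output gradient uniformly over its (non-overlapping) window.
    gx = np.zeros(x_shape, dtype=gz.dtype)
    wh, ww = ws
    for i in range(gz.shape[-2]):
        for j in range(gz.shape[-1]):
            gx[..., i * wh:(i + 1) * wh, j * ww:(j + 1) * ww] = \
                gz[..., i:i + 1, j:j + 1] / (wh * ww)
    return gx

gz = np.ones((1, 1, 2, 2), dtype="float32")
assert np.allclose(ave_pool_grad_2d((1, 1, 4, 4), gz, (2, 2)), 0.25)
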
Example #25
class GpuSparseBlockGemv(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be called directly, since the interface is subject
    to change without notice.  Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    __props__ = ("inplace", )
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    # NB: DTYPE_INPUT_* is used in the C code, so check_input should not be set to False.

    def __init__(self, inplace=False):
        super().__init__("c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx,
                                       outputIdx)
        hgrad = gpu_sparse_block_gemv(h.zeros_like(), W.dimshuffle(
            (1, 0, 3, 2)), go, outputIdx, inputIdx)
        return [
            go,
            Wgrad,
            hgrad,
            grad_undefined(self, 3, inputIdx,
                           "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx,
                           "grad of outputIdx makes no sense"),
        ]
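
For orientation, SparseBlockGemv computes, for each batch element and each output block, a sum of small dense gemvs over the selected input blocks. A rough NumPy sketch of that contraction, assuming the usual shapes o:(batch, oWin, oSize), W:(iBlocks, oBlocks, iSize, oSize), h:(batch, iWin, iSize), inputIdx:(batch, iWin), outputIdx:(batch, oWin):

import numpy as np

def sparse_block_gemv_ref(o, W, h, inputIdx, outputIdx):
    # out[b, j] = o[b, j] + sum_i h[b, i] @ W[inputIdx[b, i], outputIdx[b, j]]
    out = o.copy()
    for b in range(h.shape[0]):
        for j in range(o.shape[1]):
            for i in range(h.shape[1]):
                out[b, j] += h[b, i] @ W[inputIdx[b, i], outputIdx[b, j]]
    return out
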
Example #26
class GpuCumOp(GpuKernelBaseCOp, _NoPythonOp):
    """
    Parameters
    ----------
    axis
        Cannot be None. If you want the array flattened, do it beforehand.
    """

    SUPPORTED_NDIMS = 3
    __props__ = ("axis", "mode")
    params_type = ParamsType(axis=scalar.int32, context=gpu_context_type)

    def __init__(self, axis, mode="add"):
        assert axis is not None
        self.axis = int(axis)
        self.mode = mode

    def __eq__(self, other):
        if type(other) != type(self):
            return False
        return self.axis == other.axis and self.mode == other.mode

    def __hash__(self):
        return hash(self.axis) ^ hash(self.mode)

    def c_code_cache_version(self):
        return (7, )

    def c_headers(self, **kwargs):
        return [
            "<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>"
        ]

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def make_node(self, x):
        assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"

        context_name = infer_context_name(x)

        x = as_gpuarray_variable(x, context_name)

        if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
            raise NotImplementedError("Only cum op on 1D, 2D and\
                                       3D arrays are supported right now!")

        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError(f"axis(={self.axis}) out of bounds")
        return Apply(self, [x], [x.type()])

    def gpu_kernels(self, node, nodename):
        kernels = []
        # cumadd
        kname = "k_cumadd"
        op = {"mul": "*", "add": "+"}[self.mode]
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
                              ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
                              const int offsetY, const int offsetZ,
                              const int beforeLastElementIdx, const int lastElementIdx){
            input = (float *)(((char *)input) + input_offset);
            output = (float *)(((char *)output) + output_offset);
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;

            int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
            int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
            int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
            int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
            int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
            output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
            }
        """ % locals())
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "intc",
            "intc",
            "intc",
            "intc",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        # blockCumOp
        kname = "k_blockCumOp"
        k_var = "k_blockCumOp_" + nodename
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "int32",
            "int32",
            gpuarray.GpuArray,
            gpuarray.SIZE,
        ]
        code = ("""#include "cluda.h"

        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumOp) {
            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if (index < blockDim.x*2) {
                    partialCumOp[index] %(op)s= partialCumOp[index - stride];
                }
            }
        }

        WITHIN_KERNEL
        void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
                         ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                         int offsetY, int offsetZ) {
            // blockIdx.y and blockIdx.z identify the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            partialCumOp[threadIdx.x*2]     = input[idx_even];
            partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
        }

        WITHIN_KERNEL
        void k_reversePhase(float* partialCumOp) {
            // Traverse back down the tree, building the scan from the partial sums
            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if (index + stride < blockDim.x*2) {
                    partialCumOp[index + stride] %(op)s= partialCumOp[index];
                }
            }
        }

        WITHIN_KERNEL
        void k_pushData(float* partialCumOp, float* output, int globalThreadID,
                        ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                        int offsetY, int offsetZ) {
            local_barrier();
            // blockIdx.y and blockIdx.z identify the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            output[idx_even] = partialCumOp[threadIdx.x*2];
            output[idx_odd]  = partialCumOp[threadIdx.x*2 + 1];
        }

        KERNEL void k_blockCumOp(float* input, ga_size input_offset,
                                 float* output, ga_size output_offset,
                                 size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
                                 ga_ssize inputStrides_y,  ga_ssize inputStrides_z,
                                 ga_ssize outputStrides_x, ga_ssize outputStrides_y,
                                 ga_ssize outputStrides_z, int offsetY,
                                 int offsetZ, float* blockSum, ga_size blockSum_offset) {
            input = (float *)(((char *)input) + input_offset);
            output = (float *)(((char *)output) + output_offset);
            blockSum = (float *)(((char *)blockSum) + blockSum_offset);

            // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
            // The Y and Z axes of the grid contain all the independent cum ops of the 2D/3D case.

            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
            if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
                return;
            }

            extern __shared__ float partialCumOp[];

            // Load data in shared memory
            k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

            // Use a dichotomy approach to compute the cum op (i.e. a balanced binary tree).
            // The tree is swept from the leaves to the root and then from the root back to the leaves.
            // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
            k_reductionPhase(partialCumOp);
            k_reversePhase(partialCumOp);

            // Write the final output to global memory
            k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
                }
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        # k_finalCumOp
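        # Once `blockSum` has itself been scanned (recursively, in the host driver),
        # this kernel adds the cumulative total of all preceding blocks to the
        # elements of every block except the first.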
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
        code = ("""#include "cluda.h"

        KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                 float* blockSum, ga_size blockSum_offset,
                                 size_t nbElementsPerCumOp,
                                 ga_ssize dataStrides_x,  ga_ssize dataStrides_y,  ga_ssize dataStrides_z,
                                 int offsetY, int offsetZ) {

            output = (float *)(((char *)output) + output_offset);
            blockSum = (float *)(((char *)blockSum) + blockSum_offset);

            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if the current thread has data to process.
            if (globalThreadID >= (nbElementsPerCumOp+1)/2)
                return;

            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;

            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];

            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            output[idx_even] %(op)s= currentBlockSum;
            output[idx_odd] %(op)s= currentBlockSum;
        }
        """ % locals())
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "int32",
            "int32",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels

    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != b"cuda":
            raise NotImplementedError("cuda only")
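        # The generated C snippet allocates (or reuses) the output, queries the
        # device's thread/grid limits, then calls the cumOp_<nodename> host driver
        # defined in c_support_code_struct below.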
        return """
            const size_t* shape = PyGpuArray_DIMS(%(x)s);
            bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);

            int axis = %(params)s->axis;
            if (axis < 0) {
                // Convert negative axis to positive axis.
                axis += PyGpuArray_NDIM(%(x)s);
            }

            if (aesara_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
                                   %(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
                %(fail)s;
            }

            { // Namespace for kernel calls //
                size_t max_threads_dim0;
                size_t max_grid_size1;
                size_t max_grid_size2;
                int err;
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
                    %(fail)s;
                }
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
                    %(fail)s;
                }
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
                    %(fail)s;
                }
                if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
                    %(fail)s;
                }
            }
        """ % dict(
            x=inp[0],
            z=out[0],
            nodename=nodename,
            fail=sub["fail"],
            params=sub["params"],
        )

    def c_support_code_struct(self, node, nodename):
        code = ("""

        int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
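            // Host-side driver: maps the input onto an (up to) 3D shape, permutes the
            // scanned axis onto X, launches k_blockCumOp over grid chunks bounded by the
            // device limits, recursively scans the per-block totals and folds them back
            // in with k_finalCumOp, and finally handles an odd-length axis with k_cumadd.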
            size_t shape[3] = { 1, 1, 1 };
            ssize_t inputStrides_x;
            ssize_t inputStrides_y;
            ssize_t inputStrides_z;
            ssize_t outputStrides_x;
            ssize_t outputStrides_y;
            ssize_t outputStrides_z;
            switch (PyGpuArray_NDIM(input))
            {
            case 1:
                shape[0] = PyGpuArray_DIMS(input)[0];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                break;
            case 2:
                shape[0] = PyGpuArray_DIMS(input)[0];
                shape[1] = PyGpuArray_DIMS(input)[1];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
                break;
            case 3:
                shape[0] = PyGpuArray_DIMS(input)[0];
                shape[1] = PyGpuArray_DIMS(input)[1];
                shape[2] = PyGpuArray_DIMS(input)[2];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
                inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
                outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
                break;
            default:
                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                return -1;
            }
            if (shape[axis] <= 1) {
                int err = pygpu_move(output, input);
                return err;
            }
            // Perform the cum op on an even number of elements; the trailing element of
            // an odd-length axis is handled afterwards by k_cumadd.
            size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
            // Determine the block size: each thread handles a pair of elements, capped by the device thread limit.
            size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
            // Determine how many blocks are needed in total.
            size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX);  // Nb. of blocks needed per cum op.
            size_t dimGridY;  // Nb. of independent cum ops (width).
            size_t dimGridZ;  // Nb. of independent cum ops (height).
            ssize_t tmp;
            switch (axis)
            {
            case 0:
                dimGridY = shape[1];
                dimGridZ = shape[2];
                break;
            case 1:
                dimGridY = shape[0];
                dimGridZ = shape[2];
                tmp = inputStrides_x;
                inputStrides_x = inputStrides_y;
                inputStrides_y = tmp;
                tmp = outputStrides_x;
                outputStrides_x = outputStrides_y;
                outputStrides_y = tmp;
                break;
            case 2:
                dimGridY = shape[1];
                dimGridZ = shape[0];

                tmp = inputStrides_x;
                inputStrides_x = inputStrides_z;
                inputStrides_z = tmp;

                tmp = outputStrides_x;
                outputStrides_x = outputStrides_z;
                outputStrides_z = tmp;

                break;
            default:
                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                return -1;
            }

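            // One partial total per (block along the scanned axis, independent cum op);
            // laid out as a (dimGridX, dimGridY*dimGridZ) array so it can itself be
            // scanned along axis 0 when more than one block is needed.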
            const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
                                                           GA_C_ORDER, input->context, Py_None);
            if (deviceBlockSum == NULL){
                return -1;
            }
            // Perform `maxGridY`*`maxGridZ` cum ops in parallel.
            for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
                size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);

                for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                    size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cum op per block.
                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);

                    int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
                    if (err != GA_NO_ERROR){
                        PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
                        return -1;
                    }

                    if (dimGridX > 1) {
                        // Do a cum op over the blockSum (recursive).
                        if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                            Py_DECREF(deviceBlockSum);
                            return -1;
                        }
                        // Since there is more than one block (i.e. `dimGridX > 1`),
                        // propagate the partial cum ops of previous blocks to subsequent ones.
                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {dimBlockX, 1, 1};

                        int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
                            return -1;
                        }
                    }
                    // If shape[axis] is odd, the last element is computed manually.
                    if (shape[axis] != nbElementsPerCumOp){
                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {1, 1, 1};

                        int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
                            return -1;
                        }

                    }
                }
            }
            Py_XDECREF(deviceBlockSum);
            return 0;
        }
        """ % locals())
        return super().c_support_code_struct(node, nodename) + code
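
    # Hedged usage sketch (illustrative only, not part of the Op above). Assuming the
    # Aesara gpuarray backend is configured, the graph rewriter is expected to lift a
    # regular cumulative sum onto this GPU Op, roughly as follows:
    #
    #     import aesara
    #     import aesara.tensor as at
    #
    #     x = at.matrix("x", dtype="float32")   # only float32 is supported by this Op
    #     y = at.extra_ops.cumsum(x, axis=1)
    #     f = aesara.function([x], y)           # lifted to the GPU Op when a context is active
    #
    # Whether the lift actually happens depends on the optimizer configuration; the
    # snippet above is an assumption, not a guarantee.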