Пример #1
0
    def gpu_kernels(self, node, name):
        dim = get_scalar_constant_value(node.inputs[1])
        flags = Kernel.get_flags(node.inputs[0].dtype)

        def_macros, undef_macros = self._macros(node, name)
        hsup = (self._hash_support_code() + "\n" + self._lookup_code())

        knames = ["build_hash", "dedup", "find_valid"]
        kcodes = [
            "".join(
                open("%s%s%s.cu" %
                     (os.path.dirname(__file__), os.path.sep, kn)).readlines())
            for kn in knames
        ]
        kcodes = [
            "\n".join([def_macros, hsup, code, undef_macros])
            for code in kcodes
        ]
        kcodes = ["#include \"cluda.h\"\n" + code for code in kcodes]

        kparams = ([
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE, SIZE, SIZE
        ], [GpuArray, SIZE, GpuArray, SIZE,
            SIZE], [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, SIZE])

        return [
            Kernel(code=kcode,
                   name="%s_%d" % (kname, dim),
                   params=kparams,
                   flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)
        ]
Пример #2
0
 def gpu_kernels(self, node, name):
     dtype_x = node.inputs[0].dtype
     type_x = gpuarray.dtype_to_ctype(dtype_x)
     dtype_y = node.outputs[0].dtype
     type_y = gpuarray.dtype_to_ctype(dtype_y)
     work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
     load_x = load_w(dtype_x)
     write_y = write_w(dtype_y)
     code = """
     #include "cluda.h"
     KERNEL void extract(const ga_ssize stridesX0, const ga_ssize stridesX1, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_ssize k, ga_size l) {
         x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
         y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
         ga_ssize coff = max(k, (ga_ssize) 0);
         ga_ssize roff = -min(k, (ga_ssize) 0);
         ga_size index = GID_0 * LDIM_0 + LID_0;
         if (index < l) {
             %(work_x)s t = %(load_x)s(x[(index + roff) * stridesX0 + (index + coff) * stridesX1]);
             y[index * stridesY0] = %(write_y)s(t);
         }
     }""" % dict(type_x=type_x, type_y=type_y, work_x=work_x, load_x=load_x, write_y=write_y, name=name)
     return [Kernel(
             code=code, name="extract",
             params=[gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SIZE],
             flags=Kernel.get_flags(dtype_x, dtype_y),
             objvar='k_extract_' + name)]
Пример #3
0
 def gpu_kernels(self, node, name):
     dtype_d = node.inputs[0].dtype
     type_d = gpuarray.dtype_to_ctype(dtype_d)
     dtype_x = node.inputs[1].dtype
     type_x = gpuarray.dtype_to_ctype(dtype_x)
     dtype_y = node.outputs[0].dtype
     type_y = gpuarray.dtype_to_ctype(dtype_y)
     work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d))
     load_d = load_w(dtype_d)
     work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
     load_x = load_w(dtype_x)
     code = """
     #include "cluda.h"
     KERNEL void binsearchsorted(const ga_ssize stridesD0, GLOBAL_MEM %(type_d)s *d, ga_size d_off, const ga_ssize stridesX0, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_size lx, ga_ssize ld) {
         d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off);
         x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
         y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
         ga_size index = threadIdx.x + blockIdx.x * blockDim.x;
         if (index < lx) {
             ga_long a = 0;
             ga_long b = (ga_long)(ld - 1);
             %(work_d)s minval = %(load_d)s(d[a]);
             %(work_d)s maxval = %(load_d)s(d[b * stridesD0]);
             %(work_x)s val = %(load_x)s(x[index * stridesX0]);
             if (val > maxval) {
                 a = (ga_long)ld;
                 b = (ga_long)ld;
             } else if (val <= minval) {
                 a = 0;
                 b = 0;
             }
             while (b - a > 0) {
                 ga_long h = (b + a) / 2;
                 %(work_d)s t = %(load_d)s(d[h * stridesD0]);
                 if (val < t) {
                     b = h;
                 } else {
                     a = h + 1;
                 }
             }
             y[index * stridesY0] = b;
         }
     }""" % dict(type_d=type_d, type_x=type_x, type_y=type_y, work_d=work_d, load_d=load_d, work_x=work_x, load_x=load_x, name=name)
     return [Kernel(
             code=code, name="binsearchsorted",
             params=[gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE],
             flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y),
             objvar='k_binsearchsorted_' + name)]
Пример #4
0
    def gpu_kernels(self, node, name):
        dt = node.inputs[0].type
        code = """
KERNEL void doublek(GLOBAL_MEM %(ctype) *out,
                   GLOBAL_MEM const %(ctype)s *a,
                   ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
        return [
            Kernel(
                code=code,
                name="doublek",
                params=[gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE],
                flags=Kernel.get_flags(dt))
        ]
    def gpu_kernels(self, node, name):
        code = """
KERNEL void axpb(GLOBAL_MEM %(ctype)s *x, GLOBAL_MEM  %(ctype)s *z, ga_size n, ga_size m) {
    for (ga_size i = LID_0; i < n; i += LDIM_0) {
        for (ga_size j = LID_0; j < m; j += LDIM_0) {
            z[i*m + j] = %(write_a)s( 2 * x[i*m + j] );
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name,
            write_a=write_w(self.dtype))
        return [
            Kernel(code=code,
                   name="axpb",
                   params=[
                       gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
                       gpuarray.SIZE
                   ],
                   flags=Kernel.get_flags(self.dtype),
                   objvar='k_axpb_' + name)
        ]
Пример #6
0
    def gpu_kernels(self, node, name):
        rdim = get_scalar_constant_value(node.inputs[2])
        vdim = get_scalar_constant_value(node.inputs[3])

        flags = Kernel.get_flags(node.inputs[0].dtype, node.inputs[1].dtype)

        def_macros, undef_macros = self._macros(node, name)
        hsup = (GpuHashTable._hash_support_code() + "\n" +
                GpuHashTable._lookup_code())

        knames = ["splat", "blur", "slice"]
        kcodes = [
            "".join(
                open("%s%s%s.cu" %
                     (os.path.dirname(__file__), os.path.sep, kn)).readlines())
            for kn in knames
        ]
        kcodes = [
            "\n".join([def_macros, hsup, code, undef_macros])
            for code in kcodes
        ]
        kparams = ([
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE
        ], [
            GpuArray, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray,
            SIZE, SIZE, SIZE
        ], [
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE
        ])

        return [
            Kernel(code=kcode,
                   name="%s_%d_%d" % (kname, rdim, vdim),
                   params=kparams,
                   flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)
        ]
Пример #7
0
    def gpu_kernels(self, node, name):
        dt = node.inputs[0].type
        code = """
KERNEL void doublek(GLOBAL_MEM %(ctype) *out,
                   GLOBAL_MEM const %(ctype)s *a,
                   ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
        return [Kernel(code=code, name="doublek",
                       params=[gpuarray.GpuArray,
                               gpuarray.GpuArray,
                               gpuarray.SIZE],
                       flags=Kernel.get_flags(dt))]
Пример #8
0
    def gpu_kernels(self, node, nodename):
        CHARMAP = dict(int32='i', uint32='I',
                       int64='l', uint64='L',
                       float16='e', float32='f', float64='d')
        dtype_in = node.inputs[0].dtype
        dtype_out = node.outputs[0].dtype
        dtype_idx = node.inputs[1].dtype
        type_in = gpuarray.dtype_to_ctype(dtype_in)
        type_out = gpuarray.dtype_to_ctype(dtype_out)
        type_idx = gpuarray.dtype_to_ctype(dtype_idx)
        flags = Kernel.get_flags(dtype_in, dtype_out, dtype_idx)
        kname = "k_vector_select_fast"
        k_var = "k_vector_select_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_select_fast(const ga_size numRowsOut,
                                      const ga_size numColsOut,
                                      const ga_ssize stridesOut0,
                                      const ga_ssize stridesOut1,
                                      GLOBAL_MEM %(type_out)s *Out,
                                      const ga_size offset_Out,
                                      const ga_size numRowsIn,
                                      const ga_size numColsIn,
                                      const ga_ssize stridesIn0,
                                      const ga_ssize stridesIn1,
                                      GLOBAL_MEM %(type_in)s *In,
                                      const ga_size offset_In,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_idx)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      GLOBAL_MEM ga_int *err)
        {
             Out = (GLOBAL_MEM %(type_out)s *)(((GLOBAL_MEM char *)Out)+offset_Out);
             In = (GLOBAL_MEM %(type_in)s *)(((GLOBAL_MEM char *)In)+offset_In);
             indices_arr = (GLOBAL_MEM %(type_idx)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsIn; j += LDIM_0)
                  {
                      ga_ssize in_row = indices_arr[i * stridesIndices];
                      if (in_row < 0)
                          in_row += numRowsIn;
                      ga_ssize out_row = i;
                      if (in_row < numRowsIn && in_row >= 0) {
                        Out[(out_row * stridesOut0) + (j * stridesOut1)] = In[(in_row * stridesIn0) + (j * stridesIn1)];
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(type_in=type_in, type_out=type_out, type_idx=type_idx,
                   tc=CHARMAP[dtype_in])
        from pygpu.gpuarray import SIZE, SSIZE
        params = [
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
            SIZE, SSIZE, gpuarray.GpuArray, SIZE,
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
Пример #9
0
    def gpu_kernels(self, node, nodename):
        # We can't rely on numpy for this, it changes with the OS
        CHARMAP = dict(
            int32="i",
            uint32="I",
            int64="l",
            uint64="L",
            float16="e",
            float32="f",
            float64="d",
        )
        dtype_x = node.inputs[0].dtype
        dtype_y = node.inputs[1].dtype
        dtype_ind = node.inputs[2].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        type_ind = gpuarray.dtype_to_ctype(dtype_ind)
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
                                      const ga_int set_instead_of_inc,
                                      GLOBAL_MEM ga_int *err)
        {
             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
                          x_row += numRowsX;
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
                        *err = 1;
                      }
                  }
             }
             return;
        }
        """ % dict(
            type_x=type_x, type_y=type_y, type_ind=type_ind, tc=CHARMAP[dtype_x]
        )
        from pygpu.gpuarray import SIZE, SSIZE

        params = [
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SIZE,
            SSIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            SIZE,
            SSIZE,
            gpuarray.GpuArray,
            SIZE,
            "int32",
            gpuarray.GpuArray,
        ]
        return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]
Пример #10
0
    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """#include "cluda.h"

        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED){
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }""" % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """#include "cluda.h"

        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (ga_int i = LID_1; i < c; i+=LDIM_1)
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                // loop over d
                                for (ga_int j = LID_0; j < d; j+=LDIM_0)
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }
        """ % dict(
            kname=kname,
            type_ten4=type_ten4,
            type_z=type_z,
            mode_constants=mode_constants,
        )
        params = [
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "intc",
            "uintp",
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
            "uintp",
            "uintp",
            gpuarray.GpuArray,
            "uintp",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels