Example No. 1
        def thunk():
            mask_idx = inputs[0][0]
            image = inputs[1][0]
            batch_size = min(mask_idx.shape[0], image.shape[0])
            assert shape_ok(mask_idx.shape)
            assert shape_ok(image.shape)
            mask_idx = to_gpuarray(mask_idx)
            image = to_gpuarray(image)
            s = mask_idx.shape[3]
            assert mask_idx.shape[2] == mask_idx.shape[3], \
                "height and width must be equal"

            sdata_shape = (3*len(MASK), batch_size, 1, s, s)
            self._sdata = pycuda_zeros(self._sdata, sdata_shape)
            blocks_max = 32
            blocks_s = min(blocks_max, s)
            grid_s = math.ceil(s / blocks_max)
            grid = (batch_size, grid_s, grid_s)
            block = (1, blocks_s, blocks_s)
            image_mask_split(mask_idx, image, np.int32(batch_size),
                             np.int32(s), self._sdata,
                             block=block, grid=grid)
            sdata_as_theano = to_cudandarray(self._sdata)
            m = len(MASK)
            outputs[0][0] = sdata_as_theano[:m]
            outputs[1][0] = sdata_as_theano[m:2*m]
            outputs[2][0] = sdata_as_theano[2*m:]
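The launch configuration in this thunk tiles the s x s spatial plane with (at most) 32 x 32 thread blocks and maps the batch dimension onto the first grid axis. A small sketch of that arithmetic with illustrative values (not taken from the example):

    import math

    batch_size, s, blocks_max = 16, 100, 32
    blocks_s = min(blocks_max, s)        # threads per block along each spatial axis
    grid_s = math.ceil(s / blocks_max)   # blocks needed to cover s pixels
    block = (1, blocks_s, blocks_s)      # one batch element per block slice
    grid = (batch_size, grid_s, grid_s)  # grid covers batch x tiled plane

    assert block == (1, 32, 32)
    assert grid == (16, 4, 4)
    assert grid_s * blocks_s >= s        # the tiling covers the whole plane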
Example No. 2
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:],
                                   np.float32,
                                   np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
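The output-shape bookkeeping above follows the usual real-FFT convention: the last axis shrinks to n // 2 + 1 complex coefficients, stored here as an extra trailing axis of length 2 (real, imaginary). A minimal CPU-side sketch of the same packed layout using only NumPy (the names are illustrative, not part of the example):

    import numpy as np

    def rfft_packed_shape(input_shape):
        # same arithmetic as the thunk: halve the last axis (+1),
        # then append a length-2 axis for the real/imaginary parts
        output_shape = list(input_shape)
        output_shape[-1] = output_shape[-1] // 2 + 1
        return tuple(output_shape) + (2,)

    x = np.random.rand(4, 16).astype(np.float32)        # (batch, signal length)
    spec = np.fft.rfft(x)                                # (4, 9) complex coefficients
    packed = np.stack([spec.real, spec.imag], axis=-1)
    assert packed.shape == rfft_packed_shape(x.shape)    # (4, 9, 2)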
Example No. 3
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = tuple(input_shape)

            # print 'FFT shapes:', input_shape, '->', output_shape
            # print 'Batch size:', input_shape[0]
            # print 'Core shape:', input_shape[1:-1]

            z = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(shape=input_shape[1:-1],  # Exclude batch dim and complex dim
                                   in_dtype=np.complex64,
                                   out_dtype=np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
Example No. 4
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Example No. 5
        def thunk():
            mask_idx = inputs[0][0]
            image = inputs[1][0]
            batch_size = min(mask_idx.shape[0], image.shape[0])
            assert shape_ok(mask_idx.shape)
            assert shape_ok(image.shape)
            mask_idx = to_gpuarray(mask_idx)
            image = to_gpuarray(image)
            s = mask_idx.shape[3]
            assert mask_idx.shape[2] == mask_idx.shape[3], \
                "height and width must be equal"

            sdata_shape = (3 * len(MASK), batch_size, 1, s, s)
            self._sdata = pycuda_zeros(self._sdata, sdata_shape)
            blocks_max = 32
            blocks_s = min(blocks_max, s)
            grid_s = math.ceil(s / blocks_max)
            grid = (batch_size, grid_s, grid_s)
            block = (1, blocks_s, blocks_s)
            image_mask_split(mask_idx,
                             image,
                             np.int32(batch_size),
                             np.int32(s),
                             self._sdata,
                             block=block,
                             grid=grid)
            sdata_as_theano = to_cudandarray(self._sdata)
            m = len(MASK)
            outputs[0][0] = sdata_as_theano[:m]
            outputs[1][0] = sdata_as_theano[m:2 * m]
            outputs[2][0] = sdata_as_theano[2 * m:]
Example No. 6
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
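The length restoration above assumes the original signal length was even: a half-spectrum of k = n // 2 + 1 coefficients maps back to n = (k - 1) * 2 samples. A quick illustrative check with NumPy:

    import numpy as np

    for n in (8, 16, 32):           # even signal lengths
        k = n // 2 + 1              # half-spectrum length produced by the forward op
        assert (k - 1) * 2 == n     # the inverse op's restored length
        x = np.random.rand(n).astype(np.float32)
        assert np.fft.irfft(np.fft.rfft(x), n).shape == (n,)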
Example No. 7
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Example No. 8
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])
Example No. 9
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Example No. 10
        def thunk():
            input_shape = inputs[0][0].shape

            size = input_shape[1] # matrices to invert are (size x size)
            batch_size = input_shape[0]

            z = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != input_shape:
                z[0] = cuda.CudaNdarray.zeros(input_shape)
                pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), np.int32)
                info_alloc[0] = pycuda.gpuarray.zeros(batch_size, np.int32)

            input_pycuda = to_gpuarray(inputs[0][0])
            output_pycuda = to_gpuarray(z[0])
            pivot = pivot_alloc[0]
            info = info_alloc[0]

            if not self.destructive:
                # copy before building the pointer arrays below, otherwise
                # they would still point at the original input, which the
                # in-place LU factorization would then overwrite
                input_pycuda = input_pycuda.copy()

            # construct pointer arrays for batched operations
            input_arr = bptrs(input_pycuda)
            output_arr = bptrs(output_pycuda)

            handle = scikits.cuda.misc._global_cublas_handle

            # perform LU factorization
            cublas.cublasSgetrfBatched(handle, size, input_arr.gpudata, size,
                                       pivot.gpudata, info.gpudata, batch_size)
            # the LU factorization is now in input_pycuda (destructive operation!)

            # use factorization to perform inversion
            cublas.cublasSgetriBatched(handle, size, input_arr.gpudata, size,
                                       pivot.gpudata, output_arr.gpudata, size,
                                       info.gpudata, batch_size)
            # the inverted matrices are now in output_pycuda
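The helper bptrs is not shown in this snippet. The batched cuBLAS routines (cublasSgetrfBatched / cublasSgetriBatched) take an array of per-matrix device pointers rather than a single contiguous buffer, so bptrs presumably builds one pointer per matrix, spaced one batch stride apart. A sketch in the style of the scikits.cuda examples (an assumption, not this example's own definition):

    import numpy as np
    import pycuda.gpuarray as gpuarray

    def bptrs(a):
        # one device pointer per matrix in a (batch, n, n) GPUArray,
        # each offset by the byte stride of the leading (batch) axis
        return gpuarray.arange(a.ptr,
                               a.ptr + a.shape[0] * a.strides[0],
                               a.strides[0],
                               dtype=np.uintp)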
Example No. 11
        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:],
                                   np.complex64,
                                   np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
Example No. 12
        def thunk():
            input_shape = inputs[0][0].shape
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            compute_map[node.outputs[0]][0] = True
Example No. 13
        def thunk():

            start = time()

            input_shape = inputs[0][0].shape

            size = input_shape[1]  # matrices to invert are (size x size)
            batch_size = input_shape[0]

            z = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != input_shape:
                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(input_shape)
                pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), numpy.int32)
                info_alloc[0] = pycuda.gpuarray.zeros(batch_size, numpy.int32)

            input_pycuda = to_gpuarray(inputs[0][0])
            output_pycuda = to_gpuarray(z[0])
            pivot = pivot_alloc[0]
            info = info_alloc[0]

            init = time()

            print('init time:{0}'.format(init - start))

            if not self.destructive:
                input_pycuda = input_pycuda.copy()  # to prevent destruction of the input

            # construct pointer arrays for batched operations
            input_arr = bptrs(input_pycuda)
            output_arr = bptrs(output_pycuda)

            alloc = time()

            print('allocation time:{0}'.format(alloc - init))

            handle = scikits.cuda.misc._global_cublas_handle

            # perform LU factorization
            cublas.cublasSgetrfBatched(handle, size, input_arr.gpudata, size,
                                       pivot.gpudata, info.gpudata, batch_size)
            # the LU factorization is now in input_pycuda (destructive operation!)

            LU = time()

            print('LU time:{0}'.format(LU - alloc))

            # use factorization to perform inversion
            cublas.cublasSgetriBatched(handle, size, input_arr.gpudata, size, pivot.gpudata, output_arr.gpudata, size,
                                       info.gpudata, batch_size)
            # the inverted matrices are now in output_pycuda

            inv = time()

            print('inv time:{0}'.format(inv - LU))

            print('total time: {0}'.format(inv - start))
Example No. 14
        def thunk():
            inp = inputs[0][0]
            filters = inputs[1][0]

            # output_shape = self.input_shape
            # output_shape[-1] = (output_shape[-1] - 1) * 2 # restore full signal length
            # output_shape = tuple(output_shape)

            z = outputs[0]
            # batch size, input channels, input dims 0, 1, 2
            b, ic, i0, i1, i2 = self.input_shape
            # output channels, input channels, filter dims 0, 1, 2
            oc, ic_, f0, f1, f2 = self.filter_shape
            # output shape of a 'valid' convolution; use a tuple so the
            # shape comparison below can succeed
            output_shape = (b, oc, i0 - f0 + 1, i1 - f1 + 1, i2 - f2 + 1)

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = cuda.CudaNdarray.zeros(output_shape)

            output_pycuda = to_gpuarray(z[0])

            print "Perform Conv"
            output_pycuda = conv.conv3d_fft(inp, filters, output_pycuda,
                                            self.input_shape,
                                            self.filter_shape)
            print "End of conv Conv"
Example No. 15
        def thunk():
            inp = inputs[0][0]
            filters = inputs[1][0]

            # output_shape = self.input_shape
            # output_shape[-1] = (output_shape[-1] - 1) * 2 # restore full signal length
            # output_shape = tuple(output_shape)

            z = outputs[0]
            # batch size, input channels, input dims 0, 1, 2
            b, ic, i0, i1, i2 = self.input_shape
            # output channels, input channels, filter dims 0, 1, 2
            oc, ic_, f0, f1, f2 = self.filter_shape
            # output shape of a 'valid' convolution; use a tuple so the
            # shape comparison below can succeed
            output_shape = (b, oc, i0 - f0 + 1, i1 - f1 + 1, i2 - f2 + 1)

            # only allocate if there is no previous allocation of the right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = cuda.CudaNdarray.zeros(output_shape)

            output_pycuda = to_gpuarray(z[0])


            print "Perform Conv"
            output_pycuda = conv.conv3d_fft(inp, filters,
                                            output_pycuda,
                                            self.input_shape,
                                            self.filter_shape)
            print "End of conv Conv"
Example No. 16
def pycuda_zeros(arr, shape):
    if arr is None or arr.shape != shape:
        arr = gpuarray.zeros(shape, dtype=np.float32)
    elif not isinstance(arr, gpuarray.GPUArray):
        # reuse the existing allocation, converting it to a GPUArray view
        arr = to_gpuarray(arr)
    # zero the buffer in place before reuse
    pycu.memset_d32(arr.gpudata, 0, arr.size)
    return arr
Example No. 17
def pycuda_zeros(arr, shape):
    if arr is None or arr.shape != shape:
        arr = gpuarray.zeros(shape, dtype=np.float32)
    elif not isinstance(arr, gpuarray.GPUArray):
        # reuse the existing allocation, converting it to a GPUArray view
        arr = to_gpuarray(arr)
    # zero the buffer in place before reuse
    pycu.memset_d32(arr.gpudata, 0, arr.size)
    return arr
Example No. 18
        def thunk():
            bx = inputs[0]
            by = inputs[1]

            input_shape_x = bx[0].shape # (batch, a, b)
            input_shape_y = by[0].shape # (batch, b, c)

            output_shape = (input_shape_x[0], input_shape_x[1], input_shape_y[2]) # (batch, a, c)

            bz = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if bz[0] is None or bz[0].shape != output_shape:
                bz[0] = cuda.CudaNdarray.zeros(output_shape)

            input_bx_pycuda = to_gpuarray(bx[0])
            input_by_pycuda = to_gpuarray(by[0])
            output_b_pycuda = to_gpuarray(bz[0])

            # fancy native batched version
            gpu_dot_batched(input_bx_pycuda, input_by_pycuda, output_b_pycuda)
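For reference, the shape contract computed in this thunk matches a plain batched matrix product over the leading axis; the NumPy check below (illustrative values only) mirrors what gpu_dot_batched is expected to produce:

    import numpy as np

    batch, a, b, c = 8, 5, 6, 7
    bx = np.random.rand(batch, a, b).astype(np.float32)   # (batch, a, b)
    by = np.random.rand(batch, b, c).astype(np.float32)   # (batch, b, c)
    bz = np.matmul(bx, by)                                 # per-batch matrix product
    assert bz.shape == (batch, a, c)                       # same as output_shape above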
Example No. 19
def test_to_gpuarray():
    cx = cuda.CudaNdarray.zeros((5, 4))

    px = to_gpuarray(cx)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = numpy.asarray(1, dtype="float32")
    # Check that they share the same memory space
    assert px.gpudata == cx.gpudata
    assert numpy.asarray(cx[0, 0]) == 1

    assert numpy.allclose(numpy.asarray(cx), px.get())
    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    # CudaNdarray strides are in elements; GPUArray strides are in bytes (4 per float32)
    assert all(numpy.asarray(cx._strides) * 4 == px.strides)

    # Test when the CudaNdarray is strided
    cx = cx[::2, ::]
    px = to_gpuarray(cx, copyif=True)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = numpy.asarray(2, dtype="float32")

    # Check that they do not share the same memory space
    assert px.gpudata != cx.gpudata
    assert numpy.asarray(cx[0, 0]) == 2
    assert not numpy.allclose(numpy.asarray(cx), px.get())

    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    assert not all(numpy.asarray(cx._strides) * 4 == px.strides)

    # Test that a ValueError is raised when the strided array cannot be
    # wrapped without a copy
    try:
        px = to_gpuarray(cx)
        assert False
    except ValueError:
        pass
Example No. 20
def test_to_gpuarray():
    cx = cuda.CudaNdarray.zeros((5, 4))

    px = to_gpuarray(cx)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = numpy.asarray(1, dtype="float32")
    # Check that they share the same memory space
    assert px.gpudata == cx.gpudata
    assert numpy.asarray(cx[0, 0]) == 1

    assert numpy.allclose(numpy.asarray(cx), px.get())
    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    # CudaNdarray strides are in elements; GPUArray strides are in bytes (4 per float32)
    assert all(numpy.asarray(cx._strides) * 4 == px.strides)

    # Test when the CudaNdarray is strided
    cx = cx[::2, ::]
    px = to_gpuarray(cx, copyif=True)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = numpy.asarray(2, dtype="float32")

    # Check that they do not share the same memory space
    assert px.gpudata != cx.gpudata
    assert numpy.asarray(cx[0, 0]) == 2
    assert not numpy.allclose(numpy.asarray(cx), px.get())

    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    assert not all(numpy.asarray(cx._strides) * 4 == px.strides)

    # Test that a ValueError is raised when the strided array cannot be
    # wrapped without a copy
    try:
        px = to_gpuarray(cx)
        assert False
    except ValueError:
        pass
Example No. 21
        def thunk():
            grad = outputs[0][0]
            mask_idx = inputs[0][0]
            assert shape_ok(mask_idx.shape)
            s = mask_idx.shape[3]
            block_dim = min(32, s)
            grid_dim = math.ceil(s / block_dim)
            mask_idx = to_gpuarray(mask_idx, copyif=True)

            image = inputs[1][0]
            assert shape_ok(image.shape)
            image = to_gpuarray(image, copyif=True)

            batch_size = min(mask_idx.shape[0], image.shape[0])
            grad_shape = (batch_size, 1, s, s)
            grad = pycuda_zeros(grad, grad_shape)
            grid = (batch_size, grid_dim, grid_dim)
            block = (1, block_dim, block_dim)
            if "sum" in self.connected and "pow" in self.connected:
                og_sum = to_gpuarray(inputs[2][0], copyif=True)
                og_pow = to_gpuarray(inputs[3][0], copyif=True)
                image_mask_split_grad(mask_idx,
                                      image,
                                      og_sum,
                                      og_pow,
                                      np.int32(batch_size),
                                      np.int32(s),
                                      grad,
                                      block=block,
                                      grid=grid)
            elif "sum" in self.connected:
                og_sum = to_gpuarray(inputs[2][0], copyif=True)
                image_mask_split_grad(mask_idx,
                                      image,
                                      og_sum,
                                      np.int32(batch_size),
                                      np.int32(s),
                                      grad,
                                      block=block,
                                      grid=grid)
            elif "pow" in self.connected:
                og_pow = to_gpuarray(inputs[2][0], copyif=True)
                image_mask_split_grad(mask_idx,
                                      image,
                                      og_pow,
                                      np.int32(batch_size),
                                      np.int32(s),
                                      grad,
                                      block=block,
                                      grid=grid)
            outputs[0][0] = to_cudandarray(grad)
Example No. 22
        def thunk():
            grad = outputs[0][0]
            mask_idx = inputs[0][0]
            assert shape_ok(mask_idx.shape)
            s = mask_idx.shape[3]
            block_dim = min(32, s)
            grid_dim = math.ceil(s / block_dim)
            mask_idx = to_gpuarray(mask_idx, copyif=True)

            image = inputs[1][0]
            assert shape_ok(image.shape)
            image = to_gpuarray(image, copyif=True)

            batch_size = min(mask_idx.shape[0], image.shape[0])
            grad_shape = (batch_size, 1, s, s)
            grad = pycuda_zeros(grad, grad_shape)
            grid = (batch_size, grid_dim, grid_dim)
            block = (1, block_dim, block_dim)
            if "sum" in self.connected and "pow" in self.connected:
                og_sum = to_gpuarray(inputs[2][0], copyif=True)
                og_pow = to_gpuarray(inputs[3][0], copyif=True)
                image_mask_split_grad(
                    mask_idx, image, og_sum, og_pow,
                    np.int32(batch_size), np.int32(s), grad,
                    block=block, grid=grid)
            elif "sum" in self.connected:
                og_sum = to_gpuarray(inputs[2][0], copyif=True)
                image_mask_split_grad(
                    mask_idx, image, og_sum,
                    np.int32(batch_size), np.int32(s), grad,
                    block=block, grid=grid)
            elif "pow" in self.connected:
                og_pow = to_gpuarray(inputs[2][0], copyif=True)
                image_mask_split_grad(
                    mask_idx, image, og_pow,
                    np.int32(batch_size), np.int32(s), grad,
                    block=block, grid=grid)
            outputs[0][0] = to_cudandarray(grad)