def thunk():
    """Run the ``image_mask_split`` kernel and expose its result as three outputs.

    ``inputs``, ``outputs`` and ``self`` are not parameters; they are looked
    up in the surrounding scope, as are the helpers called here.

    Reads ``inputs[0][0]`` (mask indices) and ``inputs[1][0]`` (image).  Both
    shapes must pass ``shape_ok`` and the mask must have equal sizes on axes
    2 and 3.  The kernel writes into ``self._sdata`` of shape
    ``(3 * len(MASK), batch_size, 1, s, s)``; ``outputs[0][0]``,
    ``outputs[1][0]`` and ``outputs[2][0]`` receive the three consecutive
    ``len(MASK)``-long slices of that buffer along axis 0.
    """
    mask_idx = inputs[0][0]
    image = inputs[1][0]
    # Only as many samples as both inputs provide are processed.
    batch_size = min(mask_idx.shape[0], image.shape[0])
    assert shape_ok(mask_idx.shape)
    assert shape_ok(image.shape)
    mask_idx = to_gpuarray(mask_idx)
    image = to_gpuarray(image)

    s = mask_idx.shape[3]
    assert mask_idx.shape[2] == mask_idx.shape[3], \
        "height and width must be equal"

    m = len(MASK)
    sdata_shape = (3 * m, batch_size, 1, s, s)
    # The previous buffer is handed back so pycuda_zeros can decide whether
    # it may be reused.
    self._sdata = pycuda_zeros(self._sdata, sdata_shape)

    blocks_max = 32
    blocks_s = min(blocks_max, s)
    # Integer ceil-division.  ``math.ceil(s / blocks_max)`` truncates first
    # under Python 2 integer division (yielding 0 blocks for s < 32) and
    # returns a float there; this form gives an int on every version.
    grid_s = (s + blocks_max - 1) // blocks_max
    grid = (batch_size, grid_s, grid_s)
    block = (1, blocks_s, blocks_s)
    image_mask_split(mask_idx, image, np.int32(batch_size), np.int32(s),
                     self._sdata, block=block, grid=grid)

    sdata_as_theano = to_cudandarray(self._sdata)
    outputs[0][0] = sdata_as_theano[:m]
    outputs[1][0] = sdata_as_theano[m:2 * m]
    outputs[2][0] = sdata_as_theano[2 * m:]
def thunk():
    """Run a batched real-to-complex FFT of ``inputs[0][0]`` into ``outputs[0][0]``.

    ``inputs``, ``outputs``, ``plan`` and ``plan_input_shape`` are taken from
    the surrounding scope.  ``plan`` and ``plan_input_shape`` are one-element
    containers used to cache the FFT plan between calls.
    """
    in_shape = inputs[0][0].shape

    # The DFT of a real signal is symmetric, so only ``n // 2 + 1``
    # coefficients are kept on the last axis; an extra trailing axis of
    # length 2 holds the real and imaginary parts.
    out_shape = tuple(in_shape[:-1]) + (in_shape[-1] // 2 + 1, 2)

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    src_gpu = to_gpuarray(inputs[0][0])
    # Per the original author's note: the output array is not re-typed to
    # complex64 because scikits.cuda.fft treats it as complex64 regardless.
    dst_gpu = to_gpuarray(out_cell[0])

    # Build a new plan only when none exists or the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(in_shape[1:], np.float32, np.complex64,
                           batch=in_shape[0])

    fft.fft(src_gpu, dst_gpu, plan[0])
def thunk():
    """Run a batched complex-to-complex FFT; the output has the input's shape.

    ``inputs``, ``outputs``, ``plan`` and ``plan_input_shape`` are taken from
    the surrounding scope.  The first axis is passed to the plan as the batch
    size and the last axis is excluded from the transform shape (the original
    comment calls it the complex dimension).
    """
    in_shape = inputs[0][0].shape
    out_shape = tuple(in_shape)

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    src_gpu = to_gpuarray(inputs[0][0])
    # Per the original author's note: the output array is not re-typed to
    # complex64 because scikits.cuda.fft treats it as complex64 regardless.
    dst_gpu = to_gpuarray(out_cell[0])

    # Build a new plan only when none exists or the input shape changed.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(
            shape=in_shape[1:-1],  # exclude batch dim and complex dim
            in_dtype=np.complex64,
            out_dtype=np.complex64,
            batch=in_shape[0],
        )

    fft.fft(src_gpu, dst_gpu, plan[0])
def thunk():
    """Run a batched complex-to-complex FFT and flag the output as computed.

    ``inputs``, ``outputs``, ``plan``, ``plan_input_shape``, ``node`` and
    ``compute_map`` are taken from the surrounding scope.  The output has the
    same shape as the input.
    """
    in_shape = inputs[0][0].shape
    out_shape = in_shape

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    src_gpu = to_gpuarray(inputs[0][0])
    # Per the original author's note: the output array is not re-typed to
    # complex64 because scikits.cuda.fft treats it as complex64 regardless.
    dst_gpu = to_gpuarray(out_cell[0])

    # Build a new plan only when none exists or the input shape changed.
    # The first axis is the batch; the last axis is left out of the
    # transform shape.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(in_shape[1:-1], np.complex64, np.complex64,
                           batch=in_shape[0])

    fft.fft(src_gpu, dst_gpu, plan[0])
    # Mark the node's first output as computed.
    compute_map[node.outputs[0]][0] = True
def thunk():
    """Split images by mask index on the GPU and publish three output slices.

    ``inputs``, ``outputs`` and ``self`` come from the surrounding scope, as
    do the helpers called here.

    ``inputs[0][0]`` holds the mask indices and ``inputs[1][0]`` the image.
    Both shapes must satisfy ``shape_ok`` and the mask's axes 2 and 3 must
    have equal length ``s``.  ``image_mask_split`` writes into
    ``self._sdata``, shaped ``(3 * len(MASK), batch_size, 1, s, s)``, and the
    three outputs are the consecutive ``len(MASK)``-long slices of that
    buffer along axis 0.
    """
    mask_idx = inputs[0][0]
    image = inputs[1][0]
    # Process only the samples that exist in both inputs.
    batch_size = min(mask_idx.shape[0], image.shape[0])
    assert shape_ok(mask_idx.shape)
    assert shape_ok(image.shape)
    mask_idx = to_gpuarray(mask_idx)
    image = to_gpuarray(image)

    s = mask_idx.shape[3]
    assert mask_idx.shape[2] == mask_idx.shape[3], \
        "height and width must be equal"

    n_masks = len(MASK)
    sdata_shape = (3 * n_masks, batch_size, 1, s, s)
    # Pass the old buffer in; pycuda_zeros returns the buffer to use.
    self._sdata = pycuda_zeros(self._sdata, sdata_shape)

    blocks_max = 32
    blocks_s = min(blocks_max, s)
    # Ceil-division on integers.  The former ``math.ceil(s / blocks_max)``
    # floors before ``ceil`` runs under Python 2 integer division and yields
    # a float there; this expression is an int everywhere and equals the
    # Python 3 result for non-negative ``s``.
    grid_s = (s + blocks_max - 1) // blocks_max
    grid = (batch_size, grid_s, grid_s)
    block = (1, blocks_s, blocks_s)
    image_mask_split(mask_idx, image, np.int32(batch_size), np.int32(s),
                     self._sdata, block=block, grid=grid)

    sdata_as_theano = to_cudandarray(self._sdata)
    outputs[0][0] = sdata_as_theano[:n_masks]
    outputs[1][0] = sdata_as_theano[n_masks:2 * n_masks]
    outputs[2][0] = sdata_as_theano[2 * n_masks:]
def thunk():
    """Run a batched complex-to-real inverse FFT into ``outputs[0][0]``.

    ``inputs``, ``outputs``, ``plan`` and ``plan_input_shape`` are taken from
    the surrounding scope.  The input carries a trailing length-2 axis for
    real/imag; the output drops that axis and restores the signal length on
    the new last axis as ``(n - 1) * 2``.
    """
    in_shape = inputs[0][0].shape

    # Drop the real/imag axis, then expand the half-spectrum axis back to
    # the full signal length.
    out_shape = tuple(in_shape[:-2]) + ((in_shape[-2] - 1) * 2,)

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    # Per the original author's note: this is a float32 array with an extra
    # dimension, which scikits.cuda interprets as a complex64 array.
    src_gpu = to_gpuarray(inputs[0][0])
    dst_gpu = to_gpuarray(out_cell[0])

    # Build a new plan only when none exists or the input shape changed.
    # The plan is described in terms of the (real) output shape.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(out_shape[1:], np.complex64, np.float32,
                           batch=out_shape[0])

    fft.ifft(src_gpu, dst_gpu, plan[0])
def thunk():
    """Run a batched complex-to-complex inverse FFT and flag the output as computed.

    ``inputs``, ``outputs``, ``plan``, ``plan_input_shape``, ``node`` and
    ``compute_map`` are taken from the surrounding scope.  The output has the
    same shape as the input.
    """
    in_shape = inputs[0][0].shape
    out_shape = in_shape

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != out_shape
    if needs_alloc:
        out_cell[0] = CudaNdarray.zeros(out_shape)

    # Per the original author's note: this is a float32 array with an extra
    # dimension, which scikits.cuda interprets as a complex64 array.
    src_gpu = to_gpuarray(inputs[0][0])
    dst_gpu = to_gpuarray(out_cell[0])

    # Build a new plan only when none exists or the input shape changed.
    # The first axis is the batch; the last axis is left out of the
    # transform shape.
    plan_is_stale = plan[0] is None or plan_input_shape[0] != in_shape
    if plan_is_stale:
        plan_input_shape[0] = in_shape
        plan[0] = fft.Plan(out_shape[1:-1], np.complex64, np.complex64,
                           batch=out_shape[0])

    fft.ifft(src_gpu, dst_gpu, plan[0])
    # Mark the node's first output as computed.
    compute_map[node.outputs[0]][0] = True
def thunk():
    """Invert a batch of square matrices with the cuBLAS batched LU routines.

    ``inputs``, ``outputs``, ``self``, ``pivot_alloc`` and ``info_alloc`` are
    taken from the surrounding scope.  ``inputs[0][0]`` is read as
    ``(batch_size, size, ...)``; the result is written to ``outputs[0][0]``,
    which has the same shape as the input.

    The LU factorisation overwrites the buffer it is given.  When
    ``self.destructive`` is false the factorisation therefore runs on a copy
    of the input.
    """
    input_shape = inputs[0][0].shape
    size = input_shape[1]  # matrices to invert are (size x size)
    batch_size = input_shape[0]

    z = outputs[0]
    # Only allocate if there is no previous allocation of the right size.
    # The pivot and info work arrays are sized from the same shape, so they
    # are (re)allocated together with the output.
    if z[0] is None or z[0].shape != input_shape:
        z[0] = cuda.CudaNdarray.zeros(input_shape)
        pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), np.int32)
        info_alloc[0] = pycuda.gpuarray.zeros(batch_size, np.int32)

    input_pycuda = to_gpuarray(inputs[0][0])
    output_pycuda = to_gpuarray(z[0])
    pivot = pivot_alloc[0]
    info = info_alloc[0]

    # The copy must happen BEFORE the pointer array is built.  Previously
    # ``bptrs`` was called on the original buffer and the copy was made
    # afterwards, so cuBLAS still factorised (and overwrote) the caller's
    # input and the copy was never used.
    if not self.destructive:
        input_pycuda = input_pycuda.copy()  # to prevent destruction of the input

    # construct pointer arrays for batched operations
    input_arr = bptrs(input_pycuda)
    output_arr = bptrs(output_pycuda)

    handle = scikits.cuda.misc._global_cublas_handle

    # perform LU factorization
    cublas.cublasSgetrfBatched(handle, size, input_arr.gpudata, size,
                               pivot.gpudata, info.gpudata, batch_size)
    # the LU factorization is now in input_pycuda (destructive operation!)

    # use factorization to perform inversion
    cublas.cublasSgetriBatched(handle, size, input_arr.gpudata, size,
                               pivot.gpudata, output_arr.gpudata, size,
                               info.gpudata, batch_size)
def thunk():
    """Invert a batch of square matrices via cuBLAS and print per-stage timings.

    ``inputs``, ``outputs``, ``self``, ``pivot_alloc`` and ``info_alloc`` are
    taken from the surrounding scope.  The result is written to
    ``outputs[0][0]``, which has the same shape as ``inputs[0][0]``.  Unless
    ``self.destructive`` is set, the factorisation works on a copy of the
    input.  Five timing lines are printed on every call.
    """
    t_start = time()

    in_shape = inputs[0][0].shape
    batch_size = in_shape[0]
    size = in_shape[1]  # matrices to invert are (size x size)

    out_cell = outputs[0]
    # Reuse a previous allocation when it already has the right shape; the
    # pivot/info work arrays are allocated together with the output.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != in_shape
    if needs_alloc:
        out_cell[0] = theano.sandbox.cuda.CudaNdarray.zeros(in_shape)
        pivot_alloc[0] = pycuda.gpuarray.empty((batch_size, size), numpy.int32)
        info_alloc[0] = pycuda.gpuarray.zeros(batch_size, numpy.int32)

    src_gpu = to_gpuarray(inputs[0][0])
    dst_gpu = to_gpuarray(out_cell[0])
    pivot = pivot_alloc[0]
    info = info_alloc[0]

    t_init = time()
    print('init time:{0}'.format(t_init - t_start))

    if not self.destructive:
        # Work on a copy so the caller's input is not overwritten.
        src_gpu = src_gpu.copy()

    # Pointer arrays for the batched cuBLAS calls.
    src_ptrs = bptrs(src_gpu)
    dst_ptrs = bptrs(dst_gpu)

    t_alloc = time()
    print('allocation time:{0}'.format(t_alloc - t_init))

    handle = scikits.cuda.misc._global_cublas_handle

    # LU factorisation; it overwrites the buffer behind ``src_ptrs``.
    cublas.cublasSgetrfBatched(handle, size, src_ptrs.gpudata, size,
                               pivot.gpudata, info.gpudata, batch_size)

    t_lu = time()
    print('LU time:{0}'.format(t_lu - t_alloc))

    # Inversion from the factorisation, written to the output buffer.
    cublas.cublasSgetriBatched(handle, size, src_ptrs.gpudata, size,
                               pivot.gpudata, dst_ptrs.gpudata, size,
                               info.gpudata, batch_size)

    t_inv = time()
    print('inv time:{0}'.format(t_inv - t_lu))
    print('total time: {0}'.format(t_inv - t_start))
def thunk():
    """Run ``conv.conv3d_fft`` on ``inputs[0][0]`` with filters ``inputs[1][0]``.

    ``inputs``, ``outputs`` and ``self`` are taken from the surrounding
    scope.  The output shape is derived from ``self.input_shape`` and
    ``self.filter_shape`` (five entries each): batch and output-channel
    counts, followed by ``input - filter + 1`` for each of the three trailing
    dimensions.
    """
    inp = inputs[0][0]
    filters = inputs[1][0]
    z = outputs[0]

    # batch size, input channels, input dims 0..2
    b, _, i0, i1, i2 = self.input_shape
    # output channels, input channels, filter dims 0..2
    oc, _, f0, f1, f2 = self.filter_shape

    # Must be a tuple: the comparison below is against ``z[0].shape``, and a
    # list never compares equal to a tuple, which forced a fresh allocation
    # on every call.
    output_shape = (b, oc, i0 - f0 + 1, i1 - f1 + 1, i2 - f2 + 1)

    # only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = cuda.CudaNdarray.zeros(output_shape)

    output_pycuda = to_gpuarray(z[0])

    # Parenthesised form prints the same text under Python 2 and Python 3.
    print("Perform Conv")
    # NOTE(review): the return value is not stored back into ``z[0]``; this
    # presumably relies on conv3d_fft writing into ``output_pycuda`` in
    # place -- confirm against conv3d_fft.
    conv.conv3d_fft(inp, filters, output_pycuda,
                    self.input_shape, self.filter_shape)
    print("End of conv Conv")
def pycuda_zeros(arr, shape):
    """Return a zero-filled GPU array of ``shape``, reusing ``arr`` if it fits.

    If ``arr`` is ``None`` or its shape differs from ``shape``, a new float32
    array is allocated with ``gpuarray.zeros``.  Otherwise ``arr`` is
    converted with ``to_gpuarray`` unless its type is exactly
    ``gpuarray.GPUArray``, its memory is zeroed in place, and it is returned.
    """
    if arr is None or arr.shape != shape:
        # Nothing reusable: allocate a fresh zeroed buffer.
        return gpuarray.zeros(shape, dtype=np.float32)

    if type(arr) != gpuarray.GPUArray:
        arr = to_gpuarray(arr)
    # Zero the existing buffer: ``arr.size`` 32-bit words set to 0.
    pycu.memset_d32(arr.gpudata, 0, arr.size)
    return arr
def thunk():
    """Compute a batched matrix product into ``outputs[0][0]``.

    ``inputs`` and ``outputs`` are taken from the surrounding scope.  The
    output shape is built from axes 0 and 1 of the first operand and axis 2
    of the second operand.
    """
    lhs_cell = inputs[0]
    rhs_cell = inputs[1]
    out_cell = outputs[0]

    lhs_shape = lhs_cell[0].shape  # (batch, a, b)
    rhs_shape = rhs_cell[0].shape  # (batch, b, c)
    result_shape = (lhs_shape[0], lhs_shape[1], rhs_shape[2])  # (batch, a, c)

    # Reuse a previous allocation when it already has the right shape.
    needs_alloc = out_cell[0] is None or out_cell[0].shape != result_shape
    if needs_alloc:
        out_cell[0] = cuda.CudaNdarray.zeros(result_shape)

    lhs_gpu = to_gpuarray(lhs_cell[0])
    rhs_gpu = to_gpuarray(rhs_cell[0])
    out_gpu = to_gpuarray(out_cell[0])

    # Native batched implementation.
    gpu_dot_batched(lhs_gpu, rhs_gpu, out_gpu)
def test_to_gpuarray():
    """Check ``to_gpuarray`` on a contiguous and on a strided ``CudaNdarray``.

    Three cases are covered: a contiguous array must share memory with the
    returned ``GPUArray``; a strided array with ``copyif=True`` must yield a
    separate buffer; a strided array without ``copyif`` must raise
    ``ValueError``.
    """
    # --- contiguous input: the result aliases the source -------------------
    cuda_arr = cuda.CudaNdarray.zeros((5, 4))
    gpu_arr = to_gpuarray(cuda_arr)
    assert isinstance(gpu_arr, pycuda.gpuarray.GPUArray)

    cuda_arr[0, 0] = numpy.asarray(1, dtype="float32")
    # Same device pointer, so the write above is visible through gpu_arr.
    assert gpu_arr.gpudata == cuda_arr.gpudata
    assert numpy.asarray(cuda_arr[0, 0]) == 1
    assert numpy.allclose(numpy.asarray(cuda_arr), gpu_arr.get())
    assert gpu_arr.dtype == cuda_arr.dtype
    assert gpu_arr.shape == cuda_arr.shape
    # The test expects ``_strides`` scaled by 4 to equal the GPUArray strides.
    assert all(numpy.asarray(cuda_arr._strides) * 4 == gpu_arr.strides)

    # --- strided input with copyif=True: the result is a separate copy -----
    cuda_arr = cuda_arr[::2, ::]
    gpu_arr = to_gpuarray(cuda_arr, copyif=True)
    assert isinstance(gpu_arr, pycuda.gpuarray.GPUArray)

    cuda_arr[0, 0] = numpy.asarray(2, dtype="float32")
    # Different device pointer, so the write above is NOT visible in gpu_arr.
    assert gpu_arr.gpudata != cuda_arr.gpudata
    assert numpy.asarray(cuda_arr[0, 0]) == 2
    assert not numpy.allclose(numpy.asarray(cuda_arr), gpu_arr.get())
    assert gpu_arr.dtype == cuda_arr.dtype
    assert gpu_arr.shape == cuda_arr.shape
    assert not all(numpy.asarray(cuda_arr._strides) * 4 == gpu_arr.strides)

    # --- strided input without copyif: must be rejected --------------------
    try:
        to_gpuarray(cuda_arr)
    except ValueError:
        pass
    else:
        assert False
def thunk():
    """Run the ``image_mask_split_grad`` kernel and store the result in ``outputs[0][0]``.

    ``inputs``, ``outputs`` and ``self`` are taken from the surrounding
    scope.  ``inputs[0][0]`` is the mask-index array and ``inputs[1][0]`` the
    image; both shapes must pass ``shape_ok``.  Depending on which of
    ``"sum"`` and ``"pow"`` appear in ``self.connected``, one or two further
    inputs (starting at ``inputs[2]``) are passed to the kernel.  If neither
    is present the kernel is not launched and the gradient stays
    zero-filled.
    """
    grad = outputs[0][0]

    mask_idx = inputs[0][0]
    assert shape_ok(mask_idx.shape)
    s = mask_idx.shape[3]
    block_dim = min(32, s)
    # Integer ceil-division.  ``math.ceil(s / block_dim)`` floors first under
    # Python 2 integer division and returns a float there; this form is an
    # int on every version and equals the Python 3 result.
    grid_dim = (s + block_dim - 1) // block_dim
    mask_idx = to_gpuarray(mask_idx, copyif=True)

    image = inputs[1][0]
    assert shape_ok(image.shape)
    image = to_gpuarray(image, copyif=True)

    # Only as many samples as both inputs provide are processed.
    batch_size = min(mask_idx.shape[0], image.shape[0])
    grad_shape = (batch_size, 1, s, s)
    grad = pycuda_zeros(grad, grad_shape)
    grid = (batch_size, grid_dim, grid_dim)
    block = (1, block_dim, block_dim)

    # Collect the extra inputs in order: "sum" first (if connected), then
    # "pow".  They occupy consecutive slots starting at inputs[2].
    upstream = []
    if "sum" in self.connected:
        upstream.append(to_gpuarray(inputs[2][0], copyif=True))
    if "pow" in self.connected:
        upstream.append(
            to_gpuarray(inputs[2 + len(upstream)][0], copyif=True))

    if upstream:
        kernel_args = ([mask_idx, image] + upstream +
                       [np.int32(batch_size), np.int32(s), grad])
        image_mask_split_grad(*kernel_args, block=block, grid=grid)

    outputs[0][0] = to_cudandarray(grad)
def thunk():
    """Compute the mask-split gradient on the GPU into ``outputs[0][0]``.

    ``inputs``, ``outputs`` and ``self`` are taken from the surrounding
    scope.  ``inputs[0][0]`` (mask indices) and ``inputs[1][0]`` (image) must
    pass ``shape_ok``.  ``self.connected`` selects which additional inputs
    are forwarded to ``image_mask_split_grad``:

    * both ``"sum"`` and ``"pow"``: ``inputs[2]`` and ``inputs[3]``;
    * exactly one of them: ``inputs[2]`` only;
    * neither: the kernel is skipped and the gradient stays zero-filled.
    """
    grad = outputs[0][0]

    mask_idx = inputs[0][0]
    assert shape_ok(mask_idx.shape)
    s = mask_idx.shape[3]
    block_dim = min(32, s)
    # Ceil-division on integers.  The former ``math.ceil(s / block_dim)``
    # floors before ``ceil`` runs under Python 2 integer division and yields
    # a float there; this expression is always an int and equals the
    # Python 3 result.
    grid_dim = (s + block_dim - 1) // block_dim
    mask_idx = to_gpuarray(mask_idx, copyif=True)

    image = inputs[1][0]
    assert shape_ok(image.shape)
    image = to_gpuarray(image, copyif=True)

    # Process only the samples that exist in both inputs.
    batch_size = min(mask_idx.shape[0], image.shape[0])
    grad_shape = (batch_size, 1, s, s)
    grad = pycuda_zeros(grad, grad_shape)
    grid = (batch_size, grid_dim, grid_dim)
    block = (1, block_dim, block_dim)

    has_sum = "sum" in self.connected
    has_pow = "pow" in self.connected
    if has_sum and has_pow:
        extra = [to_gpuarray(inputs[2][0], copyif=True),
                 to_gpuarray(inputs[3][0], copyif=True)]
    elif has_sum or has_pow:
        # A single connected output always arrives in slot 2.
        extra = [to_gpuarray(inputs[2][0], copyif=True)]
    else:
        extra = []

    if extra:
        kernel_args = ([mask_idx, image] + extra +
                       [np.int32(batch_size), np.int32(s), grad])
        image_mask_split_grad(*kernel_args, block=block, grid=grid)

    outputs[0][0] = to_cudandarray(grad)