def thunk():
    input_shape = inputs[0][0].shape

    # construct output shape
    output_shape = list(input_shape)
    # DFT of real input is symmetric, no need to store
    # redundant coefficients
    output_shape[-1] = output_shape[-1] // 2 + 1
    # extra dimension with length 2 for real/imag
    output_shape += [2]
    output_shape = tuple(output_shape)

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])
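# Hedged reference sketch (not part of the original source): the output
# shape logic above mirrors NumPy's real-to-complex FFT, where the last
# transformed axis shrinks to n // 2 + 1 and the complex result can be
# split into a trailing real/imag axis of length 2.
import numpy as np

def rfft_reference(x):
    # x is a real float32 batch of shape (batch, ...); transform all
    # axes except the leading batch axis, as the cuFFT plan above does.
    axes = tuple(range(1, x.ndim))
    y = np.fft.rfftn(x, axes=axes)  # last transformed axis has length n // 2 + 1
    return np.stack([y.real, y.imag], axis=-1).astype(np.float32)

# e.g. an input of shape (64, 32, 32) yields an output of shape (64, 32, 17, 2)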
def thunk():
    input_shape = inputs[0][0].shape

    # construct output shape
    # chop off the extra length-2 dimension for real/imag
    output_shape = list(input_shape[:-1])
    # restore full signal length
    output_shape[-1] = (output_shape[-1] - 1) * 2
    output_shape = tuple(output_shape)

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # input_pycuda is a float32 array with an extra dimension,
    # but will be interpreted by scikits.cuda as a complex64
    # array instead.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                           batch=output_shape[0])

    fft.ifft(input_pycuda, output_pycuda, plan[0])
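# Hedged reference sketch (not part of the original source): the inverse
# op above undoes the shape change, dropping the trailing real/imag axis
# and restoring the full (assumed even) length of the last transformed
# axis. A NumPy equivalent, up to scaling, would be:
import numpy as np

def irfft_reference(x):
    # x has shape (batch, ..., n // 2 + 1, 2) with real/imag stacked last.
    y = x[..., 0] + 1j * x[..., 1]
    axes = tuple(range(1, y.ndim))
    # np.fft.irfftn normalises by the transform size; cuFFT's inverse is
    # unnormalised by default, so any rescaling has to happen elsewhere.
    return np.fft.irfftn(y, axes=axes).astype(np.float32)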
def thunk():
    input_shape = inputs[0][0].shape
    output_shape = input_shape

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])

    compute_map[node.outputs[0]][0] = True
def thunk():
    input_shape = inputs[0][0].shape
    output_shape = input_shape

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # input_pycuda is a float32 array with an extra dimension,
    # but will be interpreted by scikits.cuda as a complex64
    # array instead.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(output_shape[1:-1], np.complex64, np.complex64,
                           batch=output_shape[0])

    fft.ifft(input_pycuda, output_pycuda, plan[0])

    compute_map[node.outputs[0]][0] = True
def thunk():
    input_shape = inputs[0][0].shape

    # construct output shape
    output_shape = tuple(input_shape)

    # print 'FFT shapes:', input_shape, '->', output_shape
    # print 'Batch size:', input_shape[0]
    # print 'Core shape:', input_shape[1:-1]

    z = outputs[0]

    # only allocate if there is no previous allocation of the right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = CudaNdarray.zeros(output_shape)

    input_pycuda = to_gpuarray(inputs[0][0])
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out scikits.cuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = to_gpuarray(z[0])

    # only initialise plan if necessary
    if plan[0] is None or plan_input_shape[0] != input_shape:
        plan_input_shape[0] = input_shape
        plan[0] = fft.Plan(shape=input_shape[1:-1],  # exclude batch dim and complex dim
                           in_dtype=np.complex64,
                           out_dtype=np.complex64,
                           batch=input_shape[0])

    fft.fft(input_pycuda, output_pycuda, plan[0])
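# Hedged reference sketch (not part of the original source): the
# complex-to-complex plan above transforms every axis except the leading
# batch axis and the trailing real/imag axis. In NumPy terms, assuming
# the same (batch, ..., 2) float32 layout:
import numpy as np

def c2c_fft_reference(x):
    y = (x[..., 0] + 1j * x[..., 1]).astype(np.complex64)  # view as complex
    axes = tuple(range(1, y.ndim))                          # skip the batch axis
    y = np.fft.fftn(y, axes=axes)
    return np.stack([y.real, y.imag], axis=-1).astype(np.float32)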
def test_cross_map_norm_noncontiguous_grad():
    # Check the case reported at
    # https://groups.google.com/d/topic/pylearn-users/KxIYc3hczf4/discussion
    x = cuda_ftensor4('x')
    x_shuffled = x.dimshuffle(1, 2, 3, 0)
    x_shuffled = gpu_contiguous(x_shuffled)
    response_norm = CrossMapNorm(size_f=16, add_scale=(15. / 16.),
                                 pow_scale=1, blocked=True)
    output_shuffled = response_norm(x_shuffled)[0]
    output = output_shuffled.dimshuffle(3, 0, 1, 2)
    cost = output.sum()
    cost.name = 'cost'
    grad_x = theano.grad(cost, x)
    f = theano.function([x], grad_x, mode=mode_with_gpu)
    x_val = CudaNdarray(numpy.ones((2, 16, 2, 2), dtype='float32'))
    f(x_val)
def thunk():
    bx = inputs[0]
    by = inputs[1]

    input_shape_x = bx[0].shape  # (batch, a, b, 2)
    input_shape_y = by[0].shape  # (batch, b, c, 2)

    output_shape = (input_shape_x[0], input_shape_x[1],
                    input_shape_y[2], 2)  # (batch, a, c, 2)

    bz = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if bz[0] is None or bz[0].shape != output_shape:
        bz[0] = CudaNdarray.zeros(output_shape)

    input_bx_pycuda = to_complex_gpuarray(bx[0])
    input_by_pycuda = to_complex_gpuarray(by[0])
    output_b_pycuda = to_complex_gpuarray(bz[0])

    # fancy native batched version
    sc_complex_dot_batched(input_bx_pycuda, input_by_pycuda,
                           output_b_pycuda)
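# Hedged reference sketch (not part of the original source): a NumPy
# equivalent of the batched complex dot above, with the trailing axis
# holding the real/imag parts, assuming shapes (batch, a, b, 2) and
# (batch, b, c, 2):
import numpy as np

def batched_complex_dot_reference(bx, by):
    cx = bx[..., 0] + 1j * bx[..., 1]  # (batch, a, b) complex
    cy = by[..., 0] + 1j * by[..., 1]  # (batch, b, c) complex
    cz = np.matmul(cx, cy)             # batched matrix product -> (batch, a, c)
    return np.stack([cz.real, cz.imag], axis=-1).astype(np.float32)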
def thunk():
    global cusolver_handle

    # Size of the matrices to invert.
    z = outputs[0]

    # Matrix.
    A = inputs[0][0]

    # Solution vectors.
    b = inputs[1][0]

    # A is not explicitly converted between C and F order, instead we
    # switch the "transpose" flag.
    if self.trans in ('T', 'C'):
        trans = 'N'
    else:
        trans = 'T'

    # Convert b to F-order from C-order.
    b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

    # This copy forces allocation of a new C-contiguous buffer
    # and returns it.
    A_cpy = A.copy()
    b_cpy = b_cpy.copy()

    assert len(A.shape) == 2
    assert len(b.shape) == 2

    if trans in ['T', 'C']:
        trans = 1
        l, n = A.shape
        k, m = b.shape
        if n != k:
            raise ValueError('A and b must be aligned.')
    elif trans in ['N']:
        trans = 0
        n, l = A.shape
        k, m = b.shape
        if l != m:
            raise ValueError('A and b must be aligned.')
    else:
        raise ValueError('Invalid value for trans')

    lda = max(1, n)
    ldb = max(1, n, l)

    A_ptr = A_cpy.gpudata
    b_ptr = b_cpy.gpudata

    if cusolver_handle is None:
        cusolver_handle = cusolver.cusolverDnCreate()
        print('cusolver handle', cusolver_handle)

    workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
        cusolver_handle, m, n, A_ptr, lda)

    if (thunk.workspace is None or
            thunk.workspace.size != workspace_size):
        thunk.workspace = CudaNdarray.zeros((workspace_size,))

    if thunk.pivots is None or thunk.pivots.size != min(m, n):
        thunk.pivots = CudaNdarray.zeros((min(m, n),))

    if thunk.dev_info is None:
        thunk.dev_info = CudaNdarray.zeros((1,))

    workspace_ptr = thunk.workspace.gpudata
    pivots_ptr = thunk.pivots.gpudata
    dev_info_ptr = thunk.dev_info.gpudata

    cusolver.cusolverDnSgetrf(cusolver_handle, n, l, A_ptr, lda,
                              workspace_ptr, pivots_ptr, dev_info_ptr)

    cusolver.cusolverDnSgetrs(cusolver_handle, trans, n, m, A_ptr, lda,
                              pivots_ptr, b_ptr, ldb, dev_info_ptr)

    # Convert b to F-order from C-order and assign it to output.
    b_cpy = b_cpy.reshape(b.shape[::-1])
    b_cpy = dimshuffle(b_cpy, (1, 0))
    z[0] = b_cpy
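# Hedged reference check (not part of the original source): the cuSOLVER
# thunk above LU-factorises A with getrf and then solves the system with
# getrs; the transpose flag is flipped only to compensate for C- versus
# F-ordered storage, as the comment in the thunk notes. A host-side NumPy
# check of the plain (trans='N') case:
import numpy as np

def solve_reference(A, b):
    # A: (n, n) float32 matrix, b: (n, m) float32 right-hand sides.
    return np.linalg.solve(A, b).astype(np.float32)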
def test_cross_map_norm_simple():
    op = CrossMapNorm(16, 15. / 16., 1., True)
    x = CudaNdarray(numpy.ones((16, 2, 2, 2), dtype='float32'))
    x_ = theano.tensor.TensorVariable(CudaNdarrayType([False] * 4))
    f = theano.function([x_], op(x_)[0])
    numpy.testing.assert_allclose(f(x), 0.0625)
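# Hedged note (not part of the original source): the expected value 0.0625
# is consistent with a cuda-convnet style cross-map response normalisation
# out = x / (1 + add_scale * sum(x ** 2 over size_f maps)) ** pow_scale:
# with all-ones input, size_f = 16 and add_scale = 15. / 16., the
# denominator is (1 + (15. / 16.) * 16) ** 1 = 16, so every output entry
# is 1 / 16 = 0.0625.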