def thunk():
    input_shape = inputs[0][0].shape
    s = inputs[1][0]

    # Since padding is not supported, assert s matches input shape.
    # assert (input_shape[1:-1] == s).all()
    assert (input_shape[-3:-1] == s).all()

    output_shape = input_shape

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                           dtype='float32')

    input_pycuda = inputs[0][0]
    output_pycuda = z[0]

    with input_pycuda.context:
        # only initialise plan if necessary
        if plan[0] is None or plan_input_shape[0] != input_shape:
            plan_input_shape[0] = input_shape
            plan[0] = fft.Plan(s, np.complex64, np.complex64,
                               batch=np.prod(input_shape[:-3]))

        # Sync GPU variables before computation
        input_pycuda.sync()
        output_pycuda.sync()

        fft.fft(input_pycuda, output_pycuda, plan[0])

        # Sync results to ensure output contains completed computation
        pycuda.driver.Context.synchronize()
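For reference, here is a minimal NumPy sketch (a hypothetical check written for this explanation, not part of the op) of what the batched 2-D complex-to-complex plan above computes: the trailing length-2 float32 axis is interpreted as interleaved (real, imag) pairs, and the FFT runs over the last two complex axes once per leading batch entry.

import numpy as np

# Hypothetical reference computation (an assumption, not from the op):
# view the trailing length-2 float32 axis as complex64, then transform
# over the last two complex dimensions, batched over the leading ones.
x = np.random.randn(3, 4, 8, 8, 2).astype(np.float32)  # (..., n1, n2, 2)
xc = x.view(np.complex64)[..., 0]                       # (..., n1, n2) complex64
expected = np.fft.fft2(xc, axes=(-2, -1))               # per-batch 2-D c2c FFT
# the batch count matches np.prod(x.shape[:-3]) == 3 * 4 == 12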
def thunk():
    input_shape = inputs[0][0].shape
    s = inputs[1][0]

    # Since padding is not supported, assert s matches input shape.
    # assert (input_shape[1:-1] == s).all()
    assert (input_shape[1:-1] == s[:-1]).all()

    # # construct output shape
    # output_shape = [input_shape[0]] + list(s)
    # # DFT of real input is symmetric, no need to store
    # # redundant coefficients
    # output_shape[-1] = output_shape[-1] // 2 + 1
    # # extra dimension with length 2 for real/imag
    # output_shape += [2]
    # output_shape = tuple(output_shape)

    # Output is the same shape as the input (m, ..., n, 2)
    output_shape = input_shape

    z = outputs[0]

    # only allocate if there is no previous allocation of the
    # right size.
    if z[0] is None or z[0].shape != output_shape:
        z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                           dtype='float32')

    input_pycuda = inputs[0][0]
    # I thought we'd need to change the type on output_pycuda
    # so it is complex64, but as it turns out skcuda.fft
    # doesn't really care either way and treats the array as
    # if it is complex64 anyway.
    output_pycuda = z[0]

    with input_pycuda.context:
        # only initialise plan if necessary
        if plan[0] is None or plan_input_shape[0] != input_shape:
            plan_input_shape[0] = input_shape
            plan[0] = fft.Plan(s, np.complex64, np.complex64,
                               batch=input_shape[0])

        # Sync GPU variables before computation
        input_pycuda.sync()
        output_pycuda.sync()

        fft.fft(input_pycuda, output_pycuda, plan[0])

        # Sync results to ensure output contains completed computation
        pycuda.driver.Context.synchronize()
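A minimal sketch (NumPy only, independent of skcuda) of why the float32 output with a trailing length-2 axis is interchangeable with complex64, as the comment on output_pycuda notes: the two layouts share memory byte-for-byte, so cuFFT can read and write the float32 buffer as complex data.

import numpy as np

# A complex64 array and a (..., 2) float32 array describe the same
# bytes: each complex value is an interleaved (real, imag) float32 pair.
z = (np.arange(4) + 1j * np.arange(4, 8)).astype(np.complex64)
as_float = z.view(np.float32).reshape(-1, 2)  # shape (4, 2)
assert np.allclose(as_float[:, 0], z.real)    # even slots: real parts
assert np.allclose(as_float[:, 1], z.imag)    # odd slots: imaginary parts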