global_size = (NUM_POINTS // points_per_group) * local_size
print('Global size: ' + str(global_size))
print('Local size: ' + str(local_size))

kernel_init(queue, (global_size,), (local_size,),
            data_buffer, local_buffer, points_per_group,
            np.uint32(NUM_POINTS), direction)

# There is some overhead involved with spawning a new kernel (code caching).
# A good rule of thumb is therefore to create the kernel object outside of loops.
# Ref: https://lists.tiker.net/pipermail/pyopencl/2016-February/002107.html
kernel_stage = prog.fft_stage

# Enqueue further stages of the FFT
if NUM_POINTS > points_per_group:
    # for(stage = 2; stage <= num_points/points_per_group; stage <<= 1)
    for stage in utility.range_bitwise_shift(low=2, high=NUM_POINTS // points_per_group + 1, n=1):
        print('Stage: ' + str(stage))
        # fft_stage(__global float2* g_data, uint stage, uint points_per_group, int dir)
        kernel_stage(queue, (global_size,), (local_size,),
                     data_buffer, np.uint32(stage), points_per_group, direction)

# Scale values if performing the inverse FFT
if not FORWARD_FFT:
    # fft_scale(__global float2* g_data, uint points_per_group, uint scale)
    prog.fft_scale(queue, (global_size,), (local_size,),
                   data_buffer, points_per_group, np.uint32(NUM_POINTS))

# Read results
cl.enqueue_copy(queue, dest=output_data, src=data_buffer, is_blocking=True)

# Change to array of complex values
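# Note: utility.range_bitwise_shift is defined elsewhere in this project.
# Judging from the C-style loop comments above, a plausible implementation
# (a sketch under that assumption, not the project's actual helper) is:
def range_bitwise_shift(low, high, n):
    """Loop counter that is bit-shifted by |n| each iteration.

    n > 0: yields low, low << n, ... while the value is below high,
           mirroring  for(i = low; i < high; i <<= n).
    n < 0: yields high, high >> -n, ... while the value is above low,
           mirroring  for(i = high; i > low; i >>= -n).
    """
    if n > 0:
        value = low
        while value < high:
            yield value
            value <<= n
    else:
        value = high
        while value > low:
            yield value
            value >>= -n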
local_size = (local_size,)

# Enqueue initial sorting kernel
# bsort_init(__global float4 *g_data, __local float4 *l_data)
prog.bsort_init(queue, global_size, local_size, data_buffer, local_buffer)

# There is some overhead involved with spawning a new kernel (code caching).
# A good rule of thumb is therefore to create the kernel object outside of loops.
# Ref: https://lists.tiker.net/pipermail/pyopencl/2016-February/002107.html
kernel_stage_n = prog.bsort_stage_n
kernel_stage_0 = prog.bsort_stage_0
kernel_merge = prog.bsort_merge

# Enqueue further stages
num_stages = global_size[0] // local_size[0]
for high_stage in utility.range_bitwise_shift(low=2, high=num_stages, n=1):
    for stage in utility.range_bitwise_shift(low=1, high=high_stage, n=-1):
        # bsort_stage_n(__global float4 *g_data, __local float4 *l_data, uint stage, uint high_stage)
        kernel_stage_n(queue, global_size, local_size, data_buffer, local_buffer,
                       np.uint32(stage), np.uint32(high_stage))
    # bsort_stage_0(__global float4 *g_data, __local float4 *l_data, uint high_stage)
    kernel_stage_0(queue, global_size, local_size, data_buffer, local_buffer,
                   np.uint32(high_stage))

# Perform the bitonic merge
for stage in utility.range_bitwise_shift(low=1, high=num_stages, n=-1):
    # bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir)
    kernel_merge(queue, global_size, local_size, data_buffer, local_buffer,
                 np.uint32(stage), direction)

# bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir)
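# For reference, a minimal sketch of the setup the bsort_* calls above assume.
# Every name and size below is an assumption for illustration (the real values
# come from earlier in this script), so the block is left commented out:
#
#   import numpy as np
#   import pyopencl as cl
#
#   context = cl.create_some_context()
#   queue = cl.CommandQueue(context)
#   prog = cl.Program(context, kernel_source).build()  # kernel_source: the .cl file text
#
#   mf = cl.mem_flags
#   input_data = np.random.rand(num_floats).astype(np.float32)  # num_floats: hypothetical
#   data_buffer = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=input_data)
#   wg_size = 128  # hypothetical work-group size
#   local_buffer = cl.LocalMemory(8 * wg_size * np.dtype(np.float32).itemsize)
#   direction = np.int32(0)  # sort-order flag passed to the merge kernels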