def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    """Check signed-int-to-float conversion (LLVM `sitofp`) in kernel code.

    The kernel casts int_data[0] to float and stores it into float_data[0];
    the test round-trips the buffers and checks the value arrived intact.

    Fix: removed a stale `# expected = pow(...)` comment and the unused
    int_data[1]/int_data[2] assignments copied from another test.
    """
    code = """
__global__ void myKernel(float *float_data, int *int_data) {
    float_data[0] = (float)int_data[0];
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *', 'int *']),
        num_clmems=2)['kernel']
    int_data[0] = 5
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, int_data_gpu, offset_type(0), offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    # (float)5 is exactly representable, so exact comparison is safe.
    assert float_data[0] == 5
def test_pow(context, q, float_data, float_data_gpu):
    """Check pow() in kernel code for positive, negative-base and
    negative-exponent cases.

    Bug fix: the third kernel statement wrote its result to data[5]
    (clobbering an input) although the test prints data[6]; it now writes
    to data[6] as the print statements expect.
    """
    code = """
__global__ void myKernel(float *data) {
    data[0] = pow(data[1], data[2]);
    data[3] = pow(data[4], data[5]);
    data[6] = pow(data[7], data[8]);
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *']),
        num_clmems=1)['kernel']
    float_data[1] = 1.5
    float_data[2] = 4.6
    float_data[4] = -1.5  # negative base, fractional exponent
    float_data[5] = 4.6
    float_data[7] = 1.5
    float_data[8] = -4.6  # negative exponent
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[3]', float_data[3])
    print('float_data[6]', float_data[6])
    expected = pow(float_data[1], float_data[2])
    assert abs(float_data[0] - expected) <= 1e-4
def test_sqrt(context, q, float_data, float_data_gpu):
    """Each thread takes the square root of its own element.

    Non-negative inputs must match math.sqrt to within 1e-4; negative
    inputs must come back as NaN.
    """
    code = """
__global__ void myKernel(float *data) {
    data[threadIdx.x] = sqrt(data[threadIdx.x]);
}
"""
    mangled = test_common.mangle('myKernel', ['float *'])
    kernel = test_common.compile_code_v3(cl, context, code, mangled, num_clmems=1)['kernel']
    for idx, value in enumerate([1.5, 4.6, -1.5, 0]):
        float_data[idx] = value
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:4]', float_data[:4])
    for idx in range(4):
        original = float_data_orig[idx]
        if original >= 0:
            assert abs(float_data[idx] - math.sqrt(original)) <= 1e-4
        else:
            # sqrt of a negative float is NaN
            assert math.isnan(float_data[idx])
def test_return(context):
    """Check the address space of a function that returns a global pointer.

    Fixes: docstring typo ("taht"); local variable renamed from `dict`,
    which shadowed the builtin.
    """
    source = """
__device__ float *myfunc();

__device__ float *foo(float *in) {
    if(in[0] > 0) {
        return in + 10;
    } else {
        return in + 27;
    }
}

__global__ void mykernel(float *a) {
    foo(a);
}
"""
    kernelName = test_common.mangle('mykernel', ['float*'])
    compile_result = test_common.compile_code_v3(cl, context, source, kernelName, num_clmems=1)
    # prog = compile_result['prog']
    cl_sourcecode = compile_result['cl_sourcecode']
    print('cl_sourcecode', cl_sourcecode)
def test_singlebuffer_sqrt_cocl(context, queue):
    """Exercise one large buffer used as both source and destination,
    with the source and destination regions at different byte offsets
    inside it."""
    code = """
__global__ void myKernel(float *data0, float *data1, int N) {
    if(threadIdx.x < N) {
        data0[threadIdx.x] = sqrt(data1[threadIdx.x]);
    }
}
"""
    mangledName = '_Z8myKernelPfS_i'
    kernel = test_common.compile_code_v3(cl, context, code, mangledName, num_clmems=2)['kernel']

    N = 10
    # Inputs shifted into [1, 2) so the sqrt result is non-trivial.
    source_values = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dest_values = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256
    big_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    test_common.enqueue_write_buffer_ext(
        cl, queue, big_buffer, source_values, device_offset=src_offset, size=N * 4)

    global_size = 256
    workgroup_size = 256
    # Same buffer passed for both kernel pointers; the per-argument
    # offsets point the two pointers at different regions inside it.
    kernel(
        queue, (global_size,), (workgroup_size,),
        big_buffer, offset_type(0),
        big_buffer, offset_type(0),
        offset_type(dst_offset), offset_type(src_offset),
        np.int32(N), cl.LocalMemory(4))
    queue.finish()
    test_common.enqueue_read_buffer_ext(
        cl, queue, big_buffer, dest_values, device_offset=dst_offset, size=N * 4)
    queue.finish()

    print('src_host', source_values)
    print('dst_host', dest_values)
    print('np.sqrt(src_host)', np.sqrt(source_values))
    assert np.abs(np.sqrt(source_values) - dest_values).max() <= 1e-4
def test_structs(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    """Pass an array of structs into a kernel and read scalar members back
    out through separate int/float buffers."""
    code = """
struct MyStruct {
    int myint;
    float myfloat;
};

__global__ void testStructs(MyStruct *structs, float *float_data, int *int_data) {
    int_data[0] = structs[0].myint;
    float_data[0] = structs[0].myfloat;
    float_data[1] = structs[1].myfloat;
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('testStructs', ['MyStruct *', 'float *', 'int *']),
        num_clmems=3)['kernel']
    # NOTE(review): field order here looks reversed relative to what one
    # might expect; this ordering was observed wrong-way-around on HD5500
    # but works on 940M (per the original author's comment).
    struct_dtype = np.dtype([
        ("myint", np.int32),
        ("myfloat", np.float32),
    ])
    struct_dtype, struct_c_decl = pyopencl.tools.match_dtype_to_c_struct(
        context.devices[0], "MyStruct", struct_dtype)
    struct_dtype = cl.tools.get_or_register_dtype("MyStruct", struct_dtype)

    host_structs = np.empty(2, struct_dtype)
    host_structs[0]['myint'] = 123
    host_structs[0]['myfloat'] = 567
    host_structs[1]['myint'] = 33
    host_structs[1]['myfloat'] = 44
    gpu_structs = cl.array.to_device(q, host_structs)

    kernel(q, (32,), (32,),
           gpu_structs.data, offset_type(0),
           float_data_gpu, offset_type(0),
           int_data_gpu, offset_type(0),
           offset_type(0), offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()

    print('int_data[0]', int_data[0])
    print('int_data[1]', int_data[1])
    print('float_data[0]', float_data[0])
    print('float_data[1]', float_data[1])
    assert int_data[0] == 123
    assert float_data[0] == 567
    assert float_data[1] == 44
def test_nested_for(context, q, float_data, float_data_gpu):
    """Check nested for-loops inside a __device__ function: the kernel's
    accumulated sum must match the same loops run host-side.

    Fix: local variable renamed from `sum`, which shadowed the builtin.
    """
    source = """
__device__ void myfunc(float *data, int a, int b) {
    data[1] = data[2];
    float sum = 0;
    for(int i = 0; i < a; i++) {
        for(int j = 0; j < b; j++) {
            sum += data[i * 17 + j * 7];
        }
    }
    data[0] = sum;
}

__global__ void mykernel(float *data, int a, int b) {
    myfunc(data, a, b);
}
"""
    kernelName = test_common.mangle('mykernel', ['float *', 'int', 'int'])
    kernel = test_common.compile_code_v3(cl, context, source, kernelName, num_clmems=1)['kernel']
    float_data_orig = np.copy(float_data)

    a = 2
    b = 3
    kernel(q, (32,), (32,),
           float_data_gpu, offset_type(0),
           np.int32(a), np.int32(b),
           cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)

    # Replicate the kernel's nested loops on the host.
    expected_sum = 0
    for i in range(a):
        for j in range(b):
            expected_sum += float_data_orig[i * 17 + j * 7]
    print('float_data_orig', float_data_orig[:3])
    print('float_data', float_data[:3])
    assert float_data[1] == float_data_orig[2]
    assert abs(float_data[0] - expected_sum) <= 1e-4
def test_clz(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    """Check __clz (count leading zeros) on a 32-bit int.

    Bug fix: the original test printed the result but asserted nothing.
    15 is 0b1111, so __clz on a 32-bit value must return 28.
    """
    code = """
__global__ void myKernel(int *int_data) {
    int_data[0] = __clz(int_data[1]);
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['int *']),
        num_clmems=1)['kernel']
    int_data[1] = 15
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        int_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('int_data[:2]', int_data[:2])
    # 32-bit __clz(0b1111) == 32 - 4 == 28
    assert int_data[0] == 28
def test_floatconstants(context, q, float_data, float_data_gpu):
    """Check that very large float constants and +/-INFINITY survive
    compilation and arrive in the output buffer intact.

    data[4]/data[5] deliberately take the float4's .w and .z members, so
    they receive -INFINITY and INFINITY respectively.
    """
    code = """
__device__ float4 getvals() {
    return make_float4(0xFFF0000000000000, 0x7FF0000000000000, INFINITY, -INFINITY);
}

__global__ void myKernel(float *data) {
    data[0] = 18442240474082181120.0f; // 0xFFF0000000000000
    data[1] = 9218868437227405312.0f; // 0x7FF0000000000000
    float4 vals = getvals();
    data[2] = vals.x;
    data[3] = vals.y;
    data[4] = vals.w;
    data[5] = vals.z;
    data[6] = INFINITY;
    data[7] = -INFINITY;
    // data[8] = 0xFFEFFFFFFFFFFFFF;
}
"""
    mangled = test_common.mangle('myKernel', ['float *'])
    kernel = test_common.compile_code_v3(cl, context, code, mangled, num_clmems=1)['kernel']
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    for idx in range(8):
        print('float_data[%s]' % idx, float_data[idx])
    # The first four slots hold huge magnitudes (hex literals / vals.x/.y).
    for idx in range(4):
        assert float_data[idx] > 100000000
    assert float_data[4] == -np.inf
    assert float_data[5] == np.inf
    assert float_data[6] == np.inf
    assert float_data[7] == -np.inf
def test_fptosi(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    """Check float-to-signed-int conversion (LLVM `fptosi`): the cast
    truncates toward zero, so (int)4.7 == 4.

    Fixes: pass num_clmems=2 to compile_code_v3 (the kernel takes two
    buffers; every other test in this file passes num_clmems); removed a
    stale pow() comment and unused float_data[1]/[2] assignments copied
    from test_pow.
    """
    code = """
__global__ void myKernel(float *float_data, int *int_data) {
    int_data[0] = (int)float_data[0];
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *', 'int *']),
        num_clmems=2)['kernel']
    float_data[0] = 4.7
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), int_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('int_data[0]', int_data[0])
    assert int_data[0] == 4