def test_ternary(context, q, float_data, float_data_gpu):
    """Check that the CUDA ternary operator compiles and selects correctly.

    Runs testTernary twice: once with data[1] == 10 (> 0, expects data[2])
    and once with data[1] == -2 (<= 0, expects data[3]).

    BUG FIX: the inner set_float_value helper previously ignored its
    gpu_buffer argument and always wrote to the closed-over float_data_gpu;
    it now uses the parameter (the caller passes float_data_gpu, so observed
    behavior is unchanged, but the helper is no longer misleading).
    """
    kernelSource = """
__global__ void setValue(float *data, int idx, float value) {
    if(threadIdx.x == 0) {
        data[idx] = value;
    }
}

__global__ void testTernary(float *data) {
    data[0] = data[1] > 0 ? data[2] : data[3];
}
"""
    setValueKernelName = test_common.mangle('setValue', ['float *', 'int', 'float'])
    setValueProg = compile_code(cl, context, kernelSource, setValueKernelName, num_clmems=1)
    testTernaryName = test_common.mangle('testTernary', ['float *'])
    testTernaryProg = compile_code(cl, context, kernelSource, testTernaryName, num_clmems=1)
    float_data_orig = np.copy(float_data)

    def set_float_value(gpu_buffer, idx, value):
        # Write `value` into gpu_buffer[idx] via the setValue kernel.
        setValueProg.__getattr__(setValueKernelName)(
            q, (32, ), (32, ),
            gpu_buffer, offset_type(0), np.int32(idx), np.float32(value),
            cl.LocalMemory(4))

    cl.enqueue_copy(q, float_data_gpu, float_data)
    print('float_data[:8]', float_data[:8])
    set_float_value(float_data_gpu, 1, 10)
    testTernaryProg.__getattr__(testTernaryName)(
        q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    # data[1] == 10 > 0 -> ternary selects data[2]
    assert float_data[0] == float_data_orig[2]

    set_float_value(float_data_gpu, 1, -2)
    testTernaryProg.__getattr__(testTernaryName)(
        q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    # data[1] == -2 <= 0 -> ternary selects data[3]
    assert float_data[0] == float_data_orig[3]
def test_sincos(context, q, float_data, float_data_gpu):
    """sincosf should compute sine and cosine in one call; results are
    compared against Python's math.sin / math.cos."""
    cu_code = """
__global__ void mykernel(float *data) {
    sincosf(0.1, &data[0], &data[1]);
    sincosf(data[2], &data[3], &data[4]);
}
"""
    kernel_name = test_common.mangle('mykernel', ['float*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    print('cl_code', cl_code)
    float_data[2] = -0.3
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:5])
    # (actual, expected) pairs for both the constant and the variable input
    checks = (
        (float_data[0], math.sin(0.1)),
        (float_data[1], math.cos(0.1)),
        (float_data[3], math.sin(float_data_orig[2])),
        (float_data[4], math.cos(float_data_orig[2])),
    )
    for actual, expected in checks:
        assert abs(actual - expected) < 1e-4
def test_insertvalue(context, q, float_data, float_data_gpu):
    """A struct passed and returned by value through a __device__ function
    (exercises LLVM insertvalue). Compile/build only: the kernel is never
    launched and nothing is asserted."""
    sourcecode = """
struct mystruct {
    int f0;
    float f1;
};

__device__ struct mystruct doSomething(struct mystruct foo, int somevalue);

__device__ struct mystruct doSomething(struct mystruct foo, int somevalue) {
    foo.f0 = somevalue;
    foo.f1 = 4.5f;
    return foo;
}

__global__ void somekernel(float *data) {
    struct mystruct foo;
    foo.f0 = 3;
    foo.f1 = 4.5;
    foo = doSomething(foo, data[2]);
    data[0] = (int)foo.f0;
    data[1] = foo.f1;
}
"""
    mangled = test_common.mangle('somekernel', ['float *'])
    generated_cl = test_common.cu_to_cl(sourcecode, mangled)
    test_common.build_kernel(context, generated_cl, mangled)
def test_short_names(context):
    """Kernels calling device functions with very short names should
    transpile and build; the build uses the mangled kernel name truncated
    to 31 characters."""
    cu_source = """
__device__ void funca(float *d) {
    d[1] = 1.0f;
}
__device__ void funcb(float *d, int c) {
    d[2] = 3.0f + 5 - d[c];
}
__global__ void funck(float *data) {
    data[0] = 123.0f;
    funca(data);
    funcb(data, (int)data[6]);
    for(int i = 0; i < 1000; i++) {
        funcb(data + i, (int)data[i + 100]);
    }
}
"""
    mangled_name = test_common.mangle('funck', ['float *'])
    generated = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', generated)
    test_common.build_kernel(context, generated, mangled_name[:31])
def test_ieeefloats(context, q, float_data, float_data_gpu):
    """INFINITY literals (both double and float) must survive the
    CUDA -> OpenCL translation."""
    cu_code = """
__global__ void mykernel(double *data) {
    double d_neginfinity = -INFINITY;
    double d_posinfinity = INFINITY;
    float f_neginfinity = -INFINITY;
    float f_posinfinity = INFINITY;
    data[0] = INFINITY;
    data[1] = -INFINITY;
    data[2] = f_neginfinity;
    data[3] = f_posinfinity;
}
"""
    kernel_name = test_common.mangle('mykernel', ['double*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,), float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:4])
    expected_values = (np.inf, -np.inf, -np.inf, np.inf)
    for idx, expected in enumerate(expected_values):
        assert float_data[idx] == expected
def test_pow(context, q, float_data, float_data_gpu):
    """pow() with positive base, negative base, and negative exponent.

    BUG FIX: the third kernel statement previously wrote its result into
    data[5], clobbering an input of the second statement (and racy across
    the 32 unsynchronized threads, since another thread could overwrite
    data[5] before this thread reads it), while the host printed data[6],
    which nothing had written. The kernel now stores pow(data[7], data[8])
    into data[6], matching the host-side print.
    """
    code = """
__global__ void myKernel(float *data) {
    data[0] = pow(data[1], data[2]);
    data[3] = pow(data[4], data[5]);
    data[6] = pow(data[7], data[8]);
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    float_data[1] = 1.5
    float_data[2] = 4.6
    float_data[4] = -1.5   # negative base
    float_data[5] = 4.6
    float_data[7] = 1.5
    float_data[8] = -4.6   # negative exponent
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,), float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[3]', float_data[3])
    print('float_data[6]', float_data[6])
    # Only the positive-base case is asserted; the others are printed for
    # manual inspection (their device results are implementation-defined).
    expected = pow(float_data[1], float_data[2])
    assert abs(float_data[0] - expected) <= 1e-4
def test_return(context):
    """Check the address space of a function that returns a global pointer.

    Only inspects the generated OpenCL source; nothing is executed.
    """
    source = """
__device__ float *myfunc();

__device__ float *foo(float *in) {
    if(in[0] > 0) {
        return in + 10;
    } else {
        return in + 27;
    }
}

__global__ void mykernel(float *a) {
    foo(a);
}
"""
    kernelName = test_common.mangle('mykernel', ['float*'])
    compiled = test_common.compile_code_v3(cl, context, source, kernelName, num_clmems=1)
    print('cl_sourcecode', compiled['cl_sourcecode'])
def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    """Signed int -> float cast (LLVM sitofp)."""
    code = """
__global__ void myKernel(float *float_data, int *int_data) {
    float_data[0] = (float)int_data[0];
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *', 'int *']),
        num_clmems=2)['kernel']
    int_data[0], int_data[1], int_data[2] = 5, 2, 4
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, int_data_gpu, offset_type(0), offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    assert float_data[0] == 5
def test_inlining(context, q, float_data, float_data_gpu):
    """Nested arithmetic expressions on the device; the first result is
    checked against the same expression evaluated on the host."""
    cu_source = """
__global__ void myKernel(float *data) {
    data[0] = (data[3] * (data[1] + data[2])) / data[4];
    data[7] = (data[3] / (data[1] - data[2])) * data[4];
}
"""
    kernelName = test_common.mangle('myKernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
    # Fill the first ten slots with 3..12 so every operand is non-zero.
    for idx in range(10):
        float_data[idx] = idx + 3
    cl.enqueue_copy(q, float_data_gpu, float_data)
    q.finish()
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    gpu_result = np.zeros((1024, ), dtype=np.float32)
    cl.enqueue_copy(q, gpu_result, float_data_gpu)
    q.finish()
    print('float_data2[0]', gpu_result[0])
    d = float_data
    expect = (d[3] * (d[1] + d[2])) / d[4]
    assert abs(gpu_result[0] - expect) < 1e-5
def test_int_expressions(context, q, int_data, int_data_gpu):
    """(a | b) == c should evaluate as an integer expression on the device
    and match the host-side result."""
    cu_source = """
__global__ void myKernel(int *data) {
    data[0] = (data[10] | data[11]) == data[12];
}
"""
    kernelName = test_common.mangle('myKernel', ['int *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
    int_data[:50] = 0
    int_data[10], int_data[11], int_data[12] = 2, 1, 1
    cl.enqueue_copy(q, int_data_gpu, int_data)
    q.finish()
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    gpu_data = np.zeros((1024, ), dtype=np.int32)
    cl.enqueue_copy(q, gpu_data, int_data_gpu)
    q.finish()
    actual = gpu_data[0]
    expected = int((int_data[10] | int_data[11]) == int_data[12])
    print('actual', actual, 'expected', expected)
    assert actual == expected
def set_float_value(gpu_buffer, idx, value):
    # Write `value` into element `idx` of the GPU float buffer via the
    # 'setValue' CUDA kernel.
    # NOTE(review): this module-level helper depends on `testcudakernel1`,
    # `q` and `float_data_gpu` existing as globals (not visible in this
    # chunk), and it ignores its `gpu_buffer` argument, always writing to
    # `float_data_gpu` instead — looks like a leftover from a fixture-based
    # test; confirm it is still referenced before relying on it.
    testcudakernel1.__getattr__(
        test_common.mangle('setValue', ['float *', 'int', 'float']))(q, (32, ), (32, ), float_data_gpu, np.int32(idx), np.float32(value))
def test_test_for(context, q, float_data, float_data_gpu):
    """A device-side for loop summing the first N elements into data[0]."""
    sourcecode = """
__global__ void testFor(float *data, int N) {
    if(threadIdx.x == 0) {
        float sum = 0.0f;
        for(int i = 0; i < N; i++) {
            sum += data[i];
        }
        data[0] = sum;
    }
}
"""
    kernelName = test_common.mangle('testFor', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 4
    prog.__getattr__(kernelName)(
        q, (32, ), (32, ),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        print('cl_code', f.read())
    expected = sum(float_data_orig[i] for i in range(N))
    assert abs(float_data[0] - expected) <= 1e-4
def test_test_if_else(context, q, float_data, float_data_gpu):
    """if/else on the thread id: threads below N double their element,
    the rest add 5."""
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    if(tid < N) {
        data[tid] *= 2;
    } else {
        data[tid] += 5;
    }
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    before = np.copy(float_data)
    N = 2
    prog.__getattr__(kernelName)(
        q, (32, ), (32, ),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        print('cl_code', f.read())
    for tid in range(10):
        if tid < N:
            assert float_data[tid] == before[tid] * 2
        else:
            assert abs(float_data[tid] - before[tid] - 5) <= 1e-4
def test_sqrt(context, q, float_data, float_data_gpu):
    """sqrt on the device: non-negative inputs match math.sqrt; a negative
    input must come back NaN."""
    code = """
__global__ void myKernel(float *data) {
    data[threadIdx.x] = sqrt(data[threadIdx.x]);
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    inputs = (1.5, 4.6, -1.5, 0)
    for idx, value in enumerate(inputs):
        float_data[idx] = value
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,), float_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:4]', float_data[:4])
    for idx in range(len(inputs)):
        original = float_data_orig[idx]
        if original >= 0:
            assert abs(float_data[idx] - math.sqrt(original)) <= 1e-4
        else:
            assert math.isnan(float_data[idx])
def test_test_do_while(context, q, float_data, float_data_gpu):
    """do/while support: the kernel accumulates data[i] until the running
    sum reaches 15, then stores the sum in data[0].

    IMPROVEMENT: this test previously only printed the result and asserted
    nothing. It now replays the kernel's do/while loop on the host in
    float32 (to match device arithmetic) and asserts the kernel produced
    the same sum.
    """
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    int i = 0;
    float sum = 0;
    do {
        sum += data[i];
        i++;
    } while(sum < 15);
    data[0] = sum;
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 2
    prog.__getattr__(kernelName)(
        q, (32, ), (32, ),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)
    print(float_data[0])
    # Host-side replay of the kernel's do/while loop, in float32.  The GPU
    # loop terminated, so this loop terminates on the same data.
    expected = np.float32(0)
    i = 0
    while True:
        expected += np.float32(float_data_orig[i])
        i += 1
        if not (expected < 15):
            break
    assert abs(float_data[0] - expected) <= 1e-4
def test_alloca(context, q, float_data, float_data_gpu):
    # Pointer aliasing through a local variable (exercises LLVM alloca).
    # Run-only smoke test: the kernel is launched but the result buffer is
    # never read back or asserted.
    code = """
__global__ void mykernel(float *data) {
    float *foo = data;
    foo[0] = data[1] + data[2] + data[3];
}
"""
    kernelName = test_common.mangle('mykernel', ['float *'])
    try:
        dict = test_common.compile_code_v2(cl, context, code, kernelName, num_clmems=1)
        prog = dict['prog']
        cl_sourcecode = dict['cl_sourcecode']
        print('cl_sourcecode', cl_sourcecode)
    except Exception as e:
        # On a compile failure, dump the generated device code for debugging
        # before re-raising.
        with open('/tmp/testprog-device.cl', 'r') as f:
            print(f.read())
        raise e
    print('type(offset_type(0))', type(offset_type(0)))
    # NOTE(review): two offset_type(0) arguments are passed here even though
    # num_clmems=1; sibling tests with num_clmems=1 pass a single offset —
    # confirm against compile_code_v2's calling convention.
    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
def test_copy_float(cuSourcecode, context, q, float_data, float_data_gpu):
    """The copy_float kernel from the shared CUDA source should copy
    element 1 into element 0."""
    kernelName = test_common.mangle('copy_float', ['float *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    prog.__getattr__(kernelName)(
        q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert float_data[0] == float_data[1]
def test_use_template1(context, q, int_data, int_data_gpu, float_data, float_data_gpu):
    """A templated __device__ function instantiated for both float and int
    from the same kernel."""
    code = """
template< typename T >
__device__ T addNumbers(T one, T two) {
    return one + two;
}

__global__ void use_template1(float *data, int *intdata) {
    if(threadIdx.x == 0 && blockIdx.x == 0) {
        data[0] = addNumbers(data[1], data[2]);
        intdata[0] = addNumbers(intdata[1], intdata[2]);
    }
}
"""
    kernelName = test_common.mangle('use_template1', ['float *', 'int *'])
    prog = compile_code(cl, context, code, kernelName)
    floats_before = np.copy(float_data)
    ints_before = np.copy(int_data)
    prog.__getattr__(kernelName)(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), int_data_gpu, offset_type(0),
        cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    assert float_data[0] == floats_before[1] + floats_before[2]
    assert int_data[0] == ints_before[1] + ints_before[2]
def test_long_conflicting_names(context, q):
    """Very long function names that only differ near the end must be
    shortened without colliding: every function name emitted in the
    generated OpenCL is checked to be at most 32 characters, and the kernel
    builds under the truncated mangled name."""
    cu_source = """
__device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(float *d) {
    d[1] = 1.0f;
}

__device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(float *d) {
    d[2] = 3.0f;
}

__global__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec(float *data) {
    data[0] = 123.0f;
    mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(data);
    mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(data);
}
"""
    mangled_name = test_common.mangle('mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec', ['float *'])
    cl_source = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', cl_source)
    # Scan the generated source for function definitions and verify every
    # emitted name fits in 32 characters.
    for line in cl_source.split("\n"):
        stripped = line.strip()
        if stripped.startswith('/*'):
            continue
        if not stripped.replace('kernel ', '').strip().startswith('void'):
            continue
        name = line.replace('kernel ', '').replace('void ', '').split('(')[0]
        if name != '':
            print('name', name)
            assert len(name) <= 32
    test_common.build_kernel(context, cl_source, mangled_name[:31])
def test_float4(testcudakernel1, ctx, q, float_data, float_data_gpu):
    """float4 vector access via the pre-built fixture kernel.

    NOTE: another test_float4 defined later in this module shadows this one
    at import time.
    """
    before = np.copy(float_data)
    kernel_name = test_common.mangle('testFloat4', ['float4 *'])
    testcudakernel1.__getattr__(kernel_name)(q, (32, ), (32, ), float_data_gpu)
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data_orig[:8]', before[:8])
    print('float_data[:8]', float_data[:8])
    assert float_data[1] == before[4 + 2] * before[4 + 3]
def test_testFor(testcudakernel1, q, float_data, float_data_gpu):
    """testFor kernel: data[0] becomes the sum of the first 32 elements."""
    before = np.copy(float_data)
    kernel_name = test_common.mangle('testFor', ['float *', 'int'])
    testcudakernel1.__getattr__(kernel_name)(
        q, (32, ), (32, ), float_data_gpu, np.int32(32))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert abs(float_data[0] - sum(before[0:32])) < 1e-4
def test_use_tid2(cuSourcecode, context, q, int_data, int_data_gpu):
    """use_tid2 kernel: each thread adds its thread id to its own element.

    NOTE: another test_use_tid2 defined later in this module shadows this
    one at import time.
    """
    before = np.copy(int_data)
    kernelName = test_common.mangle('use_tid2', ['int *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    prog.__getattr__(kernelName)(
        q, (32,), (32,), int_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    for tid in (0, 10, 31):
        assert int_data[tid] == before[tid] + tid
def test_structs(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    # End-to-end struct round trip driven through the `cocl` command-line
    # tool (unlike the other test_structs variant below, which uses
    # compile_code_v3).
    # NOTE(review): this function shares its name with another test_structs
    # later in this module; the later definition shadows this one at import
    # time, so pytest only collects one of them — confirm intent.
    code = """
struct MyStruct {
    int x;
    float y;
};

__global__ void testStructs(MyStruct *structs, float *float_data, int *int_data) {
    int_data[0] = structs[0].x;
    float_data[0] = structs[0].y;
    float_data[1] = structs[1].y;
}
"""
    # Remove stale outputs from earlier runs so the .cl file read below is
    # guaranteed to come from this cocl invocation.
    for file in os.listdir('/tmp'):
        if file.startswith('test_cloutput'):
            os.unlink('/tmp/%s' % file)
    with open('/tmp/test_cloutput.cu', 'w') as f:
        f.write(code)
    print(subprocess.check_output(['cocl', '-c', '/tmp/test_cloutput.cu']))
    with open('/tmp/test_cloutput-device.cl', 'r') as f:
        sourcecode = f.read()
    prog = cl.Program(context, sourcecode).build()
    # my_struct = np.dtype([("x", np.float32), ("y", np.int32)])
    # I dont know why, but seems these are back to front...
    my_struct = np.dtype([("x", np.int32), ("y", np.float32)])
    # Let pyopencl pad/align the dtype to match the device-side struct
    # layout, then register it under the same name the kernel uses.
    my_struct, my_struct_c_decl = pyopencl.tools.match_dtype_to_c_struct(
        context.devices[0], "MyStruct", my_struct)
    my_struct = cl.tools.get_or_register_dtype("MyStruct", my_struct)
    structs = np.empty(2, my_struct)
    structs[0]['x'] = 123
    structs[0]['y'] = 567
    structs[1]['x'] = 33
    structs[1]['y'] = 44
    structs_gpu = cl.array.to_device(q, structs)
    # p = structs_gpu.map_to_host(q)
    # print('p', p)
    # q.finish()
    prog.__getattr__(
        test_common.mangle('testStructs', ['MyStruct *', 'float *', 'int *']))(
        q, (32, ), (32, ), structs_gpu.data, float_data_gpu, int_data_gpu)
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[1]', float_data[1])
    print('int_data[0]', int_data[0])
    print('int_data[1]', int_data[1])
    # Values written into the structs above must come back through the
    # float/int output buffers unchanged.
    assert float_data[0] == 567
    assert float_data[1] == 44
    assert int_data[0] == 123
def test_float4(cuSourcecode, context, ctx, q, float_data, float_data_gpu):
    """float4 vector access, compiling the shared CUDA source directly."""
    before = np.copy(float_data)
    kernelName = test_common.mangle('testFloat4', ['float4 *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    prog.__getattr__(kernelName)(
        q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data_orig[:8]', before[:8])
    print('float_data[:8]', float_data[:8])
    assert float_data[1] == before[4 + 2] * before[4 + 3]
def test_use_tid2(testcudakernel1, q, int_data, int_data_gpu):
    """use_tid2 via the pre-built fixture kernel: each thread adds its
    thread id to its own element."""
    before = np.copy(int_data)
    mangled = test_common.mangle('use_tid2', ['int *'])
    testcudakernel1.__getattr__(mangled)(q, (32, ), (32, ), int_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    for tid in (0, 10, 31):
        assert int_data[tid] == before[tid] + tid
def test_float4_test2(testcudakernel1, ctx, q, float_data, float_data_gpu):
    """testFloat4_test2 kernel: the first four elements are replaced by
    elements 4..7."""
    before = np.copy(float_data)
    mangled = test_common.mangle('testFloat4_test2', ['float4 *'])
    testcudakernel1.__getattr__(mangled)(q, (32, ), (32, ), float_data_gpu)
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data_orig[:8]', before[:8])
    print('float_data[:8]', float_data[:8])
    for idx in range(4):
        assert float_data[idx] == before[idx + 4]
def test_structs(context, q, float_data, float_data_gpu, int_data, int_data_gpu):
    # Struct round trip using compile_code_v3 (compare the cocl-subprocess
    # test_structs variant earlier in this module, which this definition
    # shadows at import time).
    code = """
struct MyStruct {
    int myint;
    float myfloat;
};

__global__ void testStructs(MyStruct *structs, float *float_data, int *int_data) {
    int_data[0] = structs[0].myint;
    float_data[0] = structs[0].myfloat;
    float_data[1] = structs[1].myfloat;
}
"""
    kernel = test_common.compile_code_v3(
        cl, context, code,
        test_common.mangle('testStructs', ['MyStruct *', 'float *', 'int *']),
        num_clmems=3)['kernel']
    # my_struct = np.dtype([("myfloat", np.float32), ("myint", np.int32)])
    # I dont know why, but seems these are back to front...
    my_struct = np.dtype([
        ("myint", np.int32), ("myfloat", np.float32)
    ])  # seems these are wrong way around on HD5500. Works ok on 940M
    # Align/pad the dtype to the device struct layout, then register it
    # under the name used by the kernel signature.
    my_struct, my_struct_c_decl = pyopencl.tools.match_dtype_to_c_struct(
        context.devices[0], "MyStruct", my_struct)
    my_struct = cl.tools.get_or_register_dtype("MyStruct", my_struct)
    structs = np.empty(2, my_struct)
    structs[0]['myint'] = 123
    structs[0]['myfloat'] = 567
    structs[1]['myint'] = 33
    structs[1]['myfloat'] = 44
    structs_gpu = cl.array.to_device(q, structs)
    # p = structs_gpu.map_to_host(q)
    # print('p', p)
    # q.finish()
    # NOTE(review): the argument list interleaves each buffer with an
    # offset, followed by a trailing run of offsets and scratch local
    # memory — this order mirrors compile_code_v3's calling convention;
    # do not reorder.
    kernel(q, (32, ), (32, ), structs_gpu.data, offset_type(0), float_data_gpu, offset_type(0), int_data_gpu, offset_type(0), offset_type(0), offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('int_data[0]', int_data[0])
    print('int_data[1]', int_data[1])
    print('float_data[0]', float_data[0])
    print('float_data[1]', float_data[1])
    # Struct members written on the host must come back through the output
    # buffers unchanged.
    assert int_data[0] == 123
    assert float_data[0] == 567
    assert float_data[1] == 44
def dotdotdot_cl():
    """Load test/dotdotdot.ll, translate the LLVM IR to OpenCL, print the
    generated source for inspection, and return it."""
    # check the .ll is translatable first, using the llvm-based converter
    with open('test/dotdotdot.ll', 'r') as f:
        ll_sourcecode = f.read()
    kernelName = test_common.mangle('test_si', ['float *'])
    cl_sourcecode = test_common.ll_to_cl(ll_sourcecode, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    return cl_sourcecode
def test_use_template1(testcudakernel1, q, int_data, int_data_gpu, float_data, float_data_gpu):
    """Templated addNumbers via the pre-built fixture kernel, checked for
    both the float and int instantiations.

    NOTE: another test_use_template1 defined earlier in this module is
    shadowed by this definition at import time.
    """
    floats_before = np.copy(float_data)
    ints_before = np.copy(int_data)
    mangled = test_common.mangle('use_template1', ['float *', 'int *'])
    testcudakernel1.__getattr__(mangled)(
        q, (32, ), (32, ), float_data_gpu, int_data_gpu)
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    assert float_data[0] == floats_before[1] + floats_before[2]
    assert int_data[0] == ints_before[1] + ints_before[2]
def test_ternary(testcudakernel1, q, float_data, float_data_gpu):
    """Ternary-operator test via the pre-built fixture kernel: testTernary
    selects data[2] when data[1] > 0, else data[3].

    BUG FIX: the inner set_float_value helper previously ignored its
    gpu_buffer argument and always wrote to the closed-over float_data_gpu;
    it now uses the parameter (the caller passes float_data_gpu, so observed
    behavior is unchanged, but the helper is no longer misleading).
    """
    float_data_orig = np.copy(float_data)

    def set_float_value(gpu_buffer, idx, value):
        # Write `value` into gpu_buffer[idx] via the setValue kernel.
        testcudakernel1.__getattr__(
            test_common.mangle('setValue', ['float *', 'int', 'float']))(
            q, (32, ), (32, ), gpu_buffer, np.int32(idx), np.float32(value))

    set_float_value(float_data_gpu, 1, 10)
    testcudakernel1.__getattr__(test_common.mangle(
        'testTernary', ['float *']))(q, (32, ), (32, ), float_data_gpu)
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    # data[1] == 10 > 0 -> ternary selects data[2]
    assert float_data[0] == float_data_orig[2]

    set_float_value(float_data_gpu, 1, -2)
    testcudakernel1.__getattr__(test_common.mangle(
        'testTernary', ['float *']))(q, (32, ), (32, ), float_data_gpu)
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    # data[1] == -2 <= 0 -> ternary selects data[3]
    assert float_data[0] == float_data_orig[3]