def TestParams():
    """Compile a parameterised PTX kernel, launch it, and print what happened.

    The kernel loads its two ``.f32`` parameters, adds them, and stores the
    sum through its ``.u64`` pointer parameter.  A one-element float array is
    copied into a 4-byte device allocation before the launch and copied back
    afterwards.  Compile time, run time, the addresses involved, the parameter
    list and the array are printed; nothing is asserted or returned.
    """
    import time

    def report(*fields):
        # Space-join the str() of each field: the text the Python 2 print
        # statements emitted for these values, in syntax Python 3 also accepts.
        print(" ".join([str(field) for field in fields]))

    kernel = """
    .version 1.4
    .target sm_10, map_f64_to_f32
    .entry _main (
        .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    {
        .reg .u64 %rd<3>;
        .reg .f32 %f<6>;
        ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
        ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
        add.f32 %f3, %f1, %f2;
        mov.f32 %f4, %f3;
        ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
        st.global.f32 [%rd1+0], %f4;
        exit;
    } // _Z16addArrayOnDevicePfff
    """

    started = time.time()
    module = ptx_exec.compile(kernel)
    report("compile time", time.time() - started)

    a = 1.0
    b = 2.0

    # NOTE(review): this device buffer is never released in this function --
    # confirm whether ptx_exec offers a matching free call and use it.
    ptx_mem_addr = ptx_exec.alloc_device(4)

    mem = extarray.extarray("f", 1)
    mem[0] = 5.0
    # Look the host buffer address up once; every transfer below reuses it.
    host_addr = mem.buffer_info()[0]

    report(ptx_mem_addr, type(ptx_mem_addr))
    report(host_addr, type(host_addr))

    # Values are listed in the kernel's declared parameter order: c, a, b.
    param_list = [ptx_mem_addr, a, b]
    ptx_exec.copy_htod(ptx_mem_addr, host_addr, 4)

    started = time.time()
    ptx_exec.run_stream(
        module,
        (1, 1, 1, 1, 1),
        (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32),
        param_list,
    )
    report("run time", time.time() - started)

    report("X", host_addr, ptx_mem_addr)
    ptx_exec.copy_dtoh(host_addr, ptx_mem_addr, 4)

    report(param_list)
    report(mem)
def TestCompileExec():
    """Compile and launch a minimal, parameterless PTX kernel, printing timings.

    The kernel source is printed, compiled with ``ptx_exec.compile`` and then
    launched with ``ptx_exec.run_stream`` using an empty parameter-type tuple
    and an empty parameter list.  The elapsed time of each step is printed;
    nothing is returned.
    """
    import time

    def report(*fields):
        # Space-join the str() of each field: the text the Python 2 print
        # statements emitted for these values, in syntax Python 3 also accepts.
        print(" ".join([str(field) for field in fields]))

    kernel = (
        "\t.version 1.4\n"
        "\t.target sm_10, map_f64_to_f32\n"
        "\t.entry _main () {\n"
        "\t\tret;\n"
        "\t\texit;\n"
        "\t}\n"
        "\n"
    )
    report(kernel)

    started = time.time()
    module = ptx_exec.compile(kernel)
    report("compile time", time.time() - started)

    report("Executing...")

    started = time.time()
    # The kernel declares no parameters, so both the type tuple and the
    # value list are empty.
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    report("run time", time.time() - started)
def TestCompileExec():
    """Compile and launch a minimal, parameterless PTX kernel, printing timings.

    Prints the kernel source, compiles it with ``ptx_exec.compile``, launches
    the result with ``ptx_exec.run_stream`` (empty parameter-type tuple, empty
    parameter list) and prints how long the compile and the launch each took.
    Returns nothing.
    """
    import time

    def report(*fields):
        # Space-join the str() of each field: the text the Python 2 print
        # statements emitted for these values, in syntax Python 3 also accepts.
        print(" ".join([str(field) for field in fields]))

    source_lines = [
        "\t.version 1.4\n",
        "\t.target sm_10, map_f64_to_f32\n",
        "\t.entry _main () {\n",
        "\t\tret;\n",
        "\t\texit;\n",
        "\t}\n",
        "\n",
    ]
    kernel = "".join(source_lines)
    report(kernel)

    compile_start = time.time()
    module = ptx_exec.compile(kernel)
    compile_end = time.time()
    report("compile time", compile_end - compile_start)

    report("Executing...")

    run_start = time.time()
    # No kernel parameters: empty type tuple and empty value list.
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    run_end = time.time()
    report("run time", run_end - run_start)
def TestParams():
    """Compile a parameterised PTX kernel, launch it, and print what happened.

    The kernel adds its two ``.f32`` parameters and stores the sum through its
    ``.u64`` pointer parameter.  A one-element float array is uploaded to a
    4-byte device allocation before the launch and downloaded again after it.
    Compile time, run time, the host and device addresses, the parameter list
    and the array are printed; nothing is asserted or returned.
    """
    import time

    def report(*fields):
        # Space-join the str() of each field: the text the Python 2 print
        # statements emitted for these values, in syntax Python 3 also accepts.
        print(" ".join([str(field) for field in fields]))

    kernel = (
        '''
    .version 1.4
    .target sm_10, map_f64_to_f32
    .entry _main (
        .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    {
        .reg .u64 %rd<3>;
        .reg .f32 %f<6>;
        ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
        ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
        add.f32 %f3, %f1, %f2;
        mov.f32 %f4, %f3;
        ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
        st.global.f32 [%rd1+0], %f4;
        exit;
    } // _Z16addArrayOnDevicePfff
    '''
    )

    compile_start = time.time()
    module = ptx_exec.compile(kernel)
    compile_end = time.time()
    report("compile time", compile_end - compile_start)

    a = 1.0
    b = 2.0

    # NOTE(review): this device buffer is never released in this function --
    # confirm whether ptx_exec offers a matching free call and use it.
    ptx_mem_addr = ptx_exec.alloc_device(4)

    mem = extarray.extarray('f', 1)
    mem[0] = 5.0
    # Fetch the host buffer address once and reuse it for both transfers.
    host_addr = mem.buffer_info()[0]

    report(ptx_mem_addr, type(ptx_mem_addr))
    report(host_addr, type(host_addr))

    # Values follow the kernel's declared parameter order: c, a, b.
    param_list = [ptx_mem_addr, a, b]
    param_types = (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32)

    ptx_exec.copy_htod(ptx_mem_addr, host_addr, 4)

    run_start = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), param_types, param_list)
    run_end = time.time()
    report("run time", run_end - run_start)

    report("X", host_addr, ptx_mem_addr)
    ptx_exec.copy_dtoh(host_addr, ptx_mem_addr, 4)

    report(param_list)
    report(mem)
#for i, param in enumerate(params): # pt = prgrm.param_types[i] # # if !isinstance(param, prgm.params[i]): # # raise Exception("Invalid parameter type at parameter " + str(i)) pass param_list = list(params) # Replace DeviceMemory parameters with their actual address # TODO - any other swaps than need to be done? for i in xrange(0, len(param_list)): if isinstance(param_list[i], DeviceMemory): param_list[i] = param_list[i].address type_num_tuple = tuple(map(ptx_exec.__dict__.__getitem__, prgm.param_types)) ptx_exec.run_stream(prgm.render_code, threads, type_num_tuple, param_list) # ptx_exec.run_stream(prgm.render_code, self.ctx, threads, tuple(prgm.param_types), param_list) return def join(self, hdl): # TODO - do something better to differentiate if len(hdl) == 2: # Join a kernel execution (th, prgm) = hdl ptx_exec.join_stream(th) for arr in prgm._remote_bindings_data.values(): binding = prgm._bindings[key] if isinstance(arr, extarray.extarray): arr.set_memory(bindings[1], arr.data_len * arr.itemsize)
# for i, param in enumerate(params): # pt = prgrm.param_types[i] # # if !isinstance(param, prgm.params[i]): # # raise Exception("Invalid parameter type at parameter " + str(i)) pass param_list = list(params) # Replace DeviceMemory parameters with their actual address # TODO - any other swaps than need to be done? for i in xrange(0, len(param_list)): if isinstance(param_list[i], DeviceMemory): param_list[i] = param_list[i].address type_num_tuple = tuple(map(ptx_exec.__dict__.__getitem__, prgm.param_types)) ptx_exec.run_stream(prgm.render_code, threads, type_num_tuple, param_list) # ptx_exec.run_stream(prgm.render_code, self.ctx, threads, tuple(prgm.param_types), param_list) return def join(self, hdl): # TODO - do something better to differentiate if len(hdl) == 2: # Join a kernel execution (th, prgm) = hdl ptx_exec.join_stream(th) for arr in prgm._remote_bindings_data.values(): binding = prgm._bindings[key] if isinstance(arr, extarray.extarray): arr.set_memory(bindings[1], arr.data_len * arr.itemsize)