def TestParams():
    """Compile a three-parameter PTX kernel, launch it once, print the results.

    The embedded kernel loads its two ``.f32`` parameters, adds them and
    stores the sum through the ``.u64`` pointer parameter.  This function
    compiles that kernel with ``ptx_exec.compile``, allocates a 4-byte device
    buffer, seeds a one-element ``extarray`` with 5.0, and brackets the
    ``ptx_exec.run_stream`` launch with ``copy_htod`` / ``copy_dtoh`` calls
    (presumably host-to-device and device-to-host copies -- confirm against
    ptx_exec).  Timings and intermediate values go to stdout; nothing is
    asserted and nothing is returned.

    NOTE(review): the device buffer and the compiled module are not released
    here.
    """
    import time

    # The trailing `//` comment in the PTX runs to end of line, so the line
    # breaks inside this literal are significant.
    kernel = """
    .version 1.4
    .target sm_10, map_f64_to_f32
    .entry _main (
    .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    {
    .reg .u64 %rd<3>;
    .reg .f32 %f<6>;
    ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    add.f32 %f3, %f1, %f2;
    mov.f32 %f4, %f3;
    ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    st.global.f32 [%rd1+0], %f4;
    exit;
    } // _Z16addArrayOnDevicePfff
    """

    # Every print below passes one pre-formatted string (or one object) in
    # parentheses: identical output to the Python 2 print statement, and the
    # syntax also parses as a Python 3 print() call.
    compile_start = time.time()
    module = ptx_exec.compile(kernel)
    compile_end = time.time()
    print("compile time %s" % (compile_end - compile_start))

    a = 1.0
    b = 2.0

    device_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    # Seed value; presumably overwritten by the kernel's store once the
    # buffer is copied back -- TODO confirm.
    mem[0] = 5.0

    print("%s %s" % (device_addr, type(device_addr)))
    print("%s %s" % (mem.buffer_info()[0], type(mem.buffer_info()[0])))

    # Order matches the kernel's .param declarations: pointer, a, b.
    param_list = [device_addr, a, b]
    param_types = (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32)

    ptx_exec.copy_htod(device_addr, mem.buffer_info()[0], 4)

    # NOTE(review): the meaning of the five-element tuple is not visible
    # here; it is passed through unchanged.
    run_start = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), param_types, param_list)
    run_end = time.time()
    print("run time %s" % (run_end - run_start))

    print("X %s %s" % (mem.buffer_info()[0], device_addr))
    ptx_exec.copy_dtoh(mem.buffer_info()[0], device_addr, 4)

    print(param_list)
    print(mem)
def cache_code(self):
    """Render this object's streams to one string and compile it, once.

    Returns immediately when ``self._cached`` compares equal to ``True``.
    Otherwise it calls ``_synthesize_prologue`` and ``_synthesize_epilogue``,
    threads an accumulator string through ``_cache_code_S`` for the
    ``objects`` of every stream in ``self.objects``, stores
    ``_prologue + body + _epilogue`` in ``self.render_string``, stores the
    result of ``ptx_exec.compile`` in ``self.render_code`` and finally sets
    ``self._cached`` to ``True``.  Always returns ``None``.
    """
    # Kept as an equality test (not plain truthiness) so the set of values
    # treated as "already cached" is unchanged.
    if self._cached == True:
        return

    # Presumably these populate self._prologue / self._epilogue, which are
    # read below -- verify against their definitions.
    self._synthesize_prologue()
    self._synthesize_epilogue()

    # _cache_code_S receives the text so far and returns the extended text.
    body = ''
    for stream in self.objects:
        body = self._cache_code_S(body, stream.objects)

    self.render_string = self._prologue + body + self._epilogue
    self.render_code = ptx_exec.compile(self.render_string)

    # Set last, so a failure above leaves the object marked uncached.
    self._cached = True
def cache_code(self):
    """Build and compile the rendered code for this object, at most once.

    When ``self._cached`` already compares equal to ``True`` nothing is
    done.  Otherwise the prologue and epilogue are synthesized, each stream
    in ``self.objects`` has its ``objects`` folded into the running text via
    ``_cache_code_S``, and the concatenation of prologue, text and epilogue
    is saved on ``self.render_string``.  That string is handed to
    ``ptx_exec.compile``; the result is saved on ``self.render_code`` and
    ``self._cached`` becomes ``True``.  The return value is always ``None``.
    """
    # Equality test retained deliberately; only values equal to True
    # short-circuit.
    if self._cached == True:
        return

    # Prologue first, then epilogue -- same call order as before.
    self._synthesize_prologue()
    self._synthesize_epilogue()

    rendered = ""
    for stream in self.objects:
        # Each call returns the accumulated text including this stream.
        rendered = self._cache_code_S(rendered, stream.objects)

    full_text = self._prologue + rendered + self._epilogue
    self.render_string = full_text
    self.render_code = ptx_exec.compile(self.render_string)

    # Only reached when compilation did not raise.
    self._cached = True
def TestCompileExec():
    """Compile a minimal parameterless PTX kernel, launch it, print timings.

    The kernel consists of an empty-parameter ``_main`` entry whose body is
    ``ret; exit;``.  Its source is echoed to stdout, compiled with
    ``ptx_exec.compile`` and launched through ``ptx_exec.run_stream`` with an
    empty type tuple and an empty argument list.  The wall-clock duration of
    both steps is printed.  Nothing is returned.

    NOTE(review): earlier experiments with remote buffer allocation and
    explicit contexts were disabled in this function and are not performed.
    """
    import time

    # Adjacent literals are concatenated at compile time, producing the same
    # bytes as joining the pieces with `+`.
    kernel = (
        "\t.version 1.4\n"
        "\t.target sm_10, map_f64_to_f32\n"
        "\t.entry _main () {\n"
        "\t\tret;\n"
        "\t\texit;\n"
        "\t}\n"
        "\n"
    )

    # Every print below passes a single value in parentheses: same output as
    # the Python 2 print statement, and valid Python 3 syntax as well.
    print(kernel)

    compile_start = time.time()
    module = ptx_exec.compile(kernel)
    compile_end = time.time()
    print("compile time %s" % (compile_end - compile_start))

    print("Executing...")

    # NOTE(review): the meaning of the five-element tuple is not visible
    # here; it is passed through unchanged.  The kernel takes no parameters,
    # hence the empty type tuple and argument list.
    run_start = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    run_end = time.time()
    print("run time %s" % (run_end - run_start))
def TestCompileExec():
    """Smoke-test the compile-and-launch path with a do-nothing PTX kernel.

    Steps, in order: print the kernel source, time ``ptx_exec.compile`` on
    it, print ``Executing...``, time one ``ptx_exec.run_stream`` launch with
    no kernel parameters, and print both durations.  Returns ``None``.

    NOTE(review): buffer allocation and context setup that were once tried
    here are disabled; this function neither allocates nor frees anything.
    """
    import time

    # One list entry per line of PTX; joined with no separator because each
    # piece already carries its own newline.
    kernel_lines = [
        "\t.version 1.4\n",
        "\t.target sm_10, map_f64_to_f32\n",
        "\t.entry _main () {\n",
        "\t\tret;\n",
        "\t\texit;\n",
        "\t}\n",
        "\n",
    ]
    kernel = "".join(kernel_lines)

    # Single-argument parenthesised prints behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print(kernel)

    before = time.time()
    module = ptx_exec.compile(kernel)
    after = time.time()
    print("compile time %s" % (after - before))

    print("Executing...")

    # Empty type tuple and argument list: the kernel declares no parameters.
    # NOTE(review): semantics of (1, 1, 1, 1, 1) are not visible here.
    before = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    after = time.time()
    print("run time %s" % (after - before))
def TestParams():
    """Exercise kernel parameter passing with a tiny add-and-store kernel.

    The PTX below reads two ``.f32`` parameters, adds them and writes the
    sum to the address held in its ``.u64`` parameter.  The function
    compiles it, allocates 4 bytes with ``ptx_exec.alloc_device``, creates a
    one-element ``extarray`` initialised to 5.0, and calls ``copy_htod``
    before and ``copy_dtoh`` after a single ``run_stream`` launch (presumably
    host-to-device / device-to-host transfers -- confirm against ptx_exec).
    Addresses, timings, the parameter list and the array are printed; the
    function checks nothing itself and returns ``None``.

    NOTE(review): neither the device allocation nor the module is released.
    """
    import time

    # Line breaks are significant inside this literal: the PTX `//` comment
    # on the closing brace extends to the end of its line.
    kernel = '''
    .version 1.4
    .target sm_10, map_f64_to_f32
    .entry _main (
    .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    {
    .reg .u64 %rd<3>;
    .reg .f32 %f<6>;
    ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    add.f32 %f3, %f1, %f2;
    mov.f32 %f4, %f3;
    ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    st.global.f32 [%rd1+0], %f4;
    exit;
    } // _Z16addArrayOnDevicePfff
    '''

    # Prints use one parenthesised argument each, which yields the same text
    # under Python 2's print statement and is also valid Python 3.
    began = time.time()
    module = ptx_exec.compile(kernel)
    ended = time.time()
    print("compile time %s" % (ended - began))

    a = 1.0
    b = 2.0

    dev_ptr = ptx_exec.alloc_device(4)
    mem = extarray.extarray('f', 1)
    # Sentinel value, presumably replaced by the kernel's result after the
    # copy back -- TODO confirm.
    mem[0] = 5.0

    print("%s %s" % (dev_ptr, type(dev_ptr)))
    print("%s %s" % (mem.buffer_info()[0], type(mem.buffer_info()[0])))

    # Positional order follows the kernel's .param list: pointer, a, b.
    param_list = [dev_ptr, a, b]

    ptx_exec.copy_htod(dev_ptr, mem.buffer_info()[0], 4)

    # NOTE(review): semantics of (1, 1, 1, 1, 1) are not visible here.
    began = time.time()
    ptx_exec.run_stream(
        module,
        (1, 1, 1, 1, 1),
        (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32),
        param_list,
    )
    ended = time.time()
    print("run time %s" % (ended - began))

    print("X %s %s" % (mem.buffer_info()[0], dev_ptr))
    ptx_exec.copy_dtoh(mem.buffer_info()[0], dev_ptr, 4)

    print(param_list)
    print(mem)