def end(self, branch = True):
    """Emit the loop-closing code for this iterator.

    A predicate register is acquired and, for the DEC and INC modes, set by
    comparing ``self.r_count`` with the stop value (``self.r_stop`` when an
    external stop register is in use, otherwise ``self.n_stop``).  A branch
    to ``self.start_label`` predicated on that register is then added,
    followed by a move that puts the start value back into the counter.

    The ``branch`` argument is accepted but not consulted here.
    """
    pred = self.code.prgm.acquire_register('pred')

    # DEC loops branch back while count > stop, INC loops while count < stop.
    if self.mode == DEC:
        cmp_op = 'gt'
    elif self.mode == INC:
        cmp_op = 'lt'
    else:
        # Any other mode: no comparison is emitted, only the branch below.
        cmp_op = None

    if cmp_op is not None:
        if self._external_stop:
            stop_value = self.r_stop
        else:
            stop_value = self.n_stop
        self.code.add(ptx.setp(cmp_op, pred, self.r_count, stop_value))

    self.code.add(ptx.bra(self.start_label, pred=pred))

    # Reset the current value in case this is a nested loop
    start_value = self.r_start if self._external_start else self.n_start
    self.code.add(ptx.mov(self.r_count, start_value))

    # TODO: put the register release back in:
    #   for reg in self.get_acquired_registers():
    #       self.code.prgm.release_register(reg)
    return
def end(self, branch=True):
    """Add the post-loop code: compare, conditionally branch, reset counter.

    For DEC mode the predicate is ``r_count > stop``; for INC mode it is
    ``r_count < stop``.  The stop operand is ``self.r_stop`` when
    ``self._external_stop`` is set and ``self.n_stop`` otherwise.  After the
    predicated branch to ``self.start_label`` the counter is moved back to
    its start value (``self.r_start`` or ``self.n_start``).

    ``branch`` is part of the signature but is not used in this method.
    """
    prgm = self.code.prgm
    p = prgm.acquire_register('pred')

    relation = None
    if self.mode == DEC:
        relation = 'gt'
    elif self.mode == INC:
        relation = 'lt'

    # Modes other than DEC/INC emit no setp; the branch is still added.
    if relation is not None:
        limit = self.r_stop if self._external_stop else self.n_stop
        self.code.add(ptx.setp(relation, p, self.r_count, limit))

    self.code.add(ptx.bra(self.start_label, pred=p))

    # Reset the current value in case this is a nested loop
    if self._external_start:
        initial = self.r_start
    else:
        initial = self.n_start
    self.code.add(ptx.mov(self.r_count, initial))

    # TODO: re-enable releasing the acquired registers:
    #   for reg in self.get_acquired_registers():
    #       self.code.prgm.release_register(reg)
    return
def TestSynIterInc(): SIZE = 64 # build and run the kernel prgm = env.Program() code = prgm.get_stream() code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos)) ones = prgm.acquire_register((1, 1, 1, 1)) counter = prgm.acquire_register() code.add(ptx.mov(counter, ones)) for i in syn_iter(code, 4, step=1, mode=INC): code.add(ptx.iadd(counter, counter, ones)) code.add(ptx.mov(reg.o0, counter.x)) domain = (0, 0, SIZE, SIZE) proc = env.Processor(0) ext_output = proc.alloc_remote('i', 1, SIZE) prgm.set_binding(reg.o0, ext_output) prgm.add(code) proc.execute(prgm, domain) passed = True for i in xrange(0, SIZE): if ext_output[i] != 5: passed = False print "Passed == ", passed proc.free(ext_output) return
def TestSynIterInc(): SIZE = 64 # build and run the kernel prgm = env.Program() code = prgm.get_stream() code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos)) ones = prgm.acquire_register((1, 1, 1, 1)) counter = prgm.acquire_register() code.add(ptx.mov(counter, ones)) for i in syn_iter(code, 4, step=1, mode=INC): code.add(ptx.iadd(counter, counter, ones)) code.add(ptx.mov(reg.o0, counter.x)) domain = (0, 0, SIZE, SIZE) proc = env.Processor(0) ext_output=proc.alloc_remote('i', 1, SIZE) prgm.set_binding(reg.o0, ext_output) prgm.add(code) proc.execute(prgm, domain) passed = True for i in xrange(0, SIZE): if ext_output[i] != 5: passed = False print "Passed == ", passed proc.free(ext_output) return
def start(self, align = True, branch = True):
    """Do pre-loop iteration initialization.

    Acquires the counter register (of type ``self.type``) the first time it
    is needed, loads it with the start value, then creates a unique
    ``SYN_ITER_START`` label, stores it in ``self.start_label`` and adds it
    to the instruction stream.

    The start value is ``self.r_start`` when ``self._external_start`` is
    set and ``self.n_start`` otherwise.  ``align`` and ``branch`` are
    accepted but not consulted here.
    """
    if self.r_count is None:
        self.r_count = self.code.prgm.acquire_register(self.type)

    # Test truthiness instead of comparing with "== False": this is the same
    # test end() applies when it resets the counter, so both methods now
    # pick the same start operand for every value of the flag.
    if self._external_start:
        start_value = self.r_start
    else:
        start_value = self.n_start
    self.code.add(ptx.mov(self.r_count, start_value))

    self.start_label = self.code.prgm.get_unique_label("SYN_ITER_START")
    self.code.add(self.start_label)
    return
def start(self, align=True, branch=True):
    """Do pre-loop iteration initialization.

    Steps, in order:

    1. acquire ``self.r_count`` (type ``self.type``) if it is still None;
    2. move the start value into it -- ``self.r_start`` when
       ``self._external_start`` is set, ``self.n_start`` otherwise;
    3. create a unique ``SYN_ITER_START`` label, remember it in
       ``self.start_label`` and add it to the stream.

    ``align`` and ``branch`` are accepted but not consulted here.
    """
    if self.r_count is None:
        self.r_count = self.code.prgm.acquire_register(self.type)

    # Plain truthiness test (was "== False"), consistent with the check
    # end() uses when it restores the counter to its start value.
    initial = self.r_start if self._external_start else self.n_start
    self.code.add(ptx.mov(self.r_count, initial))

    self.start_label = self.code.prgm.get_unique_label("SYN_ITER_START")
    self.code.add(self.start_label)
    return
def TestParamsFull(): import time import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter("u64", name="_mem") _a = prgm.add_parameter("f32", name="_a") _b = prgm.add_parameter("f32", name="_b") rd1 = prgm.acquire_register("u64") r1 = prgm.acquire_register("f32") r2 = prgm.acquire_register("f32") r3 = prgm.acquire_register("f32") r4 = prgm.acquire_register("f32") v1 = prgm.add_variable("shared", "f32") # don't need this, but let's test add_variable code.add(isa.ld("param", r1, regs.ptxAddress(_a))) code.add(isa.ld("param", r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) code.add(isa.ld("param", rd1, regs.ptxAddress(_mem))) code.add(isa.st("global", regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() a = 1.0 b = 2.0 ptx_mem_addr = proc.alloc_device("f32", 1) mem = extarray.extarray("f", 1) mem[0] = 5.0 param_list = [ptx_mem_addr.address, a, b] proc.copy(ptx_mem_addr, mem) prgm.cache_code() for i in range(20): t1 = time.time() proc.execute(prgm, (1, 1, 1, 1, 1), param_list) t2 = time.time() print "run time", t2 - t1 print "#####" print "X", mem.buffer_info()[0], ptx_mem_addr.address proc.copy(mem, ptx_mem_addr) print param_list print mem return
def TestParamsFull(): import time import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter('u64', name='_mem') _a = prgm.add_parameter('f32', name='_a') _b = prgm.add_parameter('f32', name='_b') rd1 = prgm.acquire_register('u64') r1 = prgm.acquire_register('f32') r2 = prgm.acquire_register('f32') r3 = prgm.acquire_register('f32') r4 = prgm.acquire_register('f32') v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable code.add(isa.ld('param', r1, regs.ptxAddress(_a))) code.add(isa.ld('param', r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) code.add(isa.ld('param', rd1, regs.ptxAddress(_mem))) code.add(isa.st('global', regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() a = 1.0 b = 2.0 ptx_mem_addr = proc.alloc_device('f32', 1) mem = extarray.extarray('f', 1) mem[0] = 5.0 param_list = [ptx_mem_addr.address, a, b] proc.copy(ptx_mem_addr, mem) prgm.cache_code() for i in range(20): t1 = time.time() proc.execute(prgm, (1, 1, 1, 1, 1), param_list) t2 = time.time() print "run time", t2 - t1 print "#####" print "X", mem.buffer_info()[0], ptx_mem_addr.address proc.copy(mem, ptx_mem_addr) print param_list print mem return
def TestSynIterIncFloatExtStopExtStart(): SIZE = 64 # build and run the kernel prgm = env.Program() code = prgm.get_stream() code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos)) ones = prgm.acquire_register((1, 1, 1, 1)) counter = prgm.acquire_register() code.add(ptx.mov(counter, ones)) stop = prgm.acquire_register((4.0, 4.0, 4.0, 4.0)) start = prgm.acquire_register((2.0, 2.0, 2.0, 2.0)) step = prgm.acquire_register((1.0, 1.0, 1.0, 1.0)) fiter = syn_iter_float(code, stop, step=step, mode=INC) fiter.set_start_reg(start) for i in fiter: code.add(ptx.iadd(counter, counter, ones)) code.add(ptx.mov(reg.o0, counter.x)) domain = (0, 0, SIZE, SIZE) proc = env.Processor(0) ext_output = proc.alloc_remote('i', 1, SIZE, 1) prgm.set_binding(reg.o0, ext_output) prgm.add(code) proc.execute(prgm, domain) passed = True for i in xrange(0, SIZE): if ext_output[i] != 3: passed = False print "Passed == ", passed proc.free(ext_output) return
def TestSynIterIncFloatExtStopExtStart(): SIZE = 64 # build and run the kernel prgm = env.Program() code = prgm.get_stream() code.add(ptx.dcl_output(reg.o0, USAGE=ptx.usage.pos)) ones = prgm.acquire_register((1, 1, 1, 1)) counter = prgm.acquire_register() code.add(ptx.mov(counter, ones)) stop = prgm.acquire_register((4.0, 4.0, 4.0, 4.0)) start = prgm.acquire_register((2.0, 2.0, 2.0, 2.0)) step = prgm.acquire_register((1.0, 1.0, 1.0, 1.0)) fiter = syn_iter_float(code, stop, step=step, mode=INC) fiter.set_start_reg(start) for i in fiter: code.add(ptx.iadd(counter, counter, ones)) code.add(ptx.mov(reg.o0, counter.x)) domain = (0, 0, SIZE, SIZE) proc = env.Processor(0) ext_output=proc.alloc_remote('i', 1, SIZE, 1) prgm.set_binding(reg.o0, ext_output) prgm.add(code) proc.execute(prgm, domain) passed = True for i in xrange(0, SIZE): if ext_output[i] != 3: passed = False print "Passed == ", passed proc.free(ext_output) return
def TestSynIterDec(): import corepy.arch.ptx.isa as ptx import corepy.arch.ptx.types.registers as regs SIZE = 64 proc = env.Processor(0) # build and run the kernel prgm = env.Program() code = prgm.get_stream() _mem = prgm.add_parameter('u64', name='_mem') memp = prgm.acquire_register('u64') counter = prgm.acquire_register('u32') code.add(ptx.ld('param', memp, regs.ptxAddress(_mem))) code.add(ptx.mov(counter, 0)) for i in syn_iter(code, 5, step=1, mode=DEC): code.add(ptx.add(counter, counter, 1)) code.add(ptx.st('global', regs.ptxAddress(memp), counter)) prgm.add(code) ptx_mem_addr = proc.alloc_device('u32', 1) mem = extarray.extarray('I', 1) mem[0] = 5 param_list = [ ptx_mem_addr.address, ] proc.copy(ptx_mem_addr, mem) prgm.cache_code() print prgm.render_string proc.execute(prgm, (1, 1, 1, 1, 1), param_list) proc.copy(mem, ptx_mem_addr) print mem #passed = True #for i in xrange(0, SIZE): # if ext_output[i] != 5: # passed = False #print "Passed == ", passed return
def TestSynIterDec(): import corepy.arch.ptx.isa as ptx import corepy.arch.ptx.types.registers as regs SIZE = 64 proc = env.Processor(0) # build and run the kernel prgm = env.Program() code = prgm.get_stream() _mem = prgm.add_parameter('u64', name='_mem') memp = prgm.acquire_register('u64') counter = prgm.acquire_register('u32') code.add(ptx.ld('param', memp, regs.ptxAddress(_mem))) code.add(ptx.mov(counter, 0)) for i in syn_iter(code, 5, step=1, mode=DEC): code.add(ptx.add(counter, counter, 1)) code.add(ptx.st('global', regs.ptxAddress(memp), counter)) prgm.add(code) ptx_mem_addr = proc.alloc_device('u32', 1) mem = extarray.extarray('I', 1) mem[0] = 5 param_list = [ptx_mem_addr.address,] proc.copy(ptx_mem_addr, mem) prgm.cache_code() print prgm.render_string proc.execute(prgm, (1, 1, 1, 1, 1), param_list) proc.copy(mem, ptx_mem_addr) print mem #passed = True #for i in xrange(0, SIZE): # if ext_output[i] != 5: # passed = False #print "Passed == ", passed return
def TestSimpleKernel(): import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs import time SIZE = 128 proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter("u64", name="_mem") _a = prgm.add_parameter("f32", name="_a") _b = prgm.add_parameter("f32", name="_b") # rd1 = regs.ptxVariable('reg', 'u64', 'rd1') # r1 = regs.ptxVariable('reg', 'f32', 'f1') # r2 = regs.ptxVariable('reg', 'f32', 'f2') # r3 = regs.ptxVariable('reg', 'f32', 'f3') # r4 = regs.ptxVariable('reg', 'f32', 'f4') # code.add(' .reg .u64 rd1;') # code.add(' .reg .f32 f1;') # code.add(' .reg .f32 f2;') # code.add(' .reg .f32 f3;') # code.add(' .reg .f32 f4;') rd1 = prgm.acquire_register("u64") r1 = prgm.acquire_register("f32") r2 = prgm.acquire_register("f32") r3 = prgm.acquire_register("f32") r4 = prgm.acquire_register("f32") v1 = prgm.add_variable("shared", "f32") # don't need this, but let's test add_variable # import pdb # pdb.set_trace() # code.add(isa.add(r3, r2, r1)) # code.add('add.f32 r3, r2, r1;') code.add(isa.ld("param", r1, regs.ptxAddress(_a))) code.add(isa.ld("param", r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) # temp = prgm.acquire_register('u32') # code.add(isa.cvt(temp, regs.tid.x)) # code.add(isa.cvt(r4, temp, rnd='rn')) temp1 = prgm.acquire_register("u32") temp2 = prgm.acquire_register("u32") temp3 = prgm.acquire_register("u32") code.add(isa.mul(temp2, temp1, temp3, hlw="lo")) code.add(isa.ld("param", rd1, regs.ptxAddress(_mem))) code.add(isa.st("global", regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() # prgm.render_string = ( # ''' # .version 1.4 # .target sm_10, map_f64_to_f32 # .entry _main ( # .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) # { # .reg .u64 %rd<3>; # .reg .f32 %f<6>; # ld.param.f32 %f1, 
[__cudaparm__Z16addArrayOnDevicePfff_a]; # ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; # add.f32 %f3, %f1, %f2; # mov.f32 %f4, %f3; # ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; # st.global.f32 [%rd1+0], %f4; # exit; # } // _Z16addArrayOnDevicePfff # ''' # ) # prgm.render_code = ptx_exec.compile(prgm.render_string) #### # ptx_mem_addr = proc.alloc_device('f32', 1) ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray("f", 1) mem[0] = 5.0 a = 1.0 b = 2.0 print mem.buffer_info()[0] param_list = [ptx_mem_addr, a, b] print map(type, param_list) # # image, dev num, (x, y, w, h) # import pdb ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) # kernel = prgm.render_string # module = ptx_exec.compile(kernel) t1 = time.time() # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) proc.execute(prgm, (1, 1, 1, 1, 1), param_list) t2 = time.time() # pdb.set_trace() print "run time", t2 - t1 print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr) print int(ptx_mem_addr) print int(mem.buffer_info()[0]) ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #### return
# self.params['rnd'] = koperands['rnd'] # return True if __name__ == '__main__': import corepy.arch.ptx.isa as isa #import corepy.arch.ptx.platform as env #code = env.InstructionStream() #set_active_code(code) r1 = regs.ptxVariable('reg', 'u32', 'r1') r2 = regs.ptxVariable('reg', 'u32', 'r2') r3 = regs.ptxVariable('reg', 'u32', 'r3') r4 = regs.ptxVariable('reg', 'u32', 'r4') #x = add(r3, r2, r1, ignore_active = True) x = isa.add(r3, r2, r1) print x.render() y = isa.mov(r2, r1) print y.render() a = regs.ptxAddress(r4) z = isa.ld('param', r1, a) print z.render() f1 = regs.ptxVariable('reg', 'f32', 'f1') f2 = regs.ptxVariable('reg', 'f32', 'f2') f3 = regs.ptxVariable('reg', 'f32', 'f3') a = isa.add(f3, f2, f1) print a.render()
# return True if __name__ == '__main__': import corepy.arch.ptx.isa as isa #import corepy.arch.ptx.platform as env #code = env.InstructionStream() #set_active_code(code) r1 = regs.ptxVariable('reg', 'u32', 'r1') r2 = regs.ptxVariable('reg', 'u32', 'r2') r3 = regs.ptxVariable('reg', 'u32', 'r3') r4 = regs.ptxVariable('reg', 'u32', 'r4') #x = add(r3, r2, r1, ignore_active = True) x = isa.add(r3, r2, r1) print x.render() y = isa.mov(r2, r1) print y.render() a = regs.ptxAddress(r4) z = isa.ld('param', r1, a) print z.render() f1 = regs.ptxVariable('reg', 'f32', 'f1') f2 = regs.ptxVariable('reg', 'f32', 'f2') f3 = regs.ptxVariable('reg', 'f32', 'f3') a = isa.add(f3, f2, f1) print a.render()
def copy_register(self, other):
    """Add a ``ptx.mov(self, other)`` instruction to ``self.code``.

    Returns whatever ``self.code.add`` returns for that instruction.
    """
    stream = self.code
    return stream.add(ptx.mov(self, other))
def TestSimpleKernel(): import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs import time SIZE = 128 proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter('u64', name='_mem') _a = prgm.add_parameter('f32', name='_a') _b = prgm.add_parameter('f32', name='_b') # rd1 = regs.ptxVariable('reg', 'u64', 'rd1') # r1 = regs.ptxVariable('reg', 'f32', 'f1') # r2 = regs.ptxVariable('reg', 'f32', 'f2') # r3 = regs.ptxVariable('reg', 'f32', 'f3') # r4 = regs.ptxVariable('reg', 'f32', 'f4') # code.add(' .reg .u64 rd1;') # code.add(' .reg .f32 f1;') # code.add(' .reg .f32 f2;') # code.add(' .reg .f32 f3;') # code.add(' .reg .f32 f4;') rd1 = prgm.acquire_register('u64') r1 = prgm.acquire_register('f32') r2 = prgm.acquire_register('f32') r3 = prgm.acquire_register('f32') r4 = prgm.acquire_register('f32') v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable # import pdb # pdb.set_trace() #code.add(isa.add(r3, r2, r1)) #code.add('add.f32 r3, r2, r1;') code.add(isa.ld('param', r1, regs.ptxAddress(_a))) code.add(isa.ld('param', r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) #temp = prgm.acquire_register('u32') #code.add(isa.cvt(temp, regs.tid.x)) #code.add(isa.cvt(r4, temp, rnd='rn')) temp1 = prgm.acquire_register('u32') temp2 = prgm.acquire_register('u32') temp3 = prgm.acquire_register('u32') code.add(isa.mul(temp2, temp1, temp3, hlw='lo')) code.add(isa.ld('param', rd1, regs.ptxAddress(_mem))) code.add(isa.st('global', regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() # prgm.render_string = ( # ''' # .version 1.4 # .target sm_10, map_f64_to_f32 # .entry _main ( # .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) # { # .reg .u64 %rd<3>; # .reg .f32 %f<6>; # ld.param.f32 %f1, 
[__cudaparm__Z16addArrayOnDevicePfff_a]; # ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; # add.f32 %f3, %f1, %f2; # mov.f32 %f4, %f3; # ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; # st.global.f32 [%rd1+0], %f4; # exit; # } // _Z16addArrayOnDevicePfff # ''' # ) # prgm.render_code = ptx_exec.compile(prgm.render_string) #### #ptx_mem_addr = proc.alloc_device('f32', 1) ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray('f', 1) mem[0] = 5.0 a = 1.0 b = 2.0 print mem.buffer_info()[0] param_list = [ptx_mem_addr, a, b] print map(type, param_list) # # image, dev num, (x, y, w, h) #import pdb ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) #kernel = prgm.render_string #module = ptx_exec.compile(kernel) t1 = time.time() #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) proc.execute(prgm, (1,1,1,1,1), param_list) t2 = time.time() # pdb.set_trace() print "run time", t2 - t1 print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr) print int(ptx_mem_addr) print int(mem.buffer_info()[0]) ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #### return