def TestInt():
    """
    Synthesize a few add-immediate instructions on fixed registers 13 and
    20, run them with stop=True and check the returned pair: element 0
    must be 0 and element 1 must be the stop code 0x200D.
    """
    prgm = Program()
    stream = prgm.get_stream()
    processor = Processor()

    spu.set_active_code(stream)

    reg13 = prgm.acquire_register(reg_name=13)
    reg20 = prgm.acquire_register(reg_name=20)

    spu.ai(reg20, reg20, 13)
    # Five identical add-immediates on register 13.
    for _ in range(5):
        spu.ai(reg13, reg13, 13)

    spu.stop(0x200D)

    prgm += stream
    result = processor.execute(prgm, stop=True)

    assert result[0] == 0
    assert result[1] == 0x200D
def TestInt():
    """
    Synthesize a few add-immediate instructions on fixed registers 13 and
    20, run them with stop=True and check the returned pair: element 0
    must be 0 and element 1 must be the stop code 0x200D.
    """
    prgm = Program()
    stream = prgm.get_stream()
    processor = Processor()

    spu.set_active_code(stream)

    reg13 = prgm.acquire_register(reg_name=13)
    reg20 = prgm.acquire_register(reg_name=20)

    spu.ai(reg20, reg20, 13)
    # Five identical add-immediates on register 13.
    for _ in range(5):
        spu.ai(reg13, reg13, 13)

    spu.stop(0x200D)

    prgm += stream
    result = processor.execute(prgm, stop=True)

    assert result[0] == 0
    assert result[1] == 0x200D
def execute(self, code, mode='int', debug=False, params=None, n_spus=1):
    """
    Start `code` under debugger control and wait for the first stop.

    Four stop instructions are appended to the stream, the stream is
    cached if needed, the instruction at index 1 is swapped for an
    absolute branch to the first debug stop, and the stream is handed to
    spe.Processor.execute.

    code   -- instruction stream to run; a ParallelInstructionStream is
              rejected with an Exception.
    mode   -- accepted for interface compatibility; it is overwritten
              with 'async' before the stream is started.
    debug  -- forwarded to spe.Processor.execute.
    params -- optional ExecParams; a fresh one is created when None.
              Its addr and size fields are always overwritten here.
    n_spus -- not used by this method.

    Returns None when `code` is empty, otherwise the value returned by
    self.wait_debug().
    """
    if type(code) is ParallelInstructionStream:
        raise Exception(
            'DebugProcessor does not support ParallelInstructionStream')

    self.code = code

    if len(code) == 0:
        return None

    # Add the debug instructions - two each for normal instructions and branch targets
    # Each index is recorded *before* the add, so it names the slot the
    # stop instruction ends up in.
    self.debug_idx = self.code.size()
    self.code.add(spu.stop(DEBUG_STOP))

    self.debug_branch = self.code.size()
    self.code.add(spu.stop(DEBUG_STOP))

    self.debug_target_idx = self.code.size()
    self.code.add(spu.stop(DEBUG_STOP_TARGET))

    self.debug_target_branch = self.code.size()
    self.code.add(spu.stop(DEBUG_STOP_TARGET))

    # Cache the code here
    if not code._cached:
        code.cache_code()

    # Setup the parameter structure
    if params is None:
        params = spu_exec.ExecParams()

    addr = code._prologue.inst_addr()
    params.addr = addr
    params.size = len(code.render_code) * 4  # size in bytes

    self.params = params
    self.ea = code._prologue.inst_addr()
    # Place the code just below 0x3FFFF, masked with 0xFFF80.
    self.lsa = (0x3FFFF - params.size) & 0xFFF80
    # NOTE(review): this adds a full 16 bytes when params.size is already
    # a multiple of 16 - confirm the extra padding is intended.
    self.size = params.size + (16 - params.size % 16)
    self.last_pc = self.lsa
    self.last_stop = (1, )

    # Byte address of each debug stop, shifted right by two for use as a
    # branch-absolute operand.
    self.debug_lsa = (self.lsa + self.code.code_offset * 4 +
                      self.debug_idx * 4) >> 2
    self.debug_target_lsa = (self.lsa + self.code.code_offset * 4 +
                             self.debug_target_idx * 4) >> 2

    # The caller's mode is ignored from here on.
    mode = 'async'

    # TODO: Factor replacing into one function in case the first one is a branch
    self.replace(self.last_stop[0], spu.bra(self.debug_lsa,
                                            ignore_active=True))

    self.spe_id = spe.Processor.execute(self, code, mode, debug, params)
    code.print_code()

    retval = self.wait_debug()

    return retval
def save_register(self, reg):  # , branch_to_save = False):
    """
    Emit code into the active stream that stores `reg` at
    self.ls_buffer + offset, advances the offset by 16 and compares it
    with the size, then writes size, offset and the comparison result to
    the outbound mailbox.

    A stop(0xB) placeholder is emitted before the save_ls_buffer code and
    is overwritten afterwards with a brz over that code.  The original
    author marked the mailbox values as wrong / work in progress.
    """
    stream = spu.get_active_code()

    r_offset = stream.acquire_register()
    r_size = stream.acquire_register()
    r_test = stream.acquire_register()

    # Rotate the offset and size words out of the ls_buffer quadword.
    spu.rotqbyi(r_offset, self.ls_buffer, 4)
    spu.rotqbyi(r_size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, r_offset)
    spu.ai(r_offset, r_offset, 16)
    spu.ceq(r_test, r_offset, r_size)

    for r_value in (r_size, r_offset, r_test):
        spu.wrch(r_value, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Placeholder that is patched once the length of the save code is known.
    branch_slot = stream.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=r_size)

    spu.nop(0)
    skip_distance = stream.size() - branch_slot
    stream[branch_slot] = spu.brz(r_test, skip_distance, ignore_active=True)

    stream.release_registers([r_offset, r_size, r_test])
def save_register(self, reg):  # , branch_to_save = False):
    """
    Emit code into the active stream that stores `reg` at
    self.ls_buffer + offset, advances the offset by 16 and compares it
    with the size, then writes size, offset and the comparison result to
    the outbound mailbox.

    A stop(0xB) placeholder is emitted before the save_ls_buffer code and
    is overwritten afterwards with a brz over that code.  The original
    author marked the mailbox values as wrong / work in progress.
    """
    stream = spu.get_active_code()

    r_offset = stream.acquire_register()
    r_size = stream.acquire_register()
    r_test = stream.acquire_register()

    # Rotate the offset and size words out of the ls_buffer quadword.
    spu.rotqbyi(r_offset, self.ls_buffer, 4)
    spu.rotqbyi(r_size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, r_offset)
    spu.ai(r_offset, r_offset, 16)
    spu.ceq(r_test, r_offset, r_size)

    for r_value in (r_size, r_offset, r_test):
        spu.wrch(r_value, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Placeholder that is patched once the length of the save code is known.
    branch_slot = stream.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=r_size)

    spu.nop(0)
    skip_distance = stream.size() - branch_slot
    stream[branch_slot] = spu.brz(r_test, skip_distance, ignore_active=True)

    stream.release_registers([r_offset, r_size, r_test])
def test(self, cmp, count_var):
    """
    Reserve a slot in the active stream with a stop(0xB) placeholder and
    remember its index together with the comparison and count variables.
    """
    # Index of the placeholder that is about to be emitted.
    self._branch_idx = len(spu.get_active_code())
    spu.stop(0xB)
    # spu.nop(0)

    self._cmp, self._count = cmp, count_var
def TestParams():
    """
    Sum the ten SPU parameters into the gp return register and check that
    execution with p1..p10 = 1..10 returns (55, 0x200A).
    """
    # Run this with a stop instruction and examine the registers
    prgm = Program()
    stream = prgm.get_stream()
    processor = Processor()

    total = prgm.gp_return
    current = prgm.acquire_register()

    # Zero the sum
    stream.add(spu.xor(total, total, total))

    spu_params = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for spu_param in spu_params:
        copy_param(stream, current, spu_param)
        stream.add(spu.a(total, total, current))

    # stop(0x200A) is skipped when the sum is not 55.
    stream.add(spu.ceqi(current, total, 55))
    stream.add(spu.brz(current, 2))
    stream.add(spu.stop(0x200A))
    stream.add(spu.stop(0x200B))

    params = spu_exec.ExecParams()
    for value in range(1, 11):
        setattr(params, 'p%d' % value, value)

    prgm += stream
    result = processor.execute(prgm, params=params, stop=True)

    assert result[0] == 55
    assert result[1] == 0x200A
def execute(self, code, mode='int', debug=False, params=None, n_spus=1):
    """
    Start `code` under debugger control and return the first debug stop.

    Appends four debug stop instructions, caches the stream, replaces the
    instruction at index 1 with a branch to the first debug stop and runs
    the stream asynchronously (the `mode` argument is ignored).  Returns
    None for an empty stream, otherwise the result of wait_debug().
    Raises Exception for a ParallelInstructionStream.
    """
    if type(code) is ParallelInstructionStream:
        raise Exception('DebugProcessor does not support ParallelInstructionStream')

    self.code = code
    if len(code) == 0:
        return None

    # Add the debug instructions - two each for normal instructions and branch targets
    debug_slots = (
        ('debug_idx', DEBUG_STOP),
        ('debug_branch', DEBUG_STOP),
        ('debug_target_idx', DEBUG_STOP_TARGET),
        ('debug_target_branch', DEBUG_STOP_TARGET),
    )
    for attr_name, stop_code in debug_slots:
        # Record the slot index first, then fill the slot.
        setattr(self, attr_name, self.code.size())
        self.code.add(spu.stop(stop_code))

    # Cache the code here
    if not code._cached:
        code.cache_code()

    # Setup the parameter structure
    if params is None:
        params = spu_exec.ExecParams()

    params.addr = code._prologue.inst_addr()
    params.size = len(code.render_code) * 4  # size in bytes
    self.params = params

    self.ea = code._prologue.inst_addr()
    self.lsa = (0x3FFFF - params.size) & 0xFFF80
    self.size = params.size + (16 - params.size % 16)
    self.last_pc = self.lsa
    self.last_stop = (1,)

    code_base = self.lsa + self.code.code_offset * 4
    self.debug_lsa = (code_base + self.debug_idx * 4) >> 2
    self.debug_target_lsa = (code_base + self.debug_target_idx * 4) >> 2

    # TODO: Factor replacing into one function in case the first one is a branch
    first_branch = spu.bra(self.debug_lsa, ignore_active=True)
    self.replace(self.last_stop[0], first_branch)

    # The stream is always started in 'async' mode, whatever was requested.
    self.spe_id = spe.Processor.execute(self, code, 'async', debug, params)
    code.print_code()

    return self.wait_debug()
def test(self, cmp, score, x_off, y_off):
    """
    Reserve a slot in the active stream with a stop(0xB) placeholder and
    remember its index together with the comparison, score and offset
    variables.
    """
    # Index of the placeholder that is about to be emitted.
    self._branch_idx = len(spu.get_active_code())
    spu.stop(0xB)
    # spu.nop(0)

    self._cmp, self._score = cmp, score
    self._x_off, self._y_off = x_off, y_off
def TestParams():
    """
    Sum the ten SPU parameters into the gp return register and check that
    execution with p1..p10 = 1..10 returns (55, 0x200A).
    """
    # Run this with a stop instruction and examine the registers
    prgm = Program()
    stream = prgm.get_stream()
    processor = Processor()

    total = prgm.gp_return
    current = prgm.acquire_register()

    # Zero the sum
    stream.add(spu.xor(total, total, total))

    spu_params = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for spu_param in spu_params:
        copy_param(stream, current, spu_param)
        stream.add(spu.a(total, total, current))

    # stop(0x200A) is skipped when the sum is not 55.
    stream.add(spu.ceqi(current, total, 55))
    stream.add(spu.brz(current, 2))
    stream.add(spu.stop(0x200A))
    stream.add(spu.stop(0x200B))

    params = spu_exec.ExecParams()
    for value in range(1, 11):
        setattr(params, 'p%d' % value, value)

    prgm += stream
    result = processor.execute(prgm, params=params, stop=True)

    assert result[0] == 55
    assert result[1] == 0x200A
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert(r == None) print 'int result:', r # while True: # pass return
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert (r == None) print 'int result:', r # while True: # pass return
def _synthesize_epilogue(self):
    """
    Build the epilogue: the epilogue label followed by a stop signal with
    return type 0x2000 (EXIT_SUCCESS).  (BE Handbook, p. 422).
    """
    epilogue = [self.lbl_epilogue]
    epilogue.append(spu.stop(0x2000, ignore_active=True))
    self._epilogue = epilogue
def cache_code(self):
    """
    Finalize the stream: synthesize the prologue, append a stop signal
    with return type 0x2000 (EXIT_SUCCESS) to the end of the instruction
    stream, and copy the instructions after the prologue's.
    (BE Handbook, p. 422).
    """
    # Generate the prologue
    self._synthesize_prologue()

    # Don't have a real epilogue.
    self.add(spu.stop(0x2000))

    prologue = self._prologue

    # Append our instructions to the prologue's, first making sure the
    # alignment is correct: an odd instruction count gets one lnop.
    if len(prologue._code) % 2:
        prologue.add(spu.lnop(0))

    prologue._code.extend(self._code)
    prologue._check_alignment(prologue._code, 'spu prologue')

    self._epilogue = self
    self._cached = True
def TestParams():
    """
    Execute a stream containing only stop(0x200D) with parameters
    p1..p10 set to 1..10.  Nothing is asserted; the result is discarded.
    """
    # Run this with a stop instruction and examine the registers
    stream = InstructionStream()
    processor = Processor()

    # code.add(spu.stop(0xA))
    stream.add(spu.stop(0x200D))

    exec_params = ExecParams()
    for value in range(1, 11):
        setattr(exec_params, 'p%d' % value, value)

    r = processor.execute(stream, params=exec_params)
    # print 'int result:', r
def TestParams():
    """
    Execute a stream containing only stop(0x200D) with parameters
    p1..p10 set to 1..10.  Nothing is asserted; the result is discarded.
    """
    # Run this with a stop instruction and examine the registers
    stream = InstructionStream()
    processor = Processor()

    # code.add(spu.stop(0xA))
    stream.add(spu.stop(0x200D))

    exec_params = ExecParams()
    for value in range(1, 11):
        setattr(exec_params, 'p%d' % value, value)

    r = processor.execute(stream, params=exec_params)
    # print 'int result:', r
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16); print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80; self.lbl_op = lbl_op return
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16) print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80 self.lbl_op = lbl_op return
def _synthesize_epilogue(self):
    """
    Build the epilogue: the epilogue label followed by a stop signal with
    return type 0x2000 (EXIT_SUCCESS).  (BE Handbook, p. 422).
    """
    self._epilogue = [
        self.lbl_epilogue,
        spu.stop(0x2000, ignore_active=True),
    ]
def TestParams():
    """
    Sum the ten SPU parameters into a register, stop with 0x200A when the
    sum equals 55 (0x200B otherwise) and check that execution with
    p1..p10 = 1..10 returns 0xA.
    """
    # Run this with a stop instruction and examine the registers
    stream = InstructionStream()
    processor = Processor()

    total = stream.acquire_register()
    current = stream.acquire_register()

    # Zero the sum
    stream.add(spu.xor(total, total, total))

    spu_params = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for spu_param in spu_params:
        copy_param(stream, current, spu_param)
        stream.add(spu.a(total, total, current))

    stream.add(spu.ceqi(current, total, 55))
    stream.add(spu.brz(current, 2))
    stream.add(spu.stop(0x200A))
    stream.add(spu.stop(0x200B))

    params = spu_exec.ExecParams()
    for value in range(1, 11):
        setattr(params, 'p%d' % value, value)

    result = processor.execute(stream, params=params)
    assert result == 0xA
    # print 'int result:', r
def TestInt2(i0=0, i1=1):
    """
    Pack i0, i1, i0+i1 and i1+(i0+i1) into one quadword register, then run
    a short loop and stop with 0x2005.

    i0, i1 -- the two starting integers (expected to fit in 32 bits).

    The result of the execution is not checked.
    """
    i2 = i0 + i1
    i3 = i1 + i2

    code = InstructionStream()
    proc = Processor()

    r_loop = 4
    r_address = 5
    r0 = 6
    r1 = 7
    r2 = 8
    r3 = 9

    # Load arguments into a quadword

    #################
    # Pack quadword #
    #################

    def load_value_int32(code, reg, value, clear=False):
        # obviously, value should be 32 bit integer
        # '//' keeps this an integer division even if true division is
        # in effect for the module.
        code.add(spu.ilhu(reg, value // pow(2, 16)))  # immediate load halfword upper
        code.add(spu.iohl(reg, value % pow(2, 16)))   # immediate or halfword lower
        if clear:
            code.add(spu.shlqbyi(reg, reg, 12))  # shift left qw by bytes, clears right bytes
        return

    load_value_int32(code, r0, i0, True)
    load_value_int32(code, r1, i1, True)
    code.add(spu.rotqbyi(r1, r1, 12))  # rotate qw by bytes
    load_value_int32(code, r2, i2, True)
    code.add(spu.rotqbyi(r2, r2, 8))
    load_value_int32(code, r3, i3, True)
    code.add(spu.rotqbyi(r3, r3, 4))
    code.add(spu.a(r0, r0, r1))
    code.add(spu.a(r0, r0, r2))
    code.add(spu.a(r0, r0, r3))

    ##########

    # Main loop to calculate Fibnoccai sequence

    # The helper's keyword is `clear`; the former `clear_bits=` spelling
    # raised TypeError before anything was executed.
    load_value_int32(code, r_address, pow(2, 16), clear=False)  # start at 64K

    load_value_int32(code, r_loop, 0, clear=False)
    start_label = code.size() + 1

    code.add(spu.sfi(r_loop, r_loop, 1))

    # The original subtracted start_label from the builtin `next`, which
    # raises TypeError.
    # NOTE(review): next_label is assumed to be the position of the branch
    # itself, computed the same way as start_label, so the branch goes
    # back to the sfi above - confirm against the intended loop.
    next_label = code.size() + 1
    code.add(spu.brnz(r_loop, (-(next_label - start_label) * spu.WORD_SIZE)))

    code.add(spu.stop(0x2005))

    r = proc.execute(code)
    # assert(r == 12)
    # print 'int result:', r

    return
def GenerateStream(self, step=None):
    """
    Build an instruction stream from the text in the editor control.

    step -- when not None, only the line with this index is synthesized
            as written; every other instruction line becomes
            stop(0x2FFF).  Labels are always kept.

    Blank lines and lines starting with '#' are skipped.  A line ending
    in ':' adds the label of that name.  A line with a breakpoint set
    becomes stop(0x2FFF).  Any other line is evaluated as an `spu.`
    call, with Label(name) rewritten to a label lookup.

    Returns the stream, after adding it to a new Program and caching
    that program.
    """
    prgm = env.Program()
    # NOTE: the evaluated instruction text refers to this local by the
    # name `code`; do not rename it.
    code = prgm.get_stream()

    for i, line in enumerate(self.editCtrl.GetText().split('\n')):
        # For the stop case, want all instructions except the current one to be
        # STOP instructions.
        cmd = line.strip()

        if step is not None and i != step:
            if cmd == "" or cmd[0] == '#':
                continue

            if cmd[-1] == ":":
                # Label - better parsing?
                code.add(code.prgm.get_label(cmd[:-1]))
            else:
                code.add(spu.stop(0x2FFF))
            continue

        if self.editCtrl.IsBreakSet(i):
            code.add(spu.stop(0x2FFF))
            continue

        if cmd != "" and cmd[0] != '#':
            inst = None
            if cmd[-1] == ":":
                # Label - better parsing?
                inst = code.prgm.get_label(cmd[:-1])
            else:
                # Instruction
                strcmd = re.sub(r"Label\((.*?)\)", "code.prgm.get_label('\\1')", cmd)

                # SECURITY: eval() runs whatever text is in the editor.
                # Only use this with text the user typed themselves.
                try:
                    inst = eval('spu.%s' % strcmd)
                except Exception:
                    # Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt and SystemExit.
                    print('Error creating instruction: %s' % cmd)

            # NOTE(review): inst is still None here when evaluation
            # failed and is passed to code.add() anyway, as before -
            # confirm that this is intended.
            code.add(inst)

    prgm.add(code)
    prgm.cache_code()
    return code
def GenerateStream(self, step=None):
    """
    Build an instruction stream from the text in the editor control.

    step -- when not None, only the line with this index is synthesized
            as written; every other instruction line becomes
            stop(0x2FFF).  Labels are always kept.

    Blank lines and lines starting with '#' are skipped.  A line ending
    in ':' adds the label of that name.  A line with a breakpoint set
    becomes stop(0x2FFF).  Any other line is evaluated as an `spu.`
    call, with Label(name) rewritten to a label lookup.

    Returns the stream, after adding it to a new Program and caching
    that program.
    """
    prgm = env.Program()
    # NOTE: the evaluated instruction text refers to this local by the
    # name `code`; do not rename it.
    code = prgm.get_stream()

    for i, line in enumerate(self.editCtrl.GetText().split('\n')):
        # For the stop case, want all instructions except the current one to be
        # STOP instructions.
        cmd = line.strip()

        if step is not None and i != step:
            if cmd == "" or cmd[0] == '#':
                continue

            if cmd[-1] == ":":
                # Label - better parsing?
                code.add(code.prgm.get_label(cmd[:-1]))
            else:
                code.add(spu.stop(0x2FFF))
            continue

        if self.editCtrl.IsBreakSet(i):
            code.add(spu.stop(0x2FFF))
            continue

        if cmd != "" and cmd[0] != '#':
            inst = None
            if cmd[-1] == ":":
                # Label - better parsing?
                inst = code.prgm.get_label(cmd[:-1])
            else:
                # Instruction
                strcmd = re.sub(r"Label\((.*?)\)", "code.prgm.get_label('\\1')", cmd)

                # SECURITY: eval() runs whatever text is in the editor.
                # Only use this with text the user typed themselves.
                try:
                    inst = eval('spu.%s' % strcmd)
                except Exception:
                    # Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt and SystemExit.
                    print('Error creating instruction: %s' % cmd)

            # NOTE(review): inst is still None here when evaluation
            # failed and is passed to code.add() anyway, as before -
            # confirm that this is intended.
            code.add(inst)

    prgm.add(code)
    prgm.cache_code()
    return code
def SimpleSPU():
    """
    Run two small SPU programs.  The first computes 11 + 31 in the gp
    return register and must come back as (42, 0x100A).  The second loads
    3.14 into the fp return register; its result is printed.
    """
    prgm = env.Program()
    stream = prgm.get_stream()
    processor = env.Processor()

    spu.set_active_code(stream)

    # Acquire two registers
    acc = stream.gp_return
    flag = prgm.acquire_register(reg_name=55)

    spu.xor(acc, acc, acc)  # zero the accumulator
    for addend in (11, 31):
        spu.ai(acc, acc, addend)  # acc = acc + addend
    spu.ceqi(flag, acc, 42)  # flag = (acc == 42)

    # If flag is false (all 0s), skip the stop(0x100A) instruction
    spu.brz(flag, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    prgm.add(stream)
    prgm.print_code(hex=True)
    result = processor.execute(prgm, mode='int', stop=True, debug=True)
    assert result[0] == 42
    assert result[1] == 0x100A

    # Second program: floating point return value.
    prgm = env.Program()
    stream = prgm.get_stream()
    spu.set_active_code(stream)

    util.load_float(stream, stream.fp_return, 3.14)

    prgm.add(stream)
    prgm.print_code(hex=True)
    result = processor.execute(prgm, mode='fp')
    print(result)
def SimpleSPU():
    """
    Run two small SPU programs.  The first computes 11 + 31 in the gp
    return register and must come back as (42, 0x100A).  The second loads
    3.14 into the fp return register; its result is printed.
    """
    prgm = env.Program()
    stream = prgm.get_stream()
    processor = env.Processor()

    spu.set_active_code(stream)

    # Acquire two registers
    acc = stream.gp_return
    flag = prgm.acquire_register(reg_name=55)

    spu.xor(acc, acc, acc)  # zero the accumulator
    for addend in (11, 31):
        spu.ai(acc, acc, addend)  # acc = acc + addend
    spu.ceqi(flag, acc, 42)  # flag = (acc == 42)

    # If flag is false (all 0s), skip the stop(0x100A) instruction
    spu.brz(flag, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    prgm.add(stream)
    prgm.print_code(hex=True)
    result = processor.execute(prgm, mode='int', stop=True, debug=True)
    assert result[0] == 42
    assert result[1] == 0x100A

    # Second program: floating point return value.
    prgm = env.Program()
    stream = prgm.get_stream()
    spu.set_active_code(stream)

    util.load_float(stream, stream.fp_return, 3.14)

    prgm.add(stream)
    prgm.print_code(hex=True)
    result = processor.execute(prgm, mode='fp')
    print(result)
def bi_bug():
    """
    Regression check for branch indirect: store a word holding 0x200D at
    address 0 with stqa, branch indirect through a register holding 0,
    and expect the execution result to be 0xD.  A stop(0x200A) follows
    the branch in the stream.
    """
    stream = InstructionStream()
    processor = Processor()

    spu.set_active_code(stream)

    stop_word = SignedWord(0x200D)
    target_addr = SignedWord(0x0)

    spu.stqa(stop_word, 0x0)
    spu.bi(target_addr)
    spu.stop(0x200A)

    result = processor.execute(stream)
    assert result == 0xD
def bi_bug():
    """
    Regression check for branch indirect: store a word holding 0x200D at
    address 0 with stqa, branch indirect through a register holding 0,
    and expect the execution result to be 0xD.  A stop(0x200A) follows
    the branch in the stream.
    """
    stream = InstructionStream()
    processor = Processor()

    spu.set_active_code(stream)

    stop_word = SignedWord(0x200D)
    target_addr = SignedWord(0x0)

    spu.stqa(stop_word, 0x0)
    spu.bi(target_addr)
    spu.stop(0x200A)

    result = processor.execute(stream)
    assert result == 0xD
def TestInt():
    """
    Synthesize a few add-immediate instructions on fixed registers 13 and
    20, run them with stop=True and check the returned pair: element 0
    must be 0 and element 1 must be the stop code 0x200D.
    """
    stream = InstructionStream()
    processor = Processor()

    spu.set_active_code(stream)

    reg13 = stream.acquire_register(reg=13)
    reg20 = stream.acquire_register(reg=20)

    spu.ai(reg20, reg20, 13)
    # Five identical add-immediates on register 13.
    for _ in range(5):
        spu.ai(reg13, reg13, 13)

    spu.stop(0x200D)

    result = processor.execute(stream, stop=True)

    assert result[0] == 0
    assert result[1] == 0x200D
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg=13) r20 = code.acquire_register(reg=20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg = 13) r20 = code.acquire_register(reg = 20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return
def TestParallel():
    """
    Start a three-instruction parallel stream asynchronously with
    n_spus=6 and join every returned SPE id.
    """
    # Run this with a stop instruction and examine the registers and memory
    stream = ParallelInstructionStream()
    processor = Processor()

    stream.raw_data_size = 128 * 8

    reg = stream.acquire_register()
    for immediate in (0xCAFE, 0xBABE):
        stream.add(spu.ai(reg, reg, immediate))
    stream.add(spu.stop(0x2000))

    spe_ids = processor.execute(stream, mode='async', n_spus=6)
    for spe_id in spe_ids:
        processor.join(spe_id)

    assert True
def TestParallel():
    """
    Start a three-instruction parallel stream asynchronously with
    n_spus=6 and join every returned SPE id.
    """
    # Run this with a stop instruction and examine the registers and memory
    stream = ParallelInstructionStream()
    processor = Processor()

    stream.raw_data_size = 128 * 8

    reg = stream.acquire_register()
    for immediate in (0xCAFE, 0xBABE):
        stream.add(spu.ai(reg, reg, immediate))
    stream.add(spu.stop(0x2000))

    spe_ids = processor.execute(stream, mode='async', n_spus=6)
    for spe_id in spe_ids:
        processor.join(spe_id)

    assert True
def TestParallel(): # Run this with a stop instruction and examine the registers and memory prgm = ParallelProgram() code = prgm.get_stream() proc = Processor() code.raw_data_size = 128 * 8 r = prgm.acquire_register() code.add(spu.ai(r, r, 0x2FE)) code.add(spu.ai(r, r, 0x2BE)) code.add(spu.stop(0x1FFF)) prgm += code r = proc.execute(prgm, async=True, mode='void', n_spus=6) for speid in r: proc.join(speid) assert (True) return
def TestParallel(): # Run this with a stop instruction and examine the registers and memory prgm = ParallelProgram() code = prgm.get_stream() proc = Processor() code.raw_data_size = 128*8 r = prgm.acquire_register() code.add(spu.ai(r, r, 0x2FE)) code.add(spu.ai(r, r, 0x2BE)) code.add(spu.stop(0x1FFF)) prgm += code r = proc.execute(prgm, async = True, mode='void', n_spus = 6) for speid in r: proc.join(speid) assert(True) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = prgm.gp_return test = prgm.acquire_register() lbl_brz = prgm.get_label("BRZ") lbl_skip = prgm.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) lbl_loop = prgm.get_label("LOOP") lbl_break = prgm.get_label("BREAK") r_cnt = prgm.acquire_register() r_stop = prgm.acquire_register() r_cmp = prgm.acquire_register() r_foo = prgm.gp_return spu.ori(r_foo, prgm.r_zero, 0) spu.ori(r_cnt, prgm.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 55) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = code.acquire_register() lbl_brz = code.get_label("BRZ") lbl_skip = code.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) code.print_code(hex=True, pro=True, epi=True) r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 42) assert (r[1] == 0x100A) code = InstructionStream() spu.set_active_code(code) lbl_loop = code.get_label("LOOP") lbl_break = code.get_label("BREAK") r_cnt = code.acquire_register() r_stop = code.acquire_register() r_cmp = code.acquire_register() r_foo = code.gp_return spu.ori(r_foo, code.r_zero, 0) spu.ori(r_cnt, code.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) code.print_code() r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 55) return
def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active = True).render() save_code[i + 1] = spu.stop(0x6, ignore_active = True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active = True).render() ret = spu.bra(self.debug_lsa, ignore_active = True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag); # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active = True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs
def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active=True).render() save_code[i + 1] = spu.stop(0x6, ignore_active=True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active=True).render() ret = spu.bra(self.debug_lsa, ignore_active=True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag) # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active=True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs
def TestTanimotoBlock(n_vecs = 4):
    """
    Time a synthesized TanimotoBlock kernel on one SPU and print the
    elapsed time and derived throughput figures.

    n_vecs -- multiplier for the bit count (n_bits = 128 * n_vecs); it
              also scales the buffer addresses and the reported rates.

    The kernel is run asynchronously; the host busy-waits on the outbound
    mailbox, takes the stop time, and then joins the SPE.  Nothing is
    asserted or returned.
    """
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    tb = TanimotoBlock()
    ls_save = LocalSave()
    mm_save = MemorySave()

    code.set_debug(True)

    # Input block parameters
    m = 128
    n = 64
    # n_vecs = 9
    n_bits = 128 * n_vecs

    # Main memory results buffer
    # max_results = 2**16
    max_results = 16384
    words_per_result = 4

    # Every word of the results array starts out as 12.
    mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)])
    #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I')
    # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data))

    mm_results = spuiter.memory_desc('I')
    #mm_results.from_array(mm_results_buffer)
    mm_results.from_array(mm_results_data)

    mm_save.set_md_save_buffer(mm_results)

    # Local Results buffer
    buffer_size = var.SignedWord(16384)
    buffer_addr = var.SignedWord(m * n * n_vecs * 4)
    ls_results = spuiter.memory_desc('B')
    ls_results.set_size_reg(buffer_size)
    ls_results.set_addr_reg(buffer_addr)

    ls_save.set_md_results(ls_results)
    ls_save.set_mm_save_op(mm_save)

    # Setup the TanimotoBlock class
    tb.set_n_bits(n_bits)
    tb.set_block_size(m, n)

    tb.set_x_addr(0)
    tb.set_y_addr(m * n_vecs * 16)
    tb.set_save_op(ls_save)

    # Main test loop
    n_samples = 10000
    # NOTE(review): the loop body is taken to be the single synthesize
    # call, with the mailbox write and stop emitted once after the loop -
    # confirm against the original layout.
    for samples in spuiter.syn_iter(code, n_samples):
        tb.synthesize(code)

    spu.wrch(buffer_size, dma.SPU_WrOutMbox)

    spu.stop(0x2000)

    # "Function" Calls
    ls_save.block()
    mm_save.block()

    # code.print_code()
    start = time.time()
    spe_id = proc.execute(code, async=True)

    # Busy-wait until the mailbox reports an entry.
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
    stop = time.time()

    # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data))

    proc.join(spe_id)
    total = stop - start
    bits_sec = (m * n * n_bits * n_samples) / total / 1e9
    ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
    insts_per_compare = 56
    gops = (m * n * n_vecs * n_samples * ops_per_compare ) / total / 1e9
    ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9
    print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
        total, bits_sec, gops, ginsts, code.size())
    return
class DebugProcessor(spe.Processor): """ Experimental class for simple debugging. """ exec_module = spu_exec debug_stop = spu.stop(DEBUG_STOP, ignore_active=True) def __init__(self): spe.Processor.__init__(self) self.params = None self.spe_id = None self.code = None self.ea = None self.lsa = None self.inst_size = None self.last_pc = None self.last_stop = None self.stop_code = None self.instructions = {} # key: inst, backup copy of we've replaced return def execute(self, code, mode='int', debug=False, params=None, n_spus=1): if type(code) is ParallelInstructionStream: raise Exception( 'DebugProcessor does not support ParallelInstructionStream') self.code = code if len(code) == 0: return None # Add the debug instructions - two each for normal instructions and branch targets self.debug_idx = self.code.size() self.code.add(spu.stop(DEBUG_STOP)) self.debug_branch = self.code.size() self.code.add(spu.stop(DEBUG_STOP)) self.debug_target_idx = self.code.size() self.code.add(spu.stop(DEBUG_STOP_TARGET)) self.debug_target_branch = self.code.size() self.code.add(spu.stop(DEBUG_STOP_TARGET)) # Cache the code here if not code._cached: code.cache_code() # Setup the parameter structure if params is None: params = spu_exec.ExecParams() addr = code._prologue.inst_addr() params.addr = addr params.size = len(code.render_code) * 4 # size in bytes self.params = params self.ea = code._prologue.inst_addr() self.lsa = (0x3FFFF - params.size) & 0xFFF80 self.size = params.size + (16 - params.size % 16) self.last_pc = self.lsa self.last_stop = (1, ) self.debug_lsa = (self.lsa + self.code.code_offset * 4 + self.debug_idx * 4) >> 2 self.debug_target_lsa = (self.lsa + self.code.code_offset * 4 + self.debug_target_idx * 4) >> 2 mode = 'async' # TODO: Factor replacing into one function in case the first one is a branch self.replace(self.last_stop[0], spu.bra(self.debug_lsa, ignore_active=True)) self.spe_id = spe.Processor.execute(self, code, mode, debug, params) code.print_code() retval = 
self.wait_debug() return retval def replace(self, idx, inst): self.instructions[idx] = self.code[idx] self.code.debug_set(idx, inst) return def restore(self, idx): """ Restore the function at idx and return a reference to the instruction """ # self.code._prologue._code[idx] = self.instructions[idx] self.code.debug_set(idx, self.instructions[idx]) return self.code[idx] def get_instructions(self): # return spe_mfc_getb(speid, ls, (void *)ea, size, tag, tid, rid); tag = 5 ea = self.code._prologue.inst_addr() spu_exec.spu_getb(self.spe_id, self.lsa, ea, self.size, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag) return def wait_debug(self): r = spu_exec.wait_stop_event(self.spe_id) if r not in (DEBUG_STOP, DEBUG_STOP_TARGET): print 'Warning: SPU stopped for unknown reason:', r else: print 'Debug stop: 0x%X' % r return r def nexti(self): if len(self.last_stop) == 1: # Restore a single instruction current_inst = self.restore(self.last_stop[0]) last_idx = self.last_stop[0] else: # Restore two branch targets and determine which branch was taken # based on the stop code i1 = self.restore(self.last_stop[0]) i2 = self.restore(self.last_stop[1]) if self.stop_code == DEBUG_STOP: current_inst = i1 last_idx = self.last_stop[0] else: current_inst = i2 last_idx = self.last_stop[1] # If the current instruction is a branch, get the location # of all possible next instructions if isinstance(current_inst, (spu.br, spu.brsl)): next_stop = (self.last_stop[0] + current_inst.I16, ) print 'next br:', next_stop elif isinstance(current_inst, (spu.bra, spu.brasl)): next_stop = (current_inst.I16 - (self.lsa >> 2), ) elif isinstance(current_inst, (spu.brnz, spu.brz, spu.brhnz, spu.brhz)): next_stop = (self.last_stop[0] + 1, self.last_stop[0] + current_inst.I16) elif isinstance(current_inst, (spu.bi, spu.bisled, spu.bisl)): raise Exception( "DebugProcessor does not support branch indirect (bi) instructions" ) else: next_stop = (self.last_stop[0] + 1, ) # TODO: Get rid of last 
instruction. Do something smarter. last_instruction = (next_stop[0] == (self.debug_idx - 1)) # !!! STOPPED HERE !!! # !!! STILL WRONG !!! if not last_instruction: # Normal instructions and single target branches self.replace(next_stop[0], spu.bra(self.debug_lsa, ignore_active=True)) print 'target (1):', -(self.debug_lsa - ((self.lsa >> 2) + next_stop[0]) ), self.debug_lsa, last_idx, self.lsa self.replace( self.debug_branch, spu.br(-(self.debug_lsa - ((self.lsa >> 2) + next_stop[0])), ignore_active=True)) # Branch target for test-based branch instructions if len(next_stop) == 2: self.replace( next_stop[1], spu.bra(self.debug_target_lsa, ignore_active=True)) print 'target (2):', -(self.debug_target_lsa - ( (self.lsa >> 2) + next_stop[1])), self.debug_target_lsa self.replace( self.debug_target_branch, spu.br(-(self.debug_target_lsa - ((self.lsa >> 2) + next_stop[1])), ignore_active=True)) # self.replace(next_stop, self.debug_stop) self.get_instructions() self.code.print_code() self.resume(self.spe_id) if last_instruction: r = self.join(self.spe_id) r = None else: r = self.wait_debug() self.last_stop = next_stop self.stop_code = r return r def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active=True).render() save_code[i + 1] = spu.stop(0x6, ignore_active=True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active=True).render() ret = spu.bra(self.debug_lsa, ignore_active=True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, 
save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag) # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active=True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs def dump_mem(self): # Use putb to copy the local store to Python array return