def _update_inc_count(self):
    """
    Emit the count/stop adjustments for an incrementing loop that is
    split into blocks.

    The program's block registers are acquired, the raw data size
    (count - start) is recorded on the program, and the block offset
    register is added into the wrapped iterator's r_count.  When the
    iterator has a stop register, it is set to block size + r_count.

    The block registers are not released by this method.
    """
    stream = self.obj.code
    program = stream.prgm

    program.acquire_block_registers()
    block_size_reg = program.r_block_size
    offset_reg = program.r_offset

    # Only the raw element count is recorded here.
    # NOTE(review): the block size and offset registers are not computed in
    # this method; presumably the program fills them in - confirm.
    program.raw_data_size = self.get_count() - self.get_start()

    # Offset is rank * block_size
    # Count is count + offset
    # (this matters mostly for ranges, whose r_count does not start at 0)
    stream.add(spu.a(self.obj.r_count, offset_reg, self.obj.r_count))

    if self.obj.r_stop is None:
        return

    # Stop is count + block_size
    stream.add(spu.a(self.obj.r_stop, block_size_reg, self.obj.r_count))
def vector_from_array(code, r_target, a):
    """
    Emit instructions that fill the vector register r_target with the
    four values a[0], a[1], a[2] and a[3].

    a[0] is loaded straight into r_target.  Each remaining value is loaded
    into a scratch register, rotated by a number of bytes, and then added
    into r_target.  The three scratch registers are released before
    returning.
    """
    prgm = code.prgm
    scratch = [prgm.acquire_register() for _ in range(3)]

    # NOTE(review): the trailing True passed to load_word presumably clears
    # the other words of the register - confirm against load_word.
    load_word(code, r_target, a[0], True)

    # (index into a, byte rotation) for each scratch register, in order
    placement = ((1, 12), (2, 8), (3, 4))
    for reg, (index, rotation) in zip(scratch, placement):
        load_word(code, reg, a[index], True)
        code.add(spu.rotqbyi(reg, reg, rotation))  # rotate qw by bytes

    # Merge the rotated values into the target register
    for reg in scratch:
        code.add(spu.a(r_target, r_target, reg))

    for reg in scratch:
        prgm.release_register(reg)
def _ab(self, x, y, ab, temp):
    """
    Add the bit count of (x XOR y) to ab, using temp as scratch.

    The instructions are issued through the spu module's active code.
    """
    # temp = x ^ y
    spu.xor(temp, x, y)

    # Count the set bits per byte, then sum the byte counts
    spu.cntb(temp, temp)
    spu.sumb(temp, temp, 0)

    # Accumulate into the running total
    spu.a(ab, ab, temp)
def _c(self, x, y, c, temp):
    """
    Add the bit count of (x AND y) to c, using temp as scratch.

    The instructions are issued through the spu module's active code.
    """
    # temp = x & y
    spu.and_(temp, x, y)

    # Count the set bits per byte, then sum the byte counts
    spu.cntb(temp, temp)
    spu.sumb(temp, temp, 0)

    # Accumulate into the running total
    spu.a(c, c, temp)
def TestInt2(i0 = 0, i1 = 1):
    """
    Build and execute a small SPU program from two seed integers.

    i0 and i1 are extended with i2 = i0 + i1 and i3 = i1 + i2, and the four
    values are packed into the four words of register 6.  A short loop on
    r_loop follows, then a stop(0x2005).

    Parameters:
      i0, i1 -- seed values; expected to be 32 bit integers.
    """
    i2 = i0 + i1
    i3 = i1 + i2

    code = InstructionStream()
    proc = Processor()

    # Hard-coded register numbers
    r_loop = 4
    r_address = 5
    r0 = 6
    r1 = 7
    r2 = 8
    r3 = 9

    #################
    # Pack quadword #
    #################

    def load_value_int32(code, reg, value, clear = False):
        # obviously, value should be 32 bit integer
        # Floor division keeps the upper halfword an integer.
        code.add(spu.ilhu(reg, value // pow(2, 16)))  # immediate load halfword upper
        code.add(spu.iohl(reg, value % pow(2, 16)))   # immediate or halfword lower
        if clear:
            # shift left qw by bytes, clears right bytes
            code.add(spu.shlqbyi(reg, reg, 12))
        return

    load_value_int32(code, r0, i0, True)
    load_value_int32(code, r1, i1, True)
    code.add(spu.rotqbyi(r1, r1, 12))  # rotate qw by bytes
    load_value_int32(code, r2, i2, True)
    code.add(spu.rotqbyi(r2, r2, 8))
    load_value_int32(code, r3, i3, True)
    code.add(spu.rotqbyi(r3, r3, 4))
    code.add(spu.a(r0, r0, r1))
    code.add(spu.a(r0, r0, r2))
    code.add(spu.a(r0, r0, r3))

    ##########

    # Main loop (originally intended to calculate the Fibonacci sequence)
    # The helper's keyword is `clear`; the old `clear_bits` spelling raised
    # a TypeError before any of the loop code was generated.
    load_value_int32(code, r_address, pow(2, 16), clear = False)  # start at 64K

    load_value_int32(code, r_loop, 0, clear = False)
    start_label = code.size() + 1

    code.add(spu.sfi(r_loop, r_loop, 1))

    # Position of the branch, taken with the same size() + 1 convention as
    # start_label, so the difference is the distance back to the loop start.
    # This replaces the never-assigned name `next`.
    # NOTE(review): the offset is scaled by spu.WORD_SIZE as in the original;
    # confirm the unit that brnz expects.
    branch_index = code.size() + 1
    code.add(spu.brnz(r_loop, (-(branch_index - start_label) * spu.WORD_SIZE)))

    code.add(spu.stop(0x2005))

    r = proc.execute(code)
    # assert(r == 12)
    # print 'int result:', r

    return
def _reduce_word(self, words, result):
    """
    Add-reduce a vector of words into the preferred slot of result.

    result is not cleared first; the words are added to whatever it
    already holds.  words is rotated four times by 4 bytes, i.e. a full
    16 bytes in total.
    """
    words_per_vector = 4
    for _ in range(words_per_vector):
        # Accumulate the current layout, then bring the next word forward
        spu.a(result, words, result)
        spu.rotqbyi(words, words, 4)
def popc(self, count, x):
    """
    Add the number of 1 bits in each word in X to the value in count.

    A scratch register is taken from the active code for the duration of
    the computation and released afterwards.
    """
    active = spu.get_active_code()
    bits = active.acquire_register()

    spu.cntb(bits, x)           # per-byte bit counts of x
    spu.sumb(bits, bits, 0)     # sum the byte counts
    spu.a(count, count, bits)   # accumulate into count

    active.release_register(bits)
def _ab_c(self, x, y, ab, c, ab_temp, c_temp):
    """
    Interleave ab and c computations.

    Adds the bit count of (x XOR y) to ab and the bit count of (x AND y)
    to c.  Each step is issued for the ab value first and the c value
    second, so the two instruction chains alternate.
    """
    temps = (ab_temp, c_temp)

    spu.xor(ab_temp, x, y)
    spu.and_(c_temp, x, y)

    # Per-byte bit counts for both temporaries
    for reg in temps:
        spu.cntb(reg, reg)

    # Sum the byte counts for both temporaries
    for reg in temps:
        spu.sumb(reg, reg, 0)

    # Fold each partial count into its accumulator
    for total, partial in ((ab, ab_temp), (c, c_temp)):
        spu.a(total, total, partial)
def TestParams():
    """
    Sum the ten SPU execution parameters and check the result.

    The program zeroes gp_return, adds each of spu_param_1..spu_param_10
    into it, and compares the sum with 55.  The parameters are set to
    1..10, so the run is expected to return 55 together with stop code
    0x200A.
    """
    # Run this with a stop instruction and examine the registers
    prgm = Program()
    code = prgm.get_stream()
    proc = Processor()

    r_sum = prgm.gp_return
    r_current = prgm.acquire_register()

    expected_sum = sum(range(1, 11))  # 55

    # Zero the sum
    code.add(spu.xor(r_sum, r_sum, r_sum))

    param_sources = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for source in param_sources:
        copy_param(code, r_current, source)
        code.add(spu.a(r_sum, r_sum, r_current))

    # Compare with the expected total; brz skips the 0x200A stop when the
    # comparison result is zero.
    code.add(spu.ceqi(r_current, r_sum, expected_sum))
    code.add(spu.brz(r_current, 2))
    code.add(spu.stop(0x200A))
    code.add(spu.stop(0x200B))

    # p1..p10 = 1..10
    params = spu_exec.ExecParams()
    for index in range(1, 11):
        setattr(params, 'p%d' % index, index)

    prgm += code
    r = proc.execute(prgm, params=params, stop=True)

    assert r[0] == expected_sum
    assert r[1] == 0x200A
def init_address(self):
    """
    Initialize the address register and offset it by the current count.

    The wrapped object's init_address is called with this object first;
    afterwards an add of the wrapped object's r_count into r_addr is
    appended to the wrapped object's code.
    """
    self.obj.init_address(self)

    # Update the address with the offset.
    # For variable iterators, this is the value already computed for r_count
    wrapped = self.obj
    add_offset = spu.a(self.r_addr, wrapped.r_count, self.r_addr)
    wrapped.code.add(add_offset)
def mem_write_in_mbox(code, psmap, lsa, tag, cache = False):
    """Write a 32bit message at a local LSA from this SPU to another.
       psmap must contain the base address of the target SPU's PS map.
       lsa must be 12 mod 16 for DMA alignment purposes.
       This is a DMA operation; it must be completed using mem_complete() or
       similar method.

       Parameters:
         code  -- stream that receives the generated instructions
         psmap -- PS map base address, as an integer or a register
         lsa   -- local store address, as an integer or a register; the
                  alignment is only checked for integers
         tag   -- passed through to mem_put()
         cache -- when true, the register holding the constant 4 is kept in
                  program storage for reuse instead of being released

       Raises AssertionError when an integer lsa is not 12 mod 16."""

    # Raise explicitly rather than via `assert`, so the check survives -O.
    if isinstance(lsa, (int, long)) and (lsa % 16) != 12:
        raise AssertionError(
            "ERROR LSA for mem_write_in_mbox() must be 12 mod 16")

    prgm = code.prgm

    # Target address is psmap + 0x400C.
    # NOTE(review): 0x400C is presumably the inbound mailbox offset in the
    # PS map - confirm.
    # This register is rebuilt on every call; caching it is not implemented.
    r_mbox_mma = prgm.acquire_register()
    if isinstance(psmap, (int, long)):
        util.load_word(code, r_mbox_mma, psmap + 0x400C)
    else:
        util.load_word(code, r_mbox_mma, 0x400C)
        code.add(spu.a(r_mbox_mma, r_mbox_mma, psmap))

    # Transfer size (4 bytes): reuse a stored register when one exists.
    ref = "_const_val_4"
    r_size = prgm.get_storage(ref)
    release_size = False
    if not isinstance(r_size, spu.Register):
        r_size = prgm.acquire_register()
        util.load_word(code, r_size, 4)

        if cache:
            prgm.add_storage(ref, r_size)
        else:
            # Only a register acquired here and not stored is ours to free.
            release_size = True

    mem_put(code, lsa, r_mbox_mma, r_size, tag)

    prgm.release_register(r_mbox_mma)
    if release_size:
        prgm.release_register(r_size)
    return
def save_ls_buffer(self, ls_size=None, branch=False):
    """
    Emit code that puts the local store buffer to main memory and then
    updates the buffer bookkeeping.

    Parameters:
      ls_size -- register used as the transfer size.  When None, a scratch
                 register is acquired for it.
                 NOTE(review): that scratch register is not loaded with a
                 value in this method - confirm this is intended.
      branch  -- not used by this method.

    All scratch registers acquired here are released before returning.
    """
    code = spu.get_active_code()
    scratch = []

    def take_register():
        # Acquire a register and remember it for the final release
        reg = code.acquire_register()
        scratch.append(reg)
        return reg

    if ls_size is None:
        ls_size = take_register()

    # Set the main memory address: first word of mm_buffer plus the word
    # 4 bytes in (presumably base address + offset - confirm).
    mm_offset = take_register()
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, self.mm_buffer)

    # Transfer the buffer
    md = spuiter.memory_desc('b')
    md.set_size_reg(ls_size)
    md.set_addr_reg(mm_offset)
    md.put(code, self.ls_buffer)

    # Increment the main memory offset: add the words 4 and 8 bytes into
    # mm_buffer and store the sum back in slot 2.
    mm_size = take_register()
    spu.rotqbyi(mm_size, self.mm_buffer, 8)
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, mm_size)
    util.set_slot_value(code, self.mm_buffer, 2, mm_offset)

    # Reset the ls offset
    util.set_slot_value(code, self.ls_buffer, 2, 0)

    code.release_registers(scratch)
def save_ls_buffer(self, ls_size = None, branch = False):
    """
    Emit code that puts the local store buffer to main memory and then
    updates the buffer bookkeeping.

    Parameters:
      ls_size -- register used as the transfer size.  When None, a scratch
                 register is acquired for it.
                 NOTE(review): that scratch register is not loaded with a
                 value in this method - confirm this is intended.
      branch  -- not used by this method.

    All scratch registers acquired here are released before returning.
    """
    code = spu.get_active_code()
    acquired = []

    def scratch_register():
        # Acquire a register and track it so it is released at the end
        reg = code.acquire_register()
        acquired.append(reg)
        return reg

    if ls_size is None:
        ls_size = scratch_register()

    # Set the main memory address: first word of mm_buffer plus the word
    # 4 bytes in (presumably base address + offset - confirm).
    mm_offset = scratch_register()
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, self.mm_buffer)

    # Transfer the buffer
    md = spuiter.memory_desc('b')
    md.set_size_reg(ls_size)
    md.set_addr_reg(mm_offset)
    md.put(code, self.ls_buffer)

    # Increment the main memory offset: add the words 4 and 8 bytes into
    # mm_buffer and store the sum back in slot 2.
    mm_size = scratch_register()
    spu.rotqbyi(mm_size, self.mm_buffer, 8)
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, mm_size)
    util.set_slot_value(code, self.mm_buffer, 2, mm_offset)

    # Reset the ls offset
    util.set_slot_value(code, self.ls_buffer, 2, 0)

    code.release_registers(acquired)
def TestParams():
    """
    Sum the ten SPU execution parameters and check the result.

    The program zeroes gp_return, adds each of spu_param_1..spu_param_10
    into it, and compares the sum with 55.  The parameters are set to
    1..10, so the run is expected to return 55 together with stop code
    0x200A.
    """
    # Run this with a stop instruction and examine the registers
    prgm = Program()
    code = prgm.get_stream()
    proc = Processor()

    r_sum = prgm.gp_return
    r_current = prgm.acquire_register()

    param_values = range(1, 11)
    expected_total = sum(param_values)  # 55

    # Zero the sum
    code.add(spu.xor(r_sum, r_sum, r_sum))

    param_registers = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for param in param_registers:
        copy_param(code, r_current, param)
        code.add(spu.a(r_sum, r_sum, r_current))

    # Compare with the expected total; brz skips the 0x200A stop when the
    # comparison result is zero.
    code.add(spu.ceqi(r_current, r_sum, expected_total))
    code.add(spu.brz(r_current, 2))
    code.add(spu.stop(0x200A))
    code.add(spu.stop(0x200B))

    # p1..p10 = 1..10
    params = spu_exec.ExecParams()
    for value in param_values:
        setattr(params, 'p%d' % value, value)

    prgm += code
    r = proc.execute(prgm, params=params, stop=True)

    assert (r[0] == expected_total)
    assert (r[1] == 0x200A)
def cleanup(self):
    """
    Do end-of-loop iterator code.

    Appends one instruction that steps r_count: downwards in DEC mode,
    upwards in INC mode.  The step comes from r_step when that register is
    set, otherwise from step_size() as an immediate.  Nothing is emitted
    for any other mode.
    """
    mode = self.mode
    has_step_reg = self.r_step is not None

    # Update the current count
    if mode == DEC:
        if has_step_reg:
            # subtract-from: r_count = r_count - r_step
            step = spu.sf(self.r_count, self.r_step, self.r_count)
        else:
            step = spu.ai(self.r_count, self.r_count, -self.step_size())
    elif mode == INC:
        if has_step_reg:
            step = spu.a(self.r_count, self.r_count, self.r_step)
        else:
            step = spu.ai(self.r_count, self.r_count, self.step_size())
    else:
        return

    self.code.add(step)
def TestParams():
    """
    Sum the ten SPU execution parameters and check the result.

    The program zeroes a sum register, adds each of
    spu_param_1..spu_param_10 into it, and compares the sum with 55.  The
    parameters are set to 1..10 and the run is expected to return 0xA.
    """
    # Run this with a stop instruction and examine the registers
    code = InstructionStream()
    proc = Processor()

    r_sum = code.acquire_register()
    r_current = code.acquire_register()

    # Zero the sum
    code.add(spu.xor(r_sum, r_sum, r_sum))

    param_registers = (
        spu_param_1, spu_param_2, spu_param_3, spu_param_4, spu_param_5,
        spu_param_6, spu_param_7, spu_param_8, spu_param_9, spu_param_10,
    )
    for param in param_registers:
        copy_param(code, r_current, param)
        code.add(spu.a(r_sum, r_sum, r_current))

    # Compare with 55 (1 + 2 + ... + 10); brz skips the 0x200A stop when
    # the comparison result is zero.
    code.add(spu.ceqi(r_current, r_sum, 55))
    code.add(spu.brz(r_current, 2))
    code.add(spu.stop(0x200A))
    code.add(spu.stop(0x200B))

    # p1..p10 = 1..10
    params = spu_exec.ExecParams()
    for value in range(1, 11):
        setattr(params, 'p%d' % value, value)

    r = proc.execute(code, params=params)

    assert (r == 0xA)
# Finish the program built above this fragment: add its stream, dump the
# generated code, then run it in 'int' mode and show the result.
prgm.add(code)
prgm.print_code() # TODO - support print prgm instead?

ret = proc.execute(prgm, mode = 'int')
print "ret", ret

# Start a fresh program that adds the same substream to the main stream twice.
prgm = env.Program()
code = prgm.get_stream()

r_add = prgm.acquire_register()

# Generate substream
# Shift gp_return left by 1 (multiply by 2), then add r_add.
# r_add is set to 1 before the first insertion and to 2 before the second.
subcode = prgm.get_stream()
subcode.add(spu.shli(subcode.gp_return, subcode.gp_return, 1))
subcode.add(spu.a(subcode.gp_return, subcode.gp_return, r_add))

# Initialize gp_return, insert code
code.add(spu.il(r_add, 1))
code.add(spu.il(code.gp_return, 5))
code.add(subcode)

# Add 3, insert again
# NOTE(review): if the substream is emitted inline at each add, gp_return
# goes 5 -> 11 -> 14 -> 30 - confirm.
code.add(spu.il(r_add, 2))
code.add(spu.ai(code.gp_return, code.gp_return, 3))
code.add(subcode)

prgm.add(code)
prgm.print_code()
# Finish the program built above this fragment: add its stream, dump the
# generated code, then run it in 'int' mode and show the result.
prgm.add(code)
prgm.print_code() # TODO - support print prgm instead?

ret = proc.execute(prgm, mode='int')
print "ret", ret

# Start a fresh program that adds the same substream to the main stream twice.
prgm = env.Program()
code = prgm.get_stream()

r_add = prgm.acquire_register()

# Generate substream
# Shift gp_return left by 1 (multiply by 2), then add r_add.
# r_add is set to 1 before the first insertion and to 2 before the second.
subcode = prgm.get_stream()
subcode.add(spu.shli(subcode.gp_return, subcode.gp_return, 1))
subcode.add(spu.a(subcode.gp_return, subcode.gp_return, r_add))

# Initialize gp_return, insert code
code.add(spu.il(r_add, 1))
code.add(spu.il(code.gp_return, 5))
code.add(subcode)

# Add 3, insert again
# NOTE(review): if the substream is emitted inline at each add, gp_return
# goes 5 -> 11 -> 14 -> 30 - confirm.
code.add(spu.il(r_add, 2))
code.add(spu.ai(code.gp_return, code.gp_return, 3))
code.add(subcode)

prgm.add(code)
prgm.print_code()

# Run the second program in 'int' mode as well.
ret = proc.execute(prgm, mode='int')
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = code.acquire_register() lbl_brz = code.get_label("BRZ") lbl_skip = code.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) code.print_code(hex=True, pro=True, epi=True) r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 42) assert (r[1] == 0x100A) code = InstructionStream() spu.set_active_code(code) lbl_loop = code.get_label("LOOP") lbl_break = code.get_label("BREAK") r_cnt = code.acquire_register() r_stop = code.acquire_register() r_cmp = code.acquire_register() r_foo = code.gp_return spu.ori(r_foo, code.r_zero, 0) spu.ori(r_cnt, code.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) code.print_code() r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 55) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = prgm.gp_return test = prgm.acquire_register() lbl_brz = prgm.get_label("BRZ") lbl_skip = prgm.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) lbl_loop = prgm.get_label("LOOP") lbl_break = prgm.get_label("BREAK") r_cnt = prgm.acquire_register() r_stop = prgm.acquire_register() r_cmp = prgm.acquire_register() r_foo = prgm.gp_return spu.ori(r_foo, prgm.r_zero, 0) spu.ori(r_cnt, prgm.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 55) return