def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code into the active stream that stores reg into the local-store
    buffer and then calls save_ls_buffer() behind a conditional branch.

    reg -- the register whose contents are stored with stqx.

    Three scratch registers (offset, size, test) are acquired from the
    active code object and released again before returning.

    NOTE(review): the author's marker below says the mailbox values are
    wrong, so this routine looks unfinished.  Nothing in this method
    writes the incremented offset back into self.ls_buffer -- confirm
    whether that is intended.
    """
    code = spu.get_active_code()

    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and by 8 bytes to obtain the offset and
    # size values (presumably words 1 and 2 of the buffer descriptor moved
    # into the preferred slot -- TODO confirm the descriptor layout).
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    # Store reg using self.ls_buffer and offset as the address operands.
    spu.stqx(reg, self.ls_buffer, offset)

    # Advance the offset by 16 and compare it with the size.
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)

    # Write size, offset and the comparison result to the outbound mailbox
    # (looks like debugging output, given the marker that follows).
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the branch belongs and emit a stop as a placeholder;
    # it is overwritten once the length of the save code is known.
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size = size)
    spu.nop(0)

    # Replace the placeholder with a brz on test whose displacement is the
    # number of instructions emitted since the placeholder (presumably this
    # skips the save code when offset != size -- confirm brz semantics and
    # displacement units).
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active = True)

    code.release_registers(regs)
    return
def vector_from_array(code, r_target, a):
    """
    Emit the instructions that fill the vector register r_target with the
    four values a[0]..a[3].

    Every value is loaded with load_word(..., True).  Values 1..3 go into
    temporary registers, are rotated by 12, 8 and 4 bytes respectively and
    are then added into r_target.  The temporaries are acquired from
    code.prgm and released before returning.
    """
    prgm = code.prgm
    temps = [prgm.acquire_register() for _ in range(3)]

    # The first value is loaded straight into the target register.
    load_word(code, r_target, a[0], True)

    # Remaining values: load into a temporary, then rotate qw by bytes.
    for index, rotation in ((1, 12), (2, 8), (3, 4)):
        tmp = temps[index - 1]
        load_word(code, tmp, a[index], True)
        code.add(spu.rotqbyi(tmp, tmp, rotation))

    # Merge the rotated temporaries into the target.
    for tmp in temps:
        code.add(spu.a(r_target, r_target, tmp))

    for tmp in temps:
        prgm.release_register(tmp)

    return
def set_slot_value(code, reg, slot, value):
    """
    Set the value in reg[slot] with value.

    If value is a register (spe.Register or spe.Variable), the value from
    its preferred slot (value[0]) is used.  If value is a constant, it is
    loaded into a temporary register first.  The values in the other
    slots of reg are preserved.

    code  -- instruction stream the instructions are added to; its .prgm
             supplies the temporary registers.
    reg   -- register to modify.
    slot  -- SIMD word slot, one of 0, 1, 2, 3.
    value -- register/variable or constant to place in the slot.

    Raises ValueError (an Exception subclass) when slot is not 0..3.
    """
    prgm = code.prgm

    if slot not in (0, 1, 2, 3):
        # str() is required: concatenating the int slot directly raised a
        # TypeError instead of reporting the invalid slot.
        raise ValueError("Invalid SIMD slot: " + str(slot))

    value_is_register = isinstance(value, (spe.Register, spe.Variable))

    # Mask with all bits set in word 0 only.
    mask = prgm.acquire_register()
    vector_from_array(code, mask, [0xFFFFFFFF, 0, 0, 0])

    if value_is_register:
        r_value = value
    else:
        r_value = prgm.acquire_register()
        load_word(code, r_value, value)

    # Rotate the requested slot into word 0, merge the new value under the
    # mask, then rotate by the complementary amount to restore the order.
    code.add(spu.rotqbyi(reg, reg, slot * 4))
    code.add(spu.selb(reg, reg, r_value, mask))
    code.add(spu.rotqbyi(reg, reg, (4 - slot) * 4))

    prgm.release_register(mask)
    if not value_is_register:
        # Only release the temporary acquired here, never the caller's
        # register.
        prgm.release_register(r_value)

    return
def TestFloatArray():
    """
    Add two four-element SingleFloat vectors on the SPU, reduce the result
    into the fp return register and compare it with the same sum computed
    on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    lhs = SingleFloat([1.0, 2.0, 3.0, 4.0])
    rhs = SingleFloat([0.5, 1.5, 2.5, 3.5])

    # Element-wise sum of the two vectors.
    pair_sums = SingleFloat(0.0)
    pair_sums.v = spu.fa.ex(lhs, rhs)

    # Accumulate into the fp return register, rotating the element-wise
    # sums by 4 bytes after each add.
    total = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)
    for _ in range(4):
        total.v = spu.fa.ex(pair_sums, total)
        spu.rotqbyi(pair_sums, pair_sums, 4)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode="fp")

    # Host-side reference value.
    x_test = array.array("f", [1.0, 2.0, 3.0, 4.0])
    y_test = array.array("f", [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_item, y_item in zip(x_test, y_test):
        expected += x_item + y_item

    assert result == expected
    return
def set_slot_value(code, reg, slot, value):
    """
    Set the value in reg[slot] with value.

    If value is a register (spe.Register or spe.Variable), use the value
    from its preferred slot (value[0]).  If value is a constant, load it
    into a temporary register first.  The values in the other slots of
    reg are preserved.

    code  -- instruction stream receiving the instructions; code.prgm
             provides the temporary registers.
    reg   -- register to modify.
    slot  -- SIMD word slot, one of 0, 1, 2, 3.
    value -- register/variable or constant to place in the slot.

    Raises ValueError (an Exception subclass) when slot is not 0..3.
    """
    prgm = code.prgm

    if slot not in (0, 1, 2, 3):
        # The slot is normally an int; without str() the concatenation
        # itself failed with a TypeError and hid the real problem.
        raise ValueError("Invalid SIMD slot: " + str(slot))

    needs_temp = not isinstance(value, (spe.Register, spe.Variable))

    # Mask with all bits set in word 0 only.
    mask = prgm.acquire_register()
    vector_from_array(code, mask, [0xFFFFFFFF, 0, 0, 0])

    if needs_temp:
        r_value = prgm.acquire_register()
        load_word(code, r_value, value)
    else:
        r_value = value

    # Bring the requested slot to word 0, merge under the mask, then
    # rotate by the complementary amount to put the words back in order.
    code.add(spu.rotqbyi(reg, reg, slot * 4))
    code.add(spu.selb(reg, reg, r_value, mask))
    code.add(spu.rotqbyi(reg, reg, (4 - slot) * 4))

    prgm.release_register(mask)
    if needs_temp:
        # Release only the temporary acquired above.
        prgm.release_register(r_value)

    return
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code into the active stream that stores reg into the local-store
    buffer and then calls save_ls_buffer() behind a conditional branch.

    reg -- the register whose contents are stored with stqx.

    Three scratch registers (offset, size, test) are acquired from the
    active code object and released again before returning.

    NOTE(review): the author's marker below says the mailbox values are
    wrong, so this routine looks unfinished.  Nothing in this method
    writes the incremented offset back into self.ls_buffer -- confirm
    whether that is intended.
    """
    code = spu.get_active_code()

    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and by 8 bytes to obtain the offset and
    # size values (presumably words 1 and 2 of the buffer descriptor moved
    # into the preferred slot -- TODO confirm the descriptor layout).
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    # Store reg using self.ls_buffer and offset as the address operands.
    spu.stqx(reg, self.ls_buffer, offset)

    # Advance the offset by 16 and compare it with the size.
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)

    # Write size, offset and the comparison result to the outbound mailbox
    # (looks like debugging output, given the marker that follows).
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the branch belongs and emit a stop as a placeholder;
    # it is overwritten once the length of the save code is known.
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=size)
    spu.nop(0)

    # Replace the placeholder with a brz on test whose displacement is the
    # number of instructions emitted since the placeholder (presumably this
    # skips the save code when offset != size -- confirm brz semantics and
    # displacement units).
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active=True)

    code.release_registers(regs)
    return
def TestSetSlotValue():
    """
    Exercise set_slot_value() on an SPU.

    A SignedWord register initialised to 0xFFFFFFFF has its four slots
    set to 0x10, 0x11, 0x12 and 0x13 (two from constants, two from
    variables).  The SPU program writes the register to the outbound
    mailbox four times, rotating it by 4 bytes in between, and the host
    asserts that the values arrive in that order.
    """
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    prgm = synspu.Program()
    code = prgm.get_stream()
    proc = synspu.Processor()

    spu.set_active_code(code)

    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    r = var.SignedWord(0xFFFFFFFF)

    set_slot_value(code, r, 0, 0x10)
    set_slot_value(code, r, 1, a)
    set_slot_value(code, r, 2, 0x12)
    set_slot_value(code, r, 3, b)

    # Send each slot to the host in turn.
    for i in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    prgm.add(code)

    # `async` became a reserved word in Python 3.7, so writing it as a
    # literal keyword argument is a SyntaxError there.  Passing it through
    # a dict keeps the call identical while remaining parseable.
    spe_id = proc.execute(prgm, **{'async': True})

    for i in range(4):
        # Poll until the SPU has written the next mailbox entry.
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = synspu.spu_exec.read_out_mbox(spe_id)
        assert (result == (i + 0x10))

    proc.join(spe_id)

    return
def TestFloatArray():
    """
    Add two four-element SingleFloat vectors on the SPU, reduce the result
    into the fp return register and check it against the same sum computed
    on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    lhs = SingleFloat([1.0, 2.0, 3.0, 4.0])
    rhs = SingleFloat([0.5, 1.5, 2.5, 3.5])

    # Element-wise sum of the two vectors.
    pair_sums = SingleFloat(0.0)
    pair_sums.v = spu.fa.ex(lhs, rhs)

    # Accumulate into the fp return register, rotating the element-wise
    # sums by 4 bytes after each add.
    total = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)
    for _ in range(4):
        total.v = spu.fa.ex(pair_sums, total)
        spu.rotqbyi(pair_sums, pair_sums, 4)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')

    # Host-side reference value.
    x_test = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_test = array.array('f', [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_item, y_item in zip(x_test, y_test):
        expected += x_item + y_item

    assert (result == expected)
    return
def TestSetSlotValue():
    """
    Exercise set_slot_value() on an SPU.

    The four slots of a SignedWord register (initially 0xFFFFFFFF) are
    set to 0x10..0x13, alternating between constant and variable sources.
    The SPU program writes the register to the outbound mailbox four
    times, rotating it by 4 bytes after each write, and the host asserts
    that it reads 0x10, 0x11, 0x12, 0x13 in order.
    """
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    prgm = synspu.Program()
    code = prgm.get_stream()
    proc = synspu.Processor()

    spu.set_active_code(code)

    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    r = var.SignedWord(0xFFFFFFFF)

    set_slot_value(code, r, 0, 0x10)
    set_slot_value(code, r, 1, a)
    set_slot_value(code, r, 2, 0x12)
    set_slot_value(code, r, 3, b)

    # Send each slot to the host in turn.
    for i in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    prgm.add(code)

    # `async` is a reserved word from Python 3.7 on, so it cannot be
    # spelled as a literal keyword argument; unpacking a dict passes the
    # very same argument and parses on every Python version.
    spe_id = proc.execute(prgm, **{'async': True})

    for i in range(4):
        # Poll until the SPU has written the next mailbox entry.
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = synspu.spu_exec.read_out_mbox(spe_id)
        assert(result == (i + 0x10))

    proc.join(spe_id)

    return
def TestInt2(i0=0, i1=1):
    """
    Build and run a small SPU program that packs four integers into one
    quadword and then executes a short countdown-style loop.

    i0, i1 -- the first two integers; i2 = i0 + i1 and i3 = i1 + i2 are
              derived from them (the first terms of a Fibonacci-like
              sequence).

    The program is executed on a Processor; no result is asserted.
    """
    i2 = i0 + i1
    i3 = i1 + i2

    code = InstructionStream()
    proc = Processor()

    # Hard-coded register numbers used by the generated program.
    r_loop = 4
    r_address = 5
    r0 = 6
    r1 = 7
    r2 = 8
    r3 = 9

    # Load arguments into a quadword

    #################
    # Pack quadword #
    #################

    def load_value_int32(code, reg, value, clear=False):
        """Load a 32-bit integer into reg as two 16-bit halves."""
        # obviously, value should be 32 bit integer
        # Floor division keeps the operand an int under true division too.
        code.add(spu.ilhu(reg, value // pow(2, 16)))  # immediate load halfword upper
        code.add(spu.iohl(reg, value % pow(2, 16)))   # immediate or halfword lower
        if clear:
            code.add(spu.shlqbyi(reg, reg, 12))  # shift left qw by bytes, clears right bytes
        return

    load_value_int32(code, r0, i0, True)
    load_value_int32(code, r1, i1, True)
    code.add(spu.rotqbyi(r1, r1, 12))  # rotate qw by bytes
    load_value_int32(code, r2, i2, True)
    code.add(spu.rotqbyi(r2, r2, 8))
    load_value_int32(code, r3, i3, True)
    code.add(spu.rotqbyi(r3, r3, 4))
    code.add(spu.a(r0, r0, r1))
    code.add(spu.a(r0, r0, r2))
    code.add(spu.a(r0, r0, r3))

    ##########

    # Main loop to calculate Fibonacci sequence

    # The helper's keyword is `clear`; the former `clear_bits=` spelling
    # raised a TypeError before any code was generated.
    load_value_int32(code, r_address, pow(2, 16), clear=False)  # start at 64K

    load_value_int32(code, r_loop, 0, clear=False)
    start_label = code.size() + 1

    code.add(spu.sfi(r_loop, r_loop, 1))

    # NOTE(review): the original used an undefined name `next` here (which
    # resolves to the builtin function and cannot be subtracted from).  It
    # is reconstructed with the same "+ 1" convention as start_label so
    # the branch goes back one instruction, to the sfi above -- confirm
    # this is the intended target.
    next_label = code.size() + 1
    code.add(spu.brnz(r_loop, (-(next_label - start_label) * spu.WORD_SIZE)))

    code.add(spu.stop(0x2005))

    # The result is not checked; the original assertion was disabled.
    proc.execute(code)

    return
def _reduce_word(self, words, result):
    """
    Add-reduce a vector of words into the preferred slot of result.

    Emits four add/rotate pairs: words is added into result, then words
    is rotated by 4 bytes so the next word lines up for the following add.
    """
    steps_left = 4
    while steps_left:
        spu.a(result, words, result)
        spu.rotqbyi(words, words, 4)
        steps_left -= 1

    return
def block(self):
    """
    Emit the save block into the active stream: pack x, y and score into
    one vector, store it, advance the result counter and branch back to
    the loop.

    Reads self._branch_idx (index of a previously reserved instruction
    slot) and records the start of this block in self._block_idx.
    """
    code = spu.get_active_code()
    self._block_idx = len(code)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    # The reserved slot is currently filled with a nop; the conditional
    # branch variants are kept below for reference.
    code[self._branch_idx] = spu.nop(0, ignore_active=True)
    # code[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # code[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(self._save_value, self._save_value, self._save_value)

    # Copy the score
    spu.selb(self._save_value, self._save_value, self._score, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the y value
    spu.selb(self._save_value, self._save_value, self._y_off, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the x value
    spu.selb(self._save_value, self._save_value, self._x_off, self._word_mask)

    # Save value to local store
    spu.stqx(self._save_value, self._count, self._md_results.r_addr)

    # Advance the counter by 16 for the next store.
    self._count.v = self._count.v + 16

    # --> MemorySave test
    cmp = self._save_value # reuse the save register

    # Compare the counter with the size register of the results descriptor.
    spu.ceq.ex(cmp, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(cmp, self._count)

    # Just reset for now
    # NOTE(review): the third operand is the literal 0 -- confirm whether
    # selb treats it as register 0 or as a zero value.
    spu.selb(self._count, self._count, 0, cmp)

    # Return to the loop
    # idx is the index the br instruction will occupy; the displacement
    # presumably targets the instruction right after the reserved slot at
    # self._branch_idx -- confirm displacement units.
    idx = len(code)
    spu.br(-(idx - self._branch_idx - 1))

    return
def _toggle(self, var):
    """
    Use rotate to toggle between two preferred slot values in a vector.

    Only acts when self.buffer_mode is 'double': a 4-byte rotate of
    var.reg is added to self.code.  In any other mode nothing is emitted.
    """
    if self.buffer_mode != 'double':
        return

    self.code.add(spu.rotqbyi(var.reg, var.reg, 4))
    return
def block(self):
    """
    Emit the save block into the active stream: pack x, y and score into
    one vector, store it, advance the result counter and branch back to
    the loop.

    Reads self._branch_idx (index of a previously reserved instruction
    slot) and records the start of this block in self._block_idx.
    """
    code = spu.get_active_code()
    self._block_idx = len(code)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    # The reserved slot is currently filled with a nop; the conditional
    # branch variants are kept below for reference.
    code[self._branch_idx] = spu.nop(0, ignore_active = True)
    # code[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # code[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(self._save_value, self._save_value, self._save_value)

    # Copy the score
    spu.selb(self._save_value, self._save_value, self._score, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the y value
    spu.selb(self._save_value, self._save_value, self._y_off, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the x value
    spu.selb(self._save_value, self._save_value, self._x_off, self._word_mask)

    # Save value to local store
    spu.stqx(self._save_value, self._count, self._md_results.r_addr)

    # Advance the counter by 16 for the next store.
    self._count.v = self._count.v + 16

    # --> MemorySave test
    cmp = self._save_value # reuse the save register

    # Compare the counter with the size register of the results descriptor.
    spu.ceq.ex(cmp, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(cmp, self._count)

    # Just reset for now
    # NOTE(review): the third operand is the literal 0 -- confirm whether
    # selb treats it as register 0 or as a zero value.
    spu.selb(self._count, self._count, 0, cmp)

    # Return to the loop
    # idx is the index the br instruction will occupy; the displacement
    # presumably targets the instruction right after the reserved slot at
    # self._branch_idx -- confirm displacement units.
    idx = len(code)
    spu.br(- (idx - self._branch_idx - 1))

    return
def copy_param(code, target, source):
    """
    Copy a parameter from the source register into the preferred slot of
    the target register.

    source is indexed with REG and SLOT.  A parameter already in slot 0
    is copied with an add-immediate of 0; a parameter in any other slot
    is brought to the front by rotating the source by slot * 4 bytes.
    Note that the other values in the source are copied along with it.
    """
    slot = source[SLOT]

    if slot == 0:
        instruction = spu.ai(target, source[REG], 0)
    else:
        instruction = spu.rotqbyi(target, source[REG], slot * 4)

    code.add(instruction)
    return
def save_ls_buffer(self, ls_size=None, branch=False):
    """
    Emit code that puts the local-store buffer to main memory and then
    updates the buffer bookkeeping.

    ls_size -- register holding the transfer size.  When None, a scratch
               register is acquired in its place (this method does not
               load a value into it).
    branch  -- accepted but not used by this method.

    All scratch registers acquired here are released before returning.
    """
    code = spu.get_active_code()
    scratch = []

    def borrow():
        # Acquire a scratch register and remember it for the final release.
        reg = code.acquire_register()
        scratch.append(reg)
        return reg

    if ls_size is None:
        ls_size = borrow()

    # Main memory address: word rotated in from mm_buffer plus mm_buffer.
    mm_addr = borrow()
    spu.rotqbyi(mm_addr, self.mm_buffer, 4)
    spu.a(mm_addr, mm_addr, self.mm_buffer)

    # Transfer the buffer.
    desc = spuiter.memory_desc('b')
    desc.set_size_reg(ls_size)
    desc.set_addr_reg(mm_addr)
    desc.put(code, self.ls_buffer)

    # Advance the main memory offset and store it in slot 2 of mm_buffer.
    mm_step = borrow()
    spu.rotqbyi(mm_step, self.mm_buffer, 8)
    spu.rotqbyi(mm_addr, self.mm_buffer, 4)
    spu.a(mm_addr, mm_addr, mm_step)
    util.set_slot_value(code, self.mm_buffer, 2, mm_addr)

    # Reset slot 2 of the local-store buffer to zero.
    util.set_slot_value(code, self.ls_buffer, 2, 0)

    code.release_registers(scratch)
    return
def save_ls_buffer(self, ls_size=None, branch=False):
    """
    Emit code that puts the local-store buffer to main memory and then
    updates the buffer bookkeeping.

    ls_size -- register holding the transfer size.  When None, a scratch
               register is acquired in its place (this method does not
               load a value into it).
    branch  -- accepted but not used by this method.

    All scratch registers acquired here are released before returning.
    """
    code = spu.get_active_code()
    borrowed = []

    def take_register():
        # Acquire a scratch register and track it for the final release.
        reg = code.acquire_register()
        borrowed.append(reg)
        return reg

    if ls_size is None:
        ls_size = take_register()

    # Main memory address: word rotated in from mm_buffer plus mm_buffer.
    address = take_register()
    spu.rotqbyi(address, self.mm_buffer, 4)
    spu.a(address, address, self.mm_buffer)

    # Transfer the buffer.
    transfer = spuiter.memory_desc('b')
    transfer.set_size_reg(ls_size)
    transfer.set_addr_reg(address)
    transfer.put(code, self.ls_buffer)

    # Advance the main memory offset and store it in slot 2 of mm_buffer.
    increment = take_register()
    spu.rotqbyi(increment, self.mm_buffer, 8)
    spu.rotqbyi(address, self.mm_buffer, 4)
    spu.a(address, address, increment)
    util.set_slot_value(code, self.mm_buffer, 2, address)

    # Reset slot 2 of the local-store buffer to zero.
    util.set_slot_value(code, self.ls_buffer, 2, 0)

    code.release_registers(borrowed)
    return