def synthesize(self, code):
    """
    Emit the popc/reduce program into `code`.

    The input vector is copied out of register 5, `self.popc` and
    `self.reduce_word` are inlined on it, and the reduced value is written
    to the SPU outbound mailbox.  `code` is made the active stream for the
    duration of the call and the previously active stream is restored
    before returning.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Reserve two variable registers
    count = code.acquire_register()
    result = code.acquire_register()

    # 'Load' the input vector x from register 5
    x = code.acquire_register()
    spu.ai(x, 5, 0)

    # Zero count and result
    spu.xor(count, count, count)
    spu.xor(result, result, result)

    # Inline the popc and reduce operations
    self.popc(count, x)
    self.reduce_word(result, count)

    # Send the result to the caller
    spu.wrch(result, dma.SPU_WrOutMbox)

    # Every instruction that touches the temporaries has been emitted, so
    # hand all of them back to the allocator.  Previously only x was
    # released and count/result stayed reserved forever.
    code.release_register(x)
    code.release_register(count)
    code.release_register(result)

    spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Render a vector with 4 pixels.

    Emits into `code` the instructions that turn `self.result` into packed
    pixel data, store it at `self.x_offset` (displaced by `self.lsa >> 4`)
    and advance `self.x_offset` by 16.

    Raises:
        Exception: if `x_offset`, `result` or `one` has not been set.
    """
    # Validate first: raising after set_active_code() would leave `code`
    # installed as the global active stream.
    if self.x_offset is None:
        raise Exception('Please call setup')
    if self.result is None:
        raise Exception('Please set result')
    if self.one is None:
        raise Exception('Please set one')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Make the part of the result positive and subtract 1
    # to transform (-1,-oo) into (0,oo)
    self.result.v = spu.fs.ex(0, self.result)
    self.result.v = spu.fs.ex(self.result, self.one)

    # Convert the result to an unsigned int, scaling by 2^4 to put
    # values between 0 and 16 in the gradient.  Values outside [0,16]
    # are 0 or FF
    self.result.v = spu.cfltu.ex(self.result, 169)  # 173 - 169 == 4
    # self.result.v = spu.sfi.ex(self.result, 255)  # 173 - 169 == 4

    # Extract the first two bytes from the result into the RGB positions
    # and set alpha to 0xFF
    self.result.v = spu.shufb.ex(self.result, self.ff, self.uint2rgba)

    # Save the result and increment the offset
    spu.stqd(self.result, self.x_offset, self.lsa >> 4)
    spu.ai(self.x_offset, self.x_offset, 16)

    spu.set_active_code(old_code)
    return
def fdiv(code, d, x, y, one = None):
    """
    Emit single-precision floating point division, d = x / y.

    code -- stream whose allocator supplies the scratch registers
    d    -- destination register
    x, y -- numerator and denominator registers
    one  -- optional register already holding 1.0; when omitted a scratch
            register is loaded with 1.0 and released again on exit
    """
    estimates = code.acquire_registers(3)
    residual = code.acquire_register()

    scratch = list(estimates)
    scratch.append(residual)

    if one is None:
        # Build 1.0 locally: zero, add integer 1, convert to float.
        one = code.acquire_register()
        spu.xor(one, one, one)
        spu.ai(one, one, 1)
        spu.cuflt(one, one, 155)
        scratch.append(one)

    first, refined, reciprocal = estimates

    # Compute 1/y (from SPU ISA 1.1, p208, Normal case)
    spu.frest(first, y)
    spu.fi(refined, y, first)
    spu.fnms(residual, y, refined, one)
    spu.fma(reciprocal, residual, refined, refined)

    # Compute x * (1/y)
    spu.fm(d, x, reciprocal)

    code.release_registers(scratch)
    return
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code that stores `reg` at the current offset of the local-store
    buffer and advances that offset by 16.

    The offset and size are rotated out of `self.ls_buffer` (by 4 and 8
    bytes respectively), and size/offset/test are written to the outbound
    mailbox.  A `stop` is emitted as a placeholder and later overwritten,
    by index, with a `brz` over the `save_ls_buffer` call.

    NOTE: unfinished work -- see the STOPPED HERE marker below.
    """
    code = spu.get_active_code()
    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)
    spu.stqx(reg, self.ls_buffer, offset)
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)
    # Debug output: report size, offset and the comparison result
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!
    # Remember where the placeholder goes so it can be patched below
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=size)
    spu.nop(0)
    # Replace the placeholder stop with a branch past the save code
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active=True)
    code.release_registers(regs)
    return
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code that stores `reg` at the current offset of the local-store
    buffer and advances that offset by 16.

    The offset and size are rotated out of `self.ls_buffer` (by 4 and 8
    bytes respectively), and size/offset/test are written to the outbound
    mailbox.  A `stop` is emitted as a placeholder and later overwritten,
    by index, with a `brz` over the `save_ls_buffer` call.

    NOTE: unfinished work -- see the STOPPED HERE marker below.
    """
    code = spu.get_active_code()
    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)
    spu.stqx(reg, self.ls_buffer, offset)
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)
    # Debug output: report size, offset and the comparison result
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!
    # Remember where the placeholder goes so it can be patched below
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size = size)
    spu.nop(0)
    # Replace the placeholder stop with a branch past the save code
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active = True)
    code.release_registers(regs)
    return
def synthesize_constants(self, code):
    """
    Acquire a register from `code`, store it in `self._one` and emit the
    instructions that load it with floating point 1.0.

    The previously active stream is reinstated afterwards, unless there
    was none.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    one = self._one = code.acquire_register()

    # zero, add integer 1, convert to float
    spu.xor(one, one, one)
    spu.ai(one, one, 1)
    spu.cuflt(one, one, 155)

    if previous is None:
        return
    spu.set_active_code(previous)
    return
def cleanup(self):
    """Emit the end-of-iteration counter update.

    DEC mode subtracts the step from r_count, INC mode adds it.  The step
    comes from r_step when that register is set, otherwise it is encoded
    as an immediate.  Any other mode emits nothing.
    """
    mode = self.mode

    if mode == DEC:
        if self.r_step is None:
            update = spu.ai(self.r_count, self.r_count, -self.step_size())
        else:
            # sf computes (third operand) - (second operand)
            update = spu.sf(self.r_count, self.r_step, self.r_count)
    elif mode == INC:
        if self.r_step is None:
            update = spu.ai(self.r_count, self.r_count, self.step_size())
        else:
            update = spu.a(self.r_count, self.r_count, self.r_step)
    else:
        return

    self.code.add(update)
    return
def start(self, align=True, branch=True):
    """Do pre-loop iteration initialization

    Acquires the counter register if needed, emits the code that loads its
    initial value (and, in INC mode with `branch`, the stop value), wraps
    the counter in `self.current_count`, and creates the start, branch and
    continue labels.  Only the start label is added to the code here.

    align  -- accepted but not used by this method
    branch -- when true in INC mode, a stop register is acquired/loaded
    """
    if self.r_count is None:
        self.r_count = self.code.acquire_register()

    if self.mode == DEC:
        # Counting down: start from the externally supplied register or
        # from the iteration count
        if self._external_start:
            self.code.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(self.code, self.r_count, self.get_count())

    elif self.mode == INC:
        if self.r_stop is None and branch:
            self.r_stop = self.code.acquire_register()

        if self._external_start:
            self.code.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(self.code, self.r_count, self.get_start())

        if branch and not self._external_stop:
            util.load_word(self.code, self.r_stop, self.get_count())

    # /end mode if

    if self.r_count is not None:
        self.current_count = var.SignedWord(code=self.code, reg=self.r_count)

    # If the step size doesn't fit in an immediate value, store it in a register
    # (-512 < word < 511):
    if not (-512 < self.step_size() < 511):
        self.r_step = self.code.acquire_register()
        util.load_word(self.code, self.r_step, self.step_size())

    # Label
    # (names get a random suffix; uniqueness is not otherwise checked here)
    self.start_label = self.code.get_label("SYN_ITER_START_%d" % random.randint(0, 2**32))
    self.code.add(self.start_label)

    # Create continue/branch labels so they can be referenced; they will be
    # added to the code in their appropriate locations.
    self.branch_label = self.code.get_label("SYN_ITER_BRANCH_%d" % random.randint(0, 2**32))
    self.continue_label = self.code.get_label("SYN_ITER_CONTINUE_%d" % random.randint(0, 2**32))
    return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = prgm.acquire_register(reg_name=55) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='int', stop=True, debug=True) assert (r[0] == 42) assert (r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='fp') print r return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = prgm.acquire_register(reg_name = 55) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) prgm.add(code) prgm.print_code(hex = True) r = proc.execute(prgm, mode = 'int', stop = True, debug = True) assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) prgm.add(code) prgm.print_code(hex = True) r = proc.execute(prgm, mode = 'fp') print r return
def TestParallel():
    """
    Launch a three-instruction parallel stream on 6 SPUs in async mode and
    join every returned SPE id.

    Run this with a stop instruction and examine the registers and memory.
    """
    stream = ParallelInstructionStream()
    processor = Processor()

    stream.raw_data_size = 128 * 8

    reg = stream.acquire_register()
    for immediate in (0xCAFE, 0xBABE):
        stream.add(spu.ai(reg, reg, immediate))
    stream.add(spu.stop(0x2000))

    spe_ids = processor.execute(stream, mode='async', n_spus=6)
    for spe_id in spe_ids:
        processor.join(spe_id)

    # Reaching this point without an exception is the whole test
    assert (True)
    return
def TestParallel():
    """
    Launch a three-instruction parallel stream on 6 SPUs in async mode and
    join every returned SPE id.

    Run this with a stop instruction and examine the registers and memory.
    """
    stream = ParallelInstructionStream()
    processor = Processor()

    stream.raw_data_size = 128*8

    reg = stream.acquire_register()
    for immediate in (0xCAFE, 0xBABE):
        stream.add(spu.ai(reg, reg, immediate))
    stream.add(spu.stop(0x2000))

    spe_ids = processor.execute(stream, mode='async', n_spus = 6)
    for spe_id in spe_ids:
        processor.join(spe_id)

    # Reaching this point without an exception is the whole test
    assert(True)
    return
def start(self, align = True, branch = True):
    """Emit the loop prologue.

    Makes sure a counter register exists, emits its initial value (plus
    the stop value for INC loops that branch), wraps the counter in
    `self.current_count`, spills an oversized step into `self.r_step`,
    and creates the start/branch/continue labels.  Only the start label
    is placed in the code here.

    align  -- accepted but unused by this method
    branch -- when true in INC mode, the stop register is set up as well
    """
    stream = self.code

    if self.r_count is None:
        self.r_count = stream.prgm.acquire_register()

    # Pick where the counter's first value comes from for this mode
    mode = self.mode
    if mode == DEC:
        first_value = self.get_count
    elif mode == INC:
        first_value = self.get_start
        if self.r_stop is None and branch:
            self.r_stop = stream.prgm.acquire_register()
    else:
        first_value = None

    if first_value is not None:
        if self._external_start:
            # Copy the caller-provided start register
            stream.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(stream, self.r_count, first_value())

        if mode == INC and branch and not self._external_stop:
            util.load_word(stream, self.r_stop, self.get_count())

    if self.r_count is not None:
        self.current_count = var.SignedWord(code = stream, reg = self.r_count)

    # A step that does not fit the immediate range checked here
    # (-512 < word < 511) has to live in a register instead.
    step = self.step_size()
    if step <= -512 or step >= 511:
        self.r_step = stream.prgm.acquire_register()
        util.load_word(stream, self.r_step, self.step_size())

    new_label = stream.prgm.get_unique_label

    # The start label marks the top of the loop body
    self.start_label = new_label("SYN_ITER_START")
    stream.add(self.start_label)

    # Branch/continue labels are only created now so they can be
    # referenced; they get added to the code at their own locations.
    self.branch_label = new_label("SYN_ITER_BRANCH")
    self.continue_label = new_label("SYN_ITER_CONTINUE")
    return
def TestParallel(): # Run this with a stop instruction and examine the registers and memory prgm = ParallelProgram() code = prgm.get_stream() proc = Processor() code.raw_data_size = 128*8 r = prgm.acquire_register() code.add(spu.ai(r, r, 0x2FE)) code.add(spu.ai(r, r, 0x2BE)) code.add(spu.stop(0x1FFF)) prgm += code r = proc.execute(prgm, async = True, mode='void', n_spus = 6) for speid in r: proc.join(speid) assert(True) return
def TestParallel(): # Run this with a stop instruction and examine the registers and memory prgm = ParallelProgram() code = prgm.get_stream() proc = Processor() code.raw_data_size = 128 * 8 r = prgm.acquire_register() code.add(spu.ai(r, r, 0x2FE)) code.add(spu.ai(r, r, 0x2BE)) code.add(spu.stop(0x1FFF)) prgm += code r = proc.execute(prgm, async=True, mode='void', n_spus=6) for speid in r: proc.join(speid) assert (True) return
def copy_param(code, target, source):
    """
    Copy a parameter from the source register into the preferred slot of
    the target register.

    A parameter that already sits in slot 0 is copied with an add
    immediate of 0.  For any other slot the source is rotated by
    slot * 4 bytes.  Note that the other values in the source register
    are copied along with it.
    """
    slot = source[SLOT]

    if slot == 0:
        copy_inst = spu.ai(target, source[REG], 0)
    else:
        copy_inst = spu.rotqbyi(target, source[REG], slot * 4)

    code.add(copy_inst)
    return
def load_word(code, r_target, word, clear=False, zero=True):
    """Emit the shortest instruction sequence that loads `word` into r_target.

    If r0 is not set to 0, the zero parameter should be set to False.

    code     -- stream the instructions are added to
    r_target -- destination register
    word     -- integer value to load
    clear    -- when true, follow the load with a 12-byte quadword left shift
    zero     -- allow the add-immediate form relative to code.r_zero
    """
    emit = code.add

    if zero and -512 < word < 511:
        # Small value: one add-immediate against the zero register
        emit(spu.ai(r_target, code.r_zero, word))
    elif word == (word & 0x7FFF):
        emit(spu.il(r_target, word))
    elif word == (word & 0x3FFFF):
        emit(spu.ila(r_target, word))
    else:
        # Anything else is loaded as two 16-bit halves
        emit(spu.ilhu(r_target, (word & 0xFFFF0000) >> 16))
        emit(spu.iohl(r_target, word & 0xFFFF))

    if clear:
        emit(spu.shlqbyi(r_target, r_target, 12))
    return
def end(self, branch = True):
    """Do post-loop iterator code

    Emits the optional branch hint, the loop-closing branch back to
    `self.start_label` (when `branch` is true), code that resets the
    counter to its initial value, and finally releases the counter and,
    unless it is external, the stop register.

    NOTE(review): block nesting was reconstructed from flattened source;
    confirm the counter reset sits outside the `if branch:` blocks.
    """
    if self.hint == True:
        self.code.add(spu.hbrr(self.branch_label, self.start_label))

    if self.mode == DEC:
        # branch if r_count is not zero (CR)
        # Note that this relies on someone (e.g. cleanup()) setting the
        # condition register properly.
        if branch:
            self.code.add(self.branch_label)
            self.code.add(spu.brnz(self.r_count, self.start_label))

        # Reset the counter in case this is a nested loop
        util.load_word(self.code, self.r_count, self.get_count())

    elif self.mode == INC:
        # branch if r_current < r_stop
        if branch:
            r_cmp_gt = self.code.prgm.acquire_register()

            self.code.add(spu.cgt(r_cmp_gt, self.r_stop, self.r_count))
            self.code.add(self.branch_label)
            self.code.add(spu.brnz(r_cmp_gt, self.start_label))

            self.code.prgm.release_register(r_cmp_gt)

        # Reset the current value in case this is a nested loop
        if self._external_start:
            self.code.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(self.code, self.r_count, self.get_start())

    # Hand the iterator's registers back to the program's allocator
    if self.r_count is not None:
        self.code.prgm.release_register(self.r_count)
    if self.r_stop is not None and not self._external_stop:
        self.code.prgm.release_register(self.r_stop)

    return
def TestMFC(): size = 32 #data_array = array.array('I', range(size)) #data = synspu.aligned_memory(size, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) data = extarray.extarray('I', range(size)) code = synspu.InstructionStream() r_zero = code.acquire_register() r_ea_data = code.acquire_register() r_ls_data = code.acquire_register() r_size = code.acquire_register() r_tag = code.acquire_register() # Load zero util.load_word(code, r_zero, 0) print 'array ea: %X' % (data.buffer_info()[0]) print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % ( str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag)) # Load the effective address print 'test ea: %X' % data.buffer_info()[0] util.load_word(code, r_ea_data, data.buffer_info()[0]) # Load the size code.add(spu.ai(r_size, r_zero, size * 4)) # Load the tag code.add(spu.ai(r_tag, r_zero, 2)) # Load the lsa code.add(spu.ai(r_ls_data, r_zero, 0)) # Load the data into address 0 mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag) # Set the tag bit to 2 mfc_write_tag_mask(code, 1 << 2) # Wait for the transfer to complete mfc_read_tag_status_all(code) # Increment the data values by 1 using an unrolled loop (no branches) r_current = code.acquire_register() for lsa in range(0, size * 4, 16): code.add(spu.lqa(r_current, (lsa >> 2))) code.add(spu.ai(r_current, r_current, 1)) code.add(spu.stqa(r_current, (lsa >> 2))) code.release_register(r_current) # Store the values back to main memory # Load the data into address 0 mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag) # Set the tag bit to 2 mfc_write_tag_mask(code, 1 << 2) # Wait for the transfer to complete mfc_read_tag_status_all(code) # Cleanup code.release_register(r_zero) code.release_register(r_ea_data) code.release_register(r_ls_data) code.release_register(r_size) code.release_register(r_tag) # Stop for debugging # code.add(spu.stop(0xA)) # Execute the code proc = synspu.Processor() # code.print_code() #print data_array 
proc.execute(code) #data.copy_from(data_array.buffer_info()[0], len(data_array)) for i in range(size): assert (data[i] == i + 1) return
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg=13) r20 = code.acquire_register(reg=20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return
def SimpleSPU():
    """
    A very simple SPU program that computes 11 + 31 and stops with code
    0x100A on success, followed by a label-based loop that sums 1..10
    into the return register (expected result 55).
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x = code.acquire_register()
    x = code.gp_return
    test = code.acquire_register()

    lbl_brz = code.get_label("BRZ")
    lbl_skip = code.get_label("SKIP")

    # Branch hint for the brz at lbl_brz targeting lbl_skip
    spu.hbrr(lbl_brz, lbl_skip)
    spu.xor(x, x, x) # zero x
    spu.ai(x, x, 11) # x = x + 11
    spu.ai(x, x, 31) # x = x + 31
    spu.ceqi(test, x, 42) # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    code.add(lbl_brz)
    spu.brz(test, lbl_skip)
    spu.stop(0x100A)
    code.add(lbl_skip)
    spu.stop(0x100B)

    code.print_code(hex=True, pro=True, epi=True)
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    # Second program: loop with explicit LOOP/BREAK labels
    code = InstructionStream()
    spu.set_active_code(code)

    lbl_loop = code.get_label("LOOP")
    lbl_break = code.get_label("BREAK")

    r_cnt = code.acquire_register()
    r_stop = code.acquire_register()
    r_cmp = code.acquire_register()
    r_foo = code.gp_return

    # Zero the accumulator and the counter; stop value is 10
    spu.ori(r_foo, code.r_zero, 0)
    spu.ori(r_cnt, code.r_zero, 0)
    util.load_word(code, r_stop, 10)

    code.add(lbl_loop)

    # Leave the loop once the counter equals the stop value
    spu.ceq(r_cmp, r_cnt, r_stop)
    spu.brnz(r_cmp, lbl_break)

    spu.ai(r_cnt, r_cnt, 1)
    spu.a(r_foo, r_foo, r_cnt)

    spu.br(lbl_loop)
    code.add(lbl_break)

    code.print_code()
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    assert (r[0] == 55)
    return
# Example script: build a small sub-stream once and insert it into the main
# stream twice, then execute the program and print the integer result.
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.lib.util as util
import corepy.arch.spu.platform as env

prgm = env.Program()
code = prgm.get_stream()
proc = env.Processor()

# Generate substream
# Multiply gp_return by 2, add 1
subcode = prgm.get_stream()
subcode.add(spu.shli(subcode.gp_return, subcode.gp_return, 1))
subcode.add(spu.ai(subcode.gp_return, subcode.gp_return, 1))

# Initialize gp_return, insert code
code.add(spu.il(code.gp_return, 5))
code.add(subcode)

# Add 3, insert again
code.add(spu.ai(code.gp_return, code.gp_return, 3))
code.add(subcode)

#code.print_code()

prgm.add(code)
prgm.print_code() # TODO - support print prgm instead?

ret = proc.execute(prgm, mode='int')
print "ret", ret
def copy_register(self, other):
    """Emit an add-immediate of 0 from `other` into this object and return
    whatever `self.code.add` returns."""
    copy_inst = spu.ai(self, other, 0)
    return self.code.add(copy_inst)
# Script fragment: assemble a short demo stream (a few immediate loads, a
# store, and a counted loop) and hand it to SPUApp.
code = prgm.get_stream()
reg = prgm.acquire_register()
foo = prgm.acquire_register(reg_name = 5)

code.add(prgm.get_label("FOO"))
code.add(spu.il(foo, 0xCAFE))
# Build 0xDEADBEEF in reg from its two 16-bit halves
code.add(spu.ilhu(reg, 0xDEAD))
code.add(spu.iohl(reg, 0xBEEF))
code.add(spu.stqd(reg, code.r_zero, 4))

lbl_loop = prgm.get_label("LOOP")
lbl_break = prgm.get_label("BREAK")

r_cnt = code.gp_return
r_stop = prgm.acquire_register(reg_name = 9)
r_cmp = prgm.acquire_register()

# Counter starts at 0 and the loop exits when it equals 5
code.add(spu.ori(r_cnt, code.r_zero, 0))
code.add(spu.il(r_stop, 5))

code.add(lbl_loop)
code.add(spu.ceq(r_cmp, r_cnt, r_stop))
# Labels are looked up again by name here rather than reusing the locals
code.add(spu.brnz(r_cmp, prgm.get_label("BREAK")))
code.add(spu.ai(r_cnt, r_cnt, 1))
code.add(spu.br(prgm.get_label("LOOP")))
code.add(lbl_break)

app = SPUApp(code)
app.MainLoop()
def synthesize(self, code):
    """
    Emit the logarithm evaluation for `self.x` into `code` and copy the
    value into `self.result`.

    The emitted sequence follows the C reference transcribed in the inline
    comments: split x into exponent and mantissa, normalize the mantissa
    around SQRTHF, evaluate the polynomial, then apply the LOG2EA
    conversion and add the exponent.  Constants are read from
    `self.consts`.

    Raises:
        Exception: if `x` or `result` has not been set.
    """
    # Validate first: raising after set_active_code() would leave `code`
    # installed as the global active stream.
    if self.x is None:
        raise Exception("Please set x")
    if self.result is None:
        raise Exception("Please set result")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # exponent
    e = var.Word()

    # Working values
    x = var.Word()
    y = var.Word()
    z = var.Word()

    cmp = var.Bits()
    tmp = var.Word()

    spu.xor(cmp, cmp, cmp)
    spu.xor(tmp, tmp, tmp)

    # Set the working x
    x.v = self.x

    # Extract the exponent
    # int e = (((*(unsigned int *) &x) >> 23) & 0xff) - 0x7e;
    e.v = x >> self.consts['_23']
    e.v = spu.andi.ex(e, 0xff)
    e.v = spu.ai.ex(e, 0x382)  # 0x382 == (- 0x7E) using 10 bits
    # 0b 111 1110

    # Extract the mantissa
    x.v = x & self.consts['M1']  # *(unsigned int*)&x &= 0x807fffff;
    x.v = x | self.consts['M2']  # *(unsigned int*)&x |= 0x3f000000;

    # Normalize
    # Both branches are computed and the result picked with selb; y, z and
    # tmp are reused as the branch temporaries.
    x1, x2, e1 = y, z, tmp

    # if (x < SQRTHF)
    cmp.v = spu.fcgt.ex(self.consts['SQRTHF'], x)

    # (True) { ... }
    e1.v = spu.ai.ex(e, -1)                    # e -= 1;
    x1.v = spu.fa.ex(x, x)                     # x = x + x - 1.0;
    x1.v = spu.fs.ex(x1, self.consts['ONE'])   # "" ""

    # (False) { ... }
    x2.v = spu.fs.ex(x, self.consts['ONE'])    # x = x - 1.0;

    # Select the True/False values based on cmp
    e.v = spu.selb.ex(e, e1, cmp)
    x.v = spu.selb.ex(x2, x1, cmp)

    # Compute polynomial
    z.v = spu.fm.ex(x, x)                      # z = x * x;

    y.v = spu.fms.ex(
        self.consts['C1'], x,                  # y = (((((((( 7.0376836292E-2 * x
        self.consts['C2'])                     # - 1.1514610310E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C3'])  # + 1.1676998740E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C4'])  # - 1.2420140846E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C5'])  # + 1.4249322787E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C6'])  # - 1.6668057665E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C7'])  # + 2.0000714765E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C8'])  # - 2.4999993993E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C9'])  # + 3.3333331174E-1)
    y.v = spu.fm.ex(y, x)                      # * x
    y.v = spu.fm.ex(y, z)                      # * z;

    y.v = spu.fma.ex(self.consts['C10'], z, y)  # y += -0.5 * z;

    # Convert to log base 2
    z.v = spu.fm.ex(y, self.consts['LOG2EA'])      # z = y * LOG2EA;
    z.v = spu.fma.ex(x, self.consts['LOG2EA'], z)  # z += x * LOG2EA;
    z.v = spu.fa.ex(z, y)                          # z += y;
    z.v = spu.fa.ex(z, x)                          # z += x;
    e.v = spu.csflt.ex(e, 155)                     # z += (float) e;
    z.v = spu.fa.ex(z, e)                          # "" ""

    spu.ai(self.result, z, 0)                      # return z

    spu.set_active_code(old_code)
    return
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True): import time # n_spus = 8 # buffer_size = 16 # 16 ints/buffer # n_buffers = 4 # 4 buffers/spu # n_buffers = size / buffer_size # size = buffer_size * n_buffers * n_spus # data = array.array('I', range(size + 2)) #data = env.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16) code = env.ParallelInstructionStream() # code = env.InstructionStream() r_zero = code.acquire_register() r_ea_data = code.acquire_register() r_ls_data = code.acquire_register() r_size = code.acquire_register() r_tag = code.acquire_register() # Load zero util.load_word(code, r_zero, 0) # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0])) # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % ( # r_zero, r_ea_data, r_ls_data, r_size, r_tag) # Load the effective address if data.buffer_info()[0] % 16 == 0: util.load_word(code, r_ea_data, data.buffer_info()[0]) else: util.load_word(code, r_ea_data, data.buffer_info()[0] + 8) ea_start = data.buffer_info()[0] # Iterate over each buffer for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)): # ea = var.SignedWord(code = code, reg = r_ea_data) # print 'n_iters:', size / buffer_size # for i in syn_range(code, size / buffer_size): # code.add(spu.stop(0xB)) # Load the size util.load_word(code, r_size, buffer_size * 4) # Load the tag code.add(spu.ai(r_tag, r_zero, 12)) # Load the lsa code.add(spu.ai(r_ls_data, r_zero, 0)) # Load the data into address 0 dma.mfc_get(code, r_ls_data, ea, r_size, r_tag) # Set the tag bit to 12 dma.mfc_write_tag_mask(code, 1<<12); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # Increment the data values by 1 using an unrolled loop (no branches) # r_current = code.acquire_register() current = var.SignedWord(0, code) 
count = var.SignedWord(0, code) # Use an SPU iter for lsa in syn_iter(code, buffer_size * 4, 16): code.add(spu.lqx(current, r_zero, lsa)) # code.add(spu.ai(1, r_current, r_current)) current.v = current + current code.add(spu.stqx(current, r_zero, lsa)) count.v = count + 1 code.add(spu.stqx(count, r_zero, 0)) # code.release_register(r_current) current.release_registers(code) # Store the values back to main memory # Load the tag code.add(spu.ai(r_tag, r_zero, 13)) # Load the data into address 0 dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag) # Set the tag bit to 13 dma.mfc_write_tag_mask(code, 1<<13); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # code.add(spu.stop(0xB)) # Update ea # ea.v = ea + (buffer_size * 4) # /for ea address # Cleanup code.release_register(r_zero) code.release_register(r_ea_data) code.release_register(r_ls_data) code.release_register(r_size) code.release_register(r_tag) if not run_code: return code # Stop for debugging # code.add(spu.stop(0xA)) # Execute the code proc = env.Processor() #data.copy_from(data_array.buffer_info()[0], len(data_array)) def print_blocks(): for i in range(0, size, buffer_size): # print data[i:(i + buffer_size)] print data[i + buffer_size], print '' # print_blocks() s = time.time() r = proc.execute(code, n_spus = n_spus) # r = proc.execute(code) t = time.time() - s # print_blocks() return t
def TestMFC():
    """
    Round-trip test for the MFC helpers: get 32 words into local store
    address 0, add 1 to each with unrolled quadword load/add/store, put
    them back, then check every element of `data` equals its index + 1.
    """
    import corepy.lib.extarray as extarray
    import corepy.arch.spu.platform as synspu

    size = 32
    #data_array = array.array('I', range(size))
    #data = synspu.aligned_memory(size, typecode = 'I')
    #data.copy_to(data_array.buffer_info()[0], len(data_array))
    data = extarray.extarray('I', range(size))
    code = synspu.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    print 'array ea: %X' % (data.buffer_info()[0])
    print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
        str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))

    # Load the effective address
    print 'test ea: %X' % data.buffer_info()[0]
    util.load_word(code, r_ea_data, data.buffer_info()[0])

    # Load the size (in bytes)
    code.add(spu.ai(r_size, r_zero, size * 4))

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 2))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 2
    mfc_write_tag_mask(code, 1<<2);

    # Wait for the transfer to complete
    mfc_read_tag_status_all(code);

    # Increment the data values by 1 using an unrolled loop (no branches)
    r_current = code.acquire_register()

    for lsa in range(0, size * 4, 16):
        code.add(spu.lqa(r_current, (lsa >> 2)))
        code.add(spu.ai(r_current, r_current, 1))
        code.add(spu.stqa(r_current, (lsa >> 2)))

    code.release_register(r_current)

    # Store the values back to main memory

    # Put the data at local store address 0 back to the effective address
    mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 2
    mfc_write_tag_mask(code, 1<<2);

    # Wait for the transfer to complete
    mfc_read_tag_status_all(code);

    # Cleanup
    code.release_register(r_zero)
    code.release_register(r_ea_data)
    code.release_register(r_ls_data)
    code.release_register(r_size)
    code.release_register(r_tag)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    proc = synspu.Processor()
    # code.print_code()
    #print data_array
    proc.execute(code)

    #data.copy_from(data_array.buffer_info()[0], len(data_array))

    for i in range(size):
        assert(data[i] == i + 1)
    return
def TestDebug():
    """
    Single-step a short program on a DebugProcessor.

    After execute(), nexti() is called four times unconditionally and then
    repeatedly until it returns None; registers from index 122 onward are
    printed after each step that did not return None.

    NOTE(review): nesting of the `if` inside the `while` was reconstructed
    from flattened source.
    """
    prgm = Program()
    code = prgm.get_stream()
    proc = DebugProcessor()

    spu.set_active_code(code)

    ra = code.acquire_register()
    rb = code.acquire_register()
    rc = code.acquire_register()
    rd = code.acquire_register()
    re = code.acquire_register()
    rf = code.acquire_register()
    rg = code.acquire_register()
    rh = code.acquire_register()

    spu.ai(ra, 0, 14)
    spu.ai(rb, 0, 13)
    spu.ai(rc, 0, 14)
    # Conditional branch on raw register number 14 with an offset of 3
    spu.brnz(14, 3)
    spu.ai(rd, 0, 15)
    spu.ai(re, 0, 16)
    spu.ai(rf, 0, 17)
    spu.ai(rg, 0, 18)
    spu.ai(rh, 0, 19)
    spu.nop(0)

    spu.stop(0x200A)

    prgm += code
    r = proc.execute(prgm) # , debug = True)

    # Step through the first four instructions
    r = proc.nexti()
    r = proc.nexti()
    r = proc.nexti()
    r = proc.nexti()

    # Keep stepping until nexti() reports None
    while r != None:
        r = proc.nexti()
        if r is not None:
            regs = proc.dump_regs()
            print '******', regs[122:]

    assert (r == None)
    print 'int result:', r
    # while True:
    # pass
    return
def TestInt():
    """
    Add 13 once to register 20 and five times to register 13, stop with
    code 0x200D, and check that execution returns (0, 0x200D).
    """
    program = Program()
    stream = program.get_stream()
    processor = Processor()
    spu.set_active_code(stream)

    r13 = program.acquire_register(reg_name=13)
    r20 = program.acquire_register(reg_name=20)

    spu.ai(r20, r20, 13)
    for _ in range(5):
        spu.ai(r13, r13, 13)
    spu.stop(0x200D)

    program += stream

    outcome = processor.execute(program, stop=True)
    assert (outcome[0] == 0)
    assert (outcome[1] == 0x200D)
    return
# Script fragment: mailbox ping loop followed by a signal read.
spu.set_active_code(code)

psmap = extarray.extarray('I', 131072 / 4)
data = extarray.extarray('I', range(0, 16))

r_sum = prgm.gp_return
r_cnt = prgm.acquire_register()

spu.xor(r_sum, r_sum, r_sum)
load_word(code, r_cnt, ITERS)

# Loop ITERS times: read the inbound mailbox, bump r_sum, and write r_sum
# to the outbound interrupt mailbox
lbl_loop = prgm.get_label("loop")
code.add(lbl_loop)

reg = dma.spu_read_in_mbox(code)

spu.ai(r_sum, r_sum, 1)
dma.spu_write_out_intr_mbox(code, r_sum)
#dma.spu_write_out_mbox(code, reg)

prgm.release_register(reg)

spu.ai(r_cnt, r_cnt, -1)
spu.brnz(r_cnt, lbl_loop)

# Copy the value read from signal 1 into the return register
reg = dma.spu_read_signal1(code)
spu.ori(code.gp_return, reg, 0)

spu.il(r_cnt, 0)
spu.il(r_sum, 16 * 4)

r_data = prgm.acquire_register()
# Script fragment: a small labelled loop written with the active-code API.
import corepy.arch.spu.lib.util as util

prgm = env.Program()
code = prgm.get_stream()
spu.set_active_code(code)

r_cnt = prgm.acquire_register()
r_cmp = prgm.acquire_register()
r_sum = prgm.acquire_register()

spu.il(r_cnt, 32)
spu.il(r_sum, 0)

lbl_loop = prgm.get_unique_label("LOOP")
code.add(lbl_loop)

spu.ai(r_sum, r_sum, 1)

# NOTE(review): r_cnt is never modified inside the loop, so as written the
# comparison against 2 cannot change between iterations -- confirm intent.
spu.ceqi(r_cmp, r_cnt, 2)
spu.brz(r_cmp, lbl_loop)

spu.ai(r_sum, r_sum, 10)

#src = prgm.acquire_register()
#tmp = prgm.acquire_registers(3)
#dst = prgm.acquire_registers(2)

#spu.il(tmp[0], 1)
#spu.il(tmp[1], 2)
#spu.il(tmp[2], 3)
#spu.fma(src, tmp[0], tmp[1], tmp[2])
#spu.fa(dst[0], src, src)
def synthesize(self, code):
    """
    Emit the logarithm evaluation for `self.x` into `code` and copy the
    value into `self.result`.

    The emitted sequence follows the C reference transcribed in the inline
    comments: split x into exponent and mantissa, normalize the mantissa
    around SQRTHF, evaluate the polynomial, then apply the LOG2EA
    conversion and add the exponent.  Constants are read from
    `self.consts`.

    Raises:
        Exception: if `x` or `result` has not been set.
    """
    # Validate first: raising after set_active_code() would leave `code`
    # installed as the global active stream.
    if self.x is None:
        raise Exception("Please set x")
    if self.result is None:
        raise Exception("Please set result")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # exponent
    e = var.Word()

    # Working values
    x = var.Word()
    y = var.Word()
    z = var.Word()

    cmp = var.Bits()
    tmp = var.Word()

    spu.xor(cmp, cmp, cmp)
    spu.xor(tmp, tmp, tmp)

    # Set the working x
    x.v = self.x

    # Extract the exponent
    # int e = (((*(unsigned int *) &x) >> 23) & 0xff) - 0x7e;
    e.v = x >> self.consts['_23']
    e.v = spu.andi.ex(e, 0xff)
    e.v = spu.ai.ex(e, 0x382)  # 0x382 == (- 0x7E) using 10 bits
    # 0b 111 1110

    # Extract the mantissa
    x.v = x & self.consts['M1']  # *(unsigned int*)&x &= 0x807fffff;
    x.v = x | self.consts['M2']  # *(unsigned int*)&x |= 0x3f000000;

    # Normalize
    # Both branches are computed and the result picked with selb; y, z and
    # tmp are reused as the branch temporaries.
    x1, x2, e1 = y, z, tmp

    # if (x < SQRTHF)
    cmp.v = spu.fcgt.ex(self.consts['SQRTHF'], x)

    # (True) { ... }
    e1.v = spu.ai.ex(e, -1)                    # e -= 1;
    x1.v = spu.fa.ex(x, x)                     # x = x + x - 1.0;
    x1.v = spu.fs.ex(x1, self.consts['ONE'])   # "" ""

    # (False) { ... }
    x2.v = spu.fs.ex(x, self.consts['ONE'])    # x = x - 1.0;

    # Select the True/False values based on cmp
    e.v = spu.selb.ex(e, e1, cmp)
    x.v = spu.selb.ex(x2, x1, cmp)

    # Compute polynomial
    z.v = spu.fm.ex(x, x)                      # z = x * x;

    y.v = spu.fms.ex(self.consts['C1'], x,     # y = (((((((( 7.0376836292E-2 * x
                     self.consts['C2'])        # - 1.1514610310E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C3'])  # + 1.1676998740E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C4'])  # - 1.2420140846E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C5'])  # + 1.4249322787E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C6'])  # - 1.6668057665E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C7'])  # + 2.0000714765E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C8'])  # - 2.4999993993E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C9'])  # + 3.3333331174E-1)
    y.v = spu.fm.ex(y, x)                      # * x
    y.v = spu.fm.ex(y, z)                      # * z;

    y.v = spu.fma.ex(self.consts['C10'], z, y)  # y += -0.5 * z;

    # Convert to log base 2
    z.v = spu.fm.ex(
        y, self.consts['LOG2EA'])                  # z = y * LOG2EA;
    z.v = spu.fma.ex(x, self.consts['LOG2EA'], z)  # z += x * LOG2EA;
    z.v = spu.fa.ex(z, y)                          # z += y;
    z.v = spu.fa.ex(z, x)                          # z += x;
    e.v = spu.csflt.ex(e, 155)                     # z += (float) e;
    z.v = spu.fa.ex(z, e)                          # "" ""

    spu.ai(self.result, z, 0)                      # return z

    spu.set_active_code(old_code)
    return
# Script: build one countdown-loop program and execute it 10000 times.
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word
import time

if __name__ == '__main__':
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()

    spu.set_active_code(code)

    r_cnt = prgm.acquire_register()
    load_word(code, r_cnt, 0x10000)

    # Remember the loop top so the branch below can use a relative offset
    br_loop = code.size()
    spu.ai(r_cnt, r_cnt, -1)
    spu.brnz(r_cnt, br_loop - code.size())

    prgm.add(code)
    prgm.print_code()

    for i in xrange(0, 10000):
        proc.execute(prgm)
        #if i % 25 == 0:
        # print "sleep"
        # time.sleep(1)
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word
import time

if __name__ == '__main__':
    # Build a single countdown program, then execute it many times in a row.
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()

    spu.set_active_code(code)

    # Register holding the remaining iteration count.
    remaining = prgm.acquire_register()
    load_word(code, remaining, 0x10000)

    # The branch target is given as the difference between the stream size
    # at the top of the loop and the size when the branch is emitted.
    top_of_loop = code.size()
    spu.ai(remaining, remaining, -1)
    spu.brnz(remaining, top_of_loop - code.size())

    prgm.add(code)
    prgm.print_code()

    for i in xrange(10000):
        proc.execute(prgm)
        # Disabled pacing between runs:
        # if i % 25 == 0:
        #     print "sleep"
        #     time.sleep(1)
# Operands below are bare integers passed straight to the spu.* instruction
# constructors (presumably raw register numbers -- confirm against the part
# of the script that defines ia, ib and fa).
fb = 124

# Scratch operands for the reciprocal sequence.
y0 = 120
y1 = 121
y2 = 122
t1 = 119

# Destination of the final multiply.
result = 118

# Integer one and its floating point conversion.
ione = 110
fone = 111

insts = [
    # Create fone = 1.0, fa = 2.0 and fb = 4.0
    spu.ai(ione, 0, 1),
    spu.ai(ia, 0, 2),
    spu.ai(ib, 0, 4),
    spu.cuflt(fone, ione, 155),
    spu.cuflt(fa, ia, 155),
    spu.cuflt(fb, ib, 155),
    # Compute 1/fb
    spu.frest(y0, fb),
    spu.fi(y1, fb, y0),
    spu.fnms(t1, fb, y1, fone),
    spu.fma(y2, t1, y1, y1),
    # result = fa * (1/fb)
    spu.fm(result, fa, y2)
]
for inst in insts:
def TestInt():
    """
    Run a short integer program and check what the processor hands back.

    The program adds 13 to register 20 once and to register 13 five times,
    then stops with code 0x200D.  The test passes when the first element of
    the execute() result is 0 and the second is 0x200D.
    """
    stream = InstructionStream()
    processor = Processor()
    spu.set_active_code(stream)

    # Request two specific registers by number.
    reg13 = stream.acquire_register(reg=13)
    reg20 = stream.acquire_register(reg=20)

    spu.ai(reg20, reg20, 13)
    for _ in range(5):
        spu.ai(reg13, reg13, 13)
    spu.stop(0x200D)

    outcome = processor.execute(stream, stop=True)

    assert outcome[0] == 0
    assert outcome[1] == 0x200D
def TestSPUIter():
    """
    Round-trip an array through the SPU, doubling every element on the way.

    A 32-word array is transferred to local store address 0 with mfc_get
    (tag 12), each 16-byte step produced by syn_iter is loaded, added to
    itself and stored back, and the buffer is written out again with
    mfc_put (tag 13).  After execution every element must equal twice its
    original value.
    """
    size = 32
    nbytes = size * 4
    data = extarray.extarray('I', range(size))

    prgm = env.Program()
    code = prgm.get_stream()

    ea_reg = prgm.acquire_register()
    lsa_reg = prgm.acquire_register()
    size_reg = prgm.acquire_register()
    tag_reg = prgm.acquire_register()

    # Effective (main memory) address of the array and transfer size in bytes.
    util.load_word(code, ea_reg, data.buffer_info()[0])
    util.load_word(code, size_reg, nbytes)

    # Inbound transfer: tag 12, local store address 0.
    code.add(spu.ai(tag_reg, code.r_zero, 12))
    code.add(spu.ai(lsa_reg, code.r_zero, 0))
    dma.mfc_get(code, lsa_reg, ea_reg, size_reg, tag_reg)

    # Select tag 12 in the tag mask and wait for the transfer to complete.
    dma.mfc_write_tag_mask(code, 1 << 12)
    dma.mfc_read_tag_status_all(code)

    # Double the data in place, one step of the synthetic iterator at a time.
    current = var.SignedWord(0, code)
    for lsa in syn_iter(code, nbytes, 16):
        code.add(spu.lqx(current, code.r_zero, lsa))
        current.v = current + current
        code.add(spu.stqx(current, code.r_zero, lsa))

    # Outbound transfer back to main memory: tag 13.
    code.add(spu.ai(tag_reg, code.r_zero, 13))
    dma.mfc_put(code, lsa_reg, ea_reg, size_reg, tag_reg)

    # Select tag 13 in the tag mask and wait for the transfer to complete.
    dma.mfc_write_tag_mask(code, 1 << 13)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for held in (ea_reg, lsa_reg, size_reg, tag_reg):
        prgm.release_register(held)

    # Execute the code
    prgm.add(code)
    proc = env.Processor()
    r = proc.execute(prgm)

    for index in range(size):
        assert data[index] == index + index
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert(r == None) print 'int result:', r # while True: # pass return
# Operands below are bare integers passed straight to the spu.* instruction
# constructors (presumably raw register numbers -- confirm against the part
# of the script that defines ia, ib and fa).
fb = 124

# Scratch operands for the reciprocal sequence.
y0 = 120
y1 = 121
y2 = 122
t1 = 119

# Destination of the final multiply.
result = 118

# Integer one and its floating point conversion.
ione = 110
fone = 111

insts = [
    # Create fone = 1.0, fa = 2.0 and fb = 4.0
    spu.ai(ione, 0, 1),
    spu.ai(ia, 0, 2),
    spu.ai(ib, 0, 4),
    spu.cuflt(fone, ione, 155),
    spu.cuflt(fa, ia, 155),
    spu.cuflt(fb, ib, 155),
    # Compute 1/fb
    spu.frest(y0, fb),
    spu.fi(y1, fb, y0),
    spu.fnms(t1, fb, y1, fone),
    spu.fma(y2, t1, y1, y1),
    # result = fa * (1/fb)
    spu.fm(result, fa, y2)
]
def TestInt():
    """
    Run a short integer program through a Program and check the result.

    Register 20 receives one add of 13 and register 13 receives five; the
    program then stops with code 0x200D.  The first element of the value
    returned by execute() must be 0 and the second must be 0x200D.
    """
    program = Program()
    stream = program.get_stream()
    processor = Processor()
    spu.set_active_code(stream)

    # Request two specific registers by number.
    reg13 = program.acquire_register(reg_name=13)
    reg20 = program.acquire_register(reg_name=20)

    spu.ai(reg20, reg20, 13)
    for _ in range(5):
        spu.ai(reg13, reg13, 13)
    spu.stop(0x200D)

    program += stream
    outcome = processor.execute(program, stop=True)

    assert outcome[0] == 0
    assert outcome[1] == 0x200D
# Build a small labelled program and hand its stream to the SPUApp GUI.
prgm = env.Program()
code = prgm.get_stream()

reg = prgm.acquire_register()
foo = prgm.acquire_register(reg_name=5)

# Straight-line prologue under the label "FOO".
code.add(prgm.get_label("FOO"))
code.add(spu.il(foo, 0xCAFE))
# Build a 32-bit constant in reg from an upper (0xDEAD) and a lower (0xBEEF)
# halfword, then store it.
code.add(spu.ilhu(reg, 0xDEAD))
code.add(spu.iohl(reg, 0xBEEF))
code.add(spu.stqd(reg, code.r_zero, 4))

lbl_loop = prgm.get_label("LOOP")
lbl_break = prgm.get_label("BREAK")

# The loop counter lives in the stream's gp_return register.
r_cnt = code.gp_return
r_stop = prgm.acquire_register(reg_name=9)
r_cmp = prgm.acquire_register()

# r_cnt = 0, r_stop = 5
code.add(spu.ori(r_cnt, code.r_zero, 0))
code.add(spu.il(r_stop, 5))

# LOOP: leave once r_cnt == r_stop, otherwise increment and repeat.
# The labels are looked up by name again here rather than reusing
# lbl_break / lbl_loop.
code.add(lbl_loop)
code.add(spu.ceq(r_cmp, r_cnt, r_stop))
code.add(spu.brnz(r_cmp, prgm.get_label("BREAK")))
code.add(spu.ai(r_cnt, r_cnt, 1))
code.add(spu.br(prgm.get_label("LOOP")))
code.add(lbl_break)

app = SPUApp(code)
app.MainLoop()
import corepy.lib.extarray as extarray import corepy.arch.spu.isa as spu import corepy.arch.spu.lib.util as util import corepy.arch.spu.platform as env prgm = env.Program() code = prgm.get_stream() proc = env.Processor() # Generate substream # Multiply gp_return by 2, add 1 subcode = prgm.get_stream() subcode.add(spu.shli(subcode.gp_return, subcode.gp_return, 1)) subcode.add(spu.ai(subcode.gp_return, subcode.gp_return, 1)) # Initialize gp_return, insert code code.add(spu.il(code.gp_return, 5)) code.add(subcode) # Add 3, insert again code.add(spu.ai(code.gp_return, code.gp_return, 3)) code.add(subcode) #code.print_code() prgm.add(code) prgm.print_code() # TODO - support print prgm instead? ret = proc.execute(prgm, mode = 'int') print "ret", ret
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = prgm.gp_return test = prgm.acquire_register() lbl_brz = prgm.get_label("BRZ") lbl_skip = prgm.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) lbl_loop = prgm.get_label("LOOP") lbl_break = prgm.get_label("BREAK") r_cnt = prgm.acquire_register() r_stop = prgm.acquire_register() r_cmp = prgm.acquire_register() r_foo = prgm.gp_return spu.ori(r_foo, prgm.r_zero, 0) spu.ori(r_cnt, prgm.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 55) return
# prgm, code and ITERS are defined earlier in the script, outside this excerpt.
spu.set_active_code(code)

# Host-side buffers: psmap has 131072 / 4 unsigned-int elements, data holds
# the values 0..15.  Neither is referenced again within this excerpt.
psmap = extarray.extarray('I', 131072 / 4)
data = extarray.extarray('I', range(0, 16))

# r_sum shares the program's return register; r_cnt counts loop iterations.
r_sum = prgm.gp_return
r_cnt = prgm.acquire_register()

spu.xor(r_sum, r_sum, r_sum)      # r_sum = 0
load_word(code, r_cnt, ITERS)     # r_cnt = ITERS

lbl_loop = prgm.get_label("loop")
code.add(lbl_loop)

# Each iteration: read the inbound mailbox, bump r_sum, and write r_sum to
# the outbound interrupt mailbox.  The value read is not used, only released.
reg = dma.spu_read_in_mbox(code)
spu.ai(r_sum, r_sum, 1)
dma.spu_write_out_intr_mbox(code, r_sum)
#dma.spu_write_out_mbox(code, reg)
prgm.release_register(reg)

# Decrement and loop while the counter is non-zero.
spu.ai(r_cnt, r_cnt, -1)
spu.brnz(r_cnt, lbl_loop)

# After the loop: read signal 1 and copy it into the return register.
reg = dma.spu_read_signal1(code)
spu.ori(code.gp_return, reg, 0)

# Re-initialise the two registers for whatever follows this excerpt.
spu.il(r_cnt, 0)
spu.il(r_sum, 16 * 4)
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg = 13) r20 = code.acquire_register(reg = 20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return