def fb_draw():
    """Animate the framebuffer forever with two alternating SPU draw programs.

    One FBDraw program is synthesized per framebuffer page; the second uses
    the same two buffer addresses as the first, in swapped order.  Each pass
    of the loop executes a program, waits for vsync and flips to the matching
    page (0 after code0, 1 after code1).

    The loop has no exit condition, so this function only ends when an
    exception (for example KeyboardInterrupt) propagates out of it.  The
    framebuffer is closed on the way out.
    """
    code0 = synspu.InstructionStream()
    code1 = synspu.InstructionStream()
    proc = synspu.Processor()

    fb = cell_fb.framebuffer()
    cell_fb.fb_open(fb)

    try:
        draw0 = FBDraw()
        draw0.set_buffers(cell_fb.fb_addr(fb, 0), cell_fb.fb_addr(fb, 1))
        draw0.set_stride(fb.stride)
        draw0.synthesize(code0)

        draw1 = FBDraw()
        draw1.set_buffers(cell_fb.fb_addr(fb, 1), cell_fb.fb_addr(fb, 0))
        draw1.set_stride(fb.stride)
        draw1.synthesize(code1)

        while True:
            # cell_fb.fb_clear(fb, 0)
            proc.execute(code0)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 0)

            # cell_fb.fb_clear(fb, 1)
            proc.execute(code1)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 1)
    finally:
        # Previously placed after the infinite loop and therefore unreachable
        cell_fb.fb_close(fb)
    return
def TestDecrementer(): code = synspu.InstructionStream() spu_write_decr(code, 0x7FFFFFFFl) spu_start_decr(code) # Get a message from the PPU spu_read_in_mbox(code) reg = spu_read_decr(code) spu_write_out_mbox(code, reg) spu_stop_decr(code) proc = synspu.Processor() spe_id = proc.execute(code, async=True) print 'test is sleeping for 1 second' time.sleep(1) synspu.spu_exec.write_in_mbox(spe_id, 0x44CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spu said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def test_stream_popc(): code = synspu.InstructionStream() proc = synspu.Processor() bits = array.array('I', range(1024)) for i in range(0, 1024, 4): bits[i] = 0x01010101 # 4 bits bits[i + 1] = 0xFFFFFFFF # 32 bits bits[i + 2] = 0x10101010 # 4 bits bits[i + 3] = 0xFF0FF0F0 # 20 bits = 60 bits total # bits[i] = 1 # bits[i+1] = 2 # bits[i+2] = 3 # bits[i+3] = 4 #abits = synspu.aligned_memory(len(bits), typecode = 'I') #abits.copy_to(bits.buffer_info()[0], len(bits)) popc = syn_popc_stream() popc.set_stream_addr(bits.buffer_info()[0]) popc.set_stream_size(len(bits)) popc.synthesize(code) count = proc.execute(code, mode='mbox') print '-->', count assert (count == 60 * 1024 / 4) return
def TestMbox(): code = synspu.InstructionStream() # Send a message to the PPU spu_write_out_mbox(code, 0xDEADBEEFl) # Get a message from the PPU reg = spu_read_in_mbox(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_in_mbox(spe_id, 0x88CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size): # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[i] = r1_range[0] ranges[4 + i] = r2_range[0] ranges[8 + i] = r1_inc ranges[12 + i] = r2_inc # Setup the pattern vector bits = _pattern2vector(pattern) # Copy the paramters to aligned buffers #a_ranges = synspu.aligned_memory(len(ranges), typecode='I') #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges)) #a_pattern = synspu.aligned_memory(len(bits), typecode='I') #a_pattern.copy_to(bits.buffer_info()[0], len(bits)) renderer = MailboxRenderer() ly_block = LyapunovBlock() ly_block.set_size(size[0], size[1]) #ly_block.set_range(a_ranges) #ly_block.set_pattern(a_pattern) ly_block.set_range(ranges) ly_block.set_pattern(bits) ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) code = synspu.InstructionStream() ly_block.synthesize(code) proc = synspu.Processor() spe_id = proc.execute(code, async=True) for i in range(size[0] * size[1]): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) # for x in range(size[0]): # r2 = r2_range[0] + r2_inc # print 'col:', x, r1, r2 # for y in range(size[1]): # results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n) # r2 += r2_inc # r1 += r1_inc return
def TestVecIter(n_spus=1):
    """Double 1024 words with a vector iterator nested in a stream buffer.

    n_spus > 1 selects a ParallelInstructionStream and a parallel stream.
    """
    n = 1024
    buffer_size = 16
    a = extarray.extarray('I', range(n))

    if n_spus > 1:
        stream_cls = env.ParallelInstructionStream
    else:
        stream_cls = env.InstructionStream
    code = stream_cls()

    # Created before the stream, as in the original; rebound by the loop below
    value = var.SignedWord(0, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if n_spus > 1:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)

    for buf in stream:
        for value in spu_vec_iter(code, md):
            value.v = value + value

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(n):
        assert a[i] == i + i
def TestTanimoto():
    """Synthesize, print and execute a 256-bit Tanimoto kernel.

    Only synthesis and execution are exercised; the result is not checked.
    """
    code = synspu.InstructionStream()
    proc = synspu.Processor()
    code.set_debug(True)

    # Two registers for each operand, one for the result
    x_regs = code.acquire_registers(2)
    y_regs = code.acquire_registers(2)
    result = code.acquire_register()

    kernel = Tanimoto()
    kernel.set_n_bits(256)
    kernel.set_x_regs(x_regs)
    kernel.set_y_regs(y_regs)
    kernel.set_result_reg(result)
    kernel.synthesize(code)

    code.print_code()
    proc.execute(code)

    # TODO: Do a real test, not just a synthesis test
def TestSetSlotValue():
    """Fill the four word slots of a register and read them back by mailbox.

    Slots 0 and 2 are set from literals, slots 1 and 3 from other variables.
    The host expects to receive 0x10, 0x11, 0x12, 0x13 in that order.
    """
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.isa as spu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    code = synspu.InstructionStream()
    proc = synspu.Processor()
    spu.set_active_code(code)

    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    r = var.SignedWord(0xFFFFFFFF)

    for slot, value in enumerate((0x10, a, 0x12, b)):
        set_slot_value(code, r, slot, value)

    # Send the register over the mailbox, then rotate it by 4 bytes; repeat
    # once per slot.
    for _ in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    spe_id = proc.execute(code, mode='async')

    spu_exec = synspu.spu_exec
    for expected in range(0x10, 0x14):
        while spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = spu_exec.read_out_mbox(spe_id)
        assert result == expected

    proc.join(spe_id)
def TestAll():
    """Synthesize one call of each helper instruction, print the code and run it.

    This is a smoke test: no results are checked.
    """
    import corepy.arch.spu.platform as env

    code = env.InstructionStream()
    spu.set_active_code(code)

    a = code.acquire_register()
    b = code.acquire_register()
    c = code.acquire_register()

    # (helper, last operand) in emission order; every call is helper(c, a, x)
    calls = (
        (shr, b),
        (cneq, b),
        (cge, b),
        (cgei, 10),
        (lt, b),
        (lti, 10),
        (a_immediate, 10),
        (a_immediate, 10000),
        (sf_immediate, 10000),
    )
    for helper, operand in calls:
        helper(c, a, operand)

    code.print_code()

    proc = env.Processor()
    proc.execute(code)
def DoubleBufferExample(n_spus=6): """ stream_buffer is an iterator that streams data from main memory to SPU local store in blocked buffers. The buffers can be managed using single or double buffering semantics. The induction variable returned by the buffer returns the address of the current buffer. Note: stream_buffer was designed before memory descriptors and has not been updated to support them yet. The interface will change slightly when the memory classes are finalized. """ n = 30000 buffer_size = 16 # Create an array and align the data a = extarray.extarray('I', range(n)) addr = a.buffer_info()[0] n_bytes = n * 4 if n_spus > 1: code = env.ParallelInstructionStream() else: code = env.InstructionStream() current = SignedWord(0, code) two = SignedWord(2, code) # Create the stream buffer, parallelizing it if using more than 1 SPU stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save=True) if n_spus > 1: stream = parallel(stream) # Loop over the buffers for buffer in stream: # Create an iterators that computes the address offsets within the # buffer. Note: this will be supported by var/vec iters soon. for lsa in syn_iter(code, buffer_size, 16): code.add(spu.lqx(current, lsa, buffer)) current.v = current - two code.add(spu.stqx(current, lsa, buffer)) # Run the synthetic program and copy the results back to the array proc = env.Processor() r = proc.execute(code, n_spus=n_spus) for i in range(2, len(a)): try: assert (a[i] == i - 2) except: print 'DoubleBuffer error:', a[i], i - 2 return
def TestFloats():
    """Compute SPULog over a short run of floats and print it beside math.log(x, 2).

    The SPU steps a single-precision value by .01 per iteration and mails one
    result per step; the host prints each raw word, its float decoding and
    the host-side reference.
    """
    import math

    code = synspu.InstructionStream()
    proc = synspu.Processor()
    spu.set_active_code(code)
    code.set_debug(True)

    # Host-side sweep
    start = .65
    stop = .75
    inc = .01
    n_steps = int((stop - start) / inc)

    # SPU-side values, given as raw single-precision bit patterns
    sp_step = 0x3C23D70A              # .01 in single precision
    r_current = var.Word(0x3F266666)  # .65, i.e. `start`
    r_step = var.Word(sp_step)
    result = var.Word(0)

    log = SPULog()
    log.setup(code)
    log.set_result(result)
    log.set_x(r_current)

    # One log evaluation, one step and one mailbox write per iteration
    for _ in syn_iter(code, n_steps):
        log.synthesize(code)
        spu.fa(r_current, r_current, r_step)
        spu.wrch(result, dma.SPU_WrOutMbox)

    # code.print_code()
    spe_id = proc.execute(code, mode='async')

    spu_exec = synspu.spu_exec
    x = start
    for _ in range(n_steps):
        while spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        slog = spu_exec.read_out_mbox(spe_id)
        print('%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)))
        x += inc

    proc.join(spe_id)
def __init__(self):
    """Set up the instruction stream and register buffer, then synthesize."""
    # Runtime parameters start out unset
    self.speid = None
    self.reg_lsa = None
    self.proc = None

    # Code and memory buffers.  128 * 4 words -- presumably four words for
    # each of 128 registers (TODO confirm).
    self.code = env.InstructionStream()
    register_words = 128 * 4
    self.regs = extarray.extarray('I', register_words)
    self.regs.clear()

    self.synthesize()
def TestSaveBuffer1():
    """Save one register n / 4 times through a SaveBuffer and print what the SPU mails.

    For every save the host reads three mailbox words, printed as size,
    offset and test.  The first ten words of the main-memory array are
    printed at the end.
    """
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()
    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    n_saves = n // 4
    data = array.array('I', range(n))

    # Local-store buffer at 0 with size 128; main-memory buffer is `data`
    save_buffer = SaveBuffer()
    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for _ in spuiter.syn_iter(code, n_saves):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    spu_exec = synspu.spu_exec
    report_formats = ('size: 0x%X', 'offset: 0x%X', 'test: 0x%X')
    for _ in range(n_saves):
        for fmt in report_formats:
            while spu_exec.stat_out_mbox(spe_id) == 0:
                pass
            print(fmt % (spu_exec.read_out_mbox(spe_id)))

    proc.join(spe_id)

    print(data[:10])
def TestContinueLabel(n_spus=1):
    """Exercise a branch to the vector iterator's continue label.

    Each vector is doubled, compared with 4, and the second doubling is
    skipped via a brz to `continue_label`.  The host check expects the first
    four words to be multiplied by 4 and all others by 2.
    """
    n = 1024
    buffer_size = 16
    a = extarray.extarray('I', range(n))

    if n_spus > 1:
        stream_cls = env.ParallelInstructionStream
    else:
        stream_cls = env.InstructionStream
    code = stream_cls()

    value = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if n_spus > 1:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buf in stream:
        for value in lsa_iter:
            value.v = value + value

            # Compare against 4 and gather the per-slot result bits
            test.v = (value == four)
            code.add(spu.gbb(test, test))

            # Jump to the iterator's continue label when no slot matched;
            # the add_continue() placeholder mechanism is no longer used.
            code.add(spu.brz(test.reg, lsa_iter.continue_label))
            value.v = value + value

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(n):
        if i < 4:
            assert a[i] == i * 4
        else:
            assert a[i] == i + i
def test_syn(kernel):
    """Synthesize `kernel()` and check it counts 60 set bits in four parameter words."""
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    popc = kernel()
    popc.synthesize(code)

    # 4 + 32 + 4 + 20 = 60 bits across p7..p10
    params = synspu.spu_exec.ExecParams()
    params.p7 = 0x01010101
    params.p8 = 0xFFFFFFFF
    params.p9 = 0x10101010
    params.p10 = 0xFF0FF0F0

    count = proc.execute(code, mode='mbox', params=params)

    expected_bits = 60
    assert count == expected_bits
def TestStreamBufferSingle(n_spus=1):
    """Double 1024 words in place through a stream buffer with 128-byte buffers.

    n_spus > 1 selects a ParallelInstructionStream and a parallel stream.
    """
    n = 1024
    buffer_size = 128
    a = extarray.extarray('I', range(n))

    if n_spus > 1:
        stream_cls = env.ParallelInstructionStream
    else:
        stream_cls = env.InstructionStream
    code = stream_cls()

    quad = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save=True)
    if n_spus > 1:
        stream = parallel(stream)

    # Earlier revisions carried a hand-written version of the inner loop
    # (explicit lqx / a / stqx with a brnz back-edge) as commented-out code;
    # syn_iter is used for it here.
    for buf in stream:
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(quad, lsa, buf))
            quad.v = quad + quad
            code.add(spu.stqx(quad, lsa, buf))

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(n):
        assert a[i] == i + i
def TestLog():
    """Compute SPULog for 1, 10, ..., 10**9 on the SPU and print the raw results."""
    code = synspu.InstructionStream()
    proc = synspu.Processor()
    spu.set_active_code(code)

    log = SPULog()
    result = code.acquire_register()

    N = 10
    inputs = [10 ** k for k in range(N)]

    # Load each integer and convert it to float in place
    values = []
    for x in inputs:
        val = var.Word(x)
        spu.cuflt(val, val, 155)
        values.append(val)

    log.setup(code)
    log.set_result(result)

    # One log evaluation and one mailbox write per input
    for val in values:
        log.set_x(val)
        log.synthesize(code)
        spu.wrch(result, dma.SPU_WrOutMbox)

    spe_id = proc.execute(code, mode='async')

    spu_exec = synspu.spu_exec
    for x in inputs:
        while spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print('log said: 0x%08X  (%d)' % (spu_exec.read_out_mbox(spe_id), x))

    proc.join(spe_id)
def _startSPU(self):
    """Allocate an SPU context, run a no-op stream on it and map its local store.

    The context is stored in self.ctx and the local store is exposed as
    self.localstore, an extarray of words backed by ctx.spuls.
    """
    ls_size = 0x40000  # 262144

    ctx = env.spu_exec.alloc_context()
    self.ctx = ctx

    # Execute a no-op instruction stream so the prolog is executed
    code = env.InstructionStream()
    code.add(spu.nop(code.r_zero))
    code.cache_code()

    # Code size in bytes, rounded up to a multiple of 16
    rendered = code.render_code
    code_len = len(rendered) * rendered.itemsize
    remainder = code_len % 16
    if remainder != 0:
        code_len += 16 - remainder

    # Place the code at the very end of the address range
    code_lsa = ls_size - code_len
    env.spu_exec.run_stream(ctx, code.inst_addr(), code_len, code_lsa, code_lsa)

    self.localstore = extarray.extarray('I', ls_size // 4)
    self.localstore.set_memory(ctx.spuls)
def GenerateStream(self, step=None):
    """Build an instruction stream from the text in the edit control.

    Each editor line is stripped and classified as blank/comment (starts
    with '#'), label (ends with ':') or instruction.

    step -- when not None, only line `step` is emitted as a real
            instruction; every other instruction line becomes
            spu.stop(0x2FFF).  Labels are still emitted and blank/comment
            lines are skipped.

    Lines with a breakpoint set (editCtrl.IsBreakSet) are emitted as
    spu.stop(0x2FFF).  An instruction that cannot be constructed is reported
    and skipped.

    Returns the InstructionStream after cache_code() has been called on it.
    """
    code = env.InstructionStream()
    lines = self.editCtrl.GetText().split('\n')

    for i, line in enumerate(lines):
        cmd = line.strip()
        is_blank = cmd == "" or cmd[0] == '#'
        is_label = (not is_blank) and cmd[-1] == ":"

        # For the stop case, want all instructions except the current one
        # to be STOP instructions.
        if step is not None and i != step:
            if is_blank:
                continue
            if is_label:
                # Label - better parsing?
                code.add(code.get_label(cmd[:-1]))
            else:
                code.add(spu.stop(0x2FFF))
            continue

        if self.editCtrl.IsBreakSet(i):
            code.add(spu.stop(0x2FFF))
            continue

        if is_blank:
            continue

        if is_label:
            # Label - better parsing?
            inst = code.get_label(cmd[:-1])
        else:
            # Instruction: rewrite Label(name) into a lookup on this stream
            strcmd = re.sub(r"Label\((.*?)\)", "code.get_label('\\1')", cmd)

            # NOTE(review): eval() executes whatever is in the editor.  That
            # is acceptable only while the text comes from the local user.
            try:
                inst = eval('spu.%s' % strcmd)
            except Exception:
                print('Error creating instruction: %s' % cmd)
                # Skip the line; `inst` would be unbound or left over from
                # the previous line.
                continue

        code.add(inst)

    code.cache_code()
    return code
def TestSignal(): code = synspu.InstructionStream() # Get a signal from the PPU reg = spu_read_signal1(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_signal(spe_id, 1, 0xCAFEBABEl) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'sig said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestStreamBufferDouble(n_spus=1):
    """Double 2048 words in place through a double-buffered stream buffer.

    n_spus > 1 selects a ParallelInstructionStream and a parallel stream.
    """
    n = 2048
    buffer_size = 32
    a = extarray.extarray('I', range(n))

    if n_spus > 1:
        stream_cls = env.ParallelInstructionStream
    else:
        stream_cls = env.InstructionStream
    code = stream_cls()

    quad = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                           buffer_mode='double', save=True)
    if n_spus > 1:
        stream = parallel(stream)

    for buf in stream:
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(quad, lsa, buf))
            quad.v = quad + quad
            code.add(spu.stqx(quad, lsa, buf))

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(len(a)):
        assert a[i] == i + i
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import corepy.lib.extarray as extarray import corepy.arch.spu.isa as spu import corepy.arch.spu.platform as env import corepy.arch.spu.lib.dma as dma from corepy.arch.spu.lib.util import load_word import time if __name__ == '__main__': code = env.InstructionStream() proc = env.Processor() spu.set_active_code(code) r_cnt = code.acquire_register() load_word(code, r_cnt, 0x10000) br_loop = code.size() spu.ai(r_cnt, r_cnt, -1) spu.brnz(r_cnt, br_loop - code.size()) code.print_code() for i in xrange(0, 10000):
def MemoryDescExample(data_size=20000): """ This example uses a memory descriptor to move 20k integers back and forth between main memory and the SPU local store. Each value is incremented by 1 while on the SPU. Memory descriptors are a general purpose method for describing a region of memory. Memory is described by a typecode, address, and size. Memory descriptors can be initialized by hand or from an array or buffer object. For main memory, memory descriptors are useful for transfering data between main memory and an SPU's local store. The get/put methods on a memory descriptor generate the SPU code to move data of any size between main memory and local store. Memory descriptors can also be used with spu_vec_iters to describe the region of memory to iterate over. The typecode in the memory descriptor is used to determine the type for the loop induction variable. Note that there is currently no difference between memory descriptors for main memory and local store. It's up to the user to make sure the memory descriptor settings make sense in the current context. (this will probably change in the near future) Note: get/put currently use loops rather than display lists for transferring data over 16k. 
""" code = env.InstructionStream() proc = env.Processor() code.debug = True spu.set_active_code(code) # Create a python array data = extarray.extarray('I', range(data_size)) # Align the data in the array #a_data = aligned_memory(data_size, typecode = 'I') #a_data.copy_to(data.buffer_info()[0], data_size) # Create memory descriptor for the data in main memory data_desc = memory_desc('I') #data_desc.from_array(a_data) data_desc.from_array(data) # Transfer the data to 0x0 in the local store data_desc.get(code, 0) # Create memory descriptor for the data in the local store for use # in the iterator lsa_data = memory_desc('i', 0, data_size) # Add one to each value for x in spu_vec_iter(code, lsa_data): x.v = x + 1 # Transfer the data back to main memory data_desc.put(code, 0) dma.spu_write_out_mbox(code, 0xCAFE) # Execute the synthetic program # code.print_code() spe_id = proc.execute(code, async=True) proc.join(spe_id) # Copy it back to the Python array #a_data.copy_from(data.buffer_info()[0], data_size) for i in xrange(data_size): assert (data[i] == i + 1) return
def TestSPUIter():
    """DMA 32 words into local store, double them with syn_iter, and DMA them back.

    The inbound transfer uses tag 12 and the outbound transfer tag 13; each
    is waited on before the program continues.
    """
    size = 32
    n_bytes = size * 4
    get_tag = 12
    put_tag = 13

    data = extarray.extarray('I', range(size))
    code = env.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Constants: zero, the array's effective address, the byte count
    util.load_word(code, r_zero, 0)
    util.load_word(code, r_ea_data, data.buffer_info()[0])
    util.load_word(code, r_size, n_bytes)

    # Inbound tag and local-store address 0
    code.add(spu.ai(r_tag, r_zero, get_tag))
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Main memory -> local store, then wait on the inbound tag
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << get_tag)
    dma.mfc_read_tag_status_all(code)

    # Double every quadword, walking the local-store offsets with syn_iter
    quad = var.SignedWord(0, code)
    for lsa in syn_iter(code, n_bytes, 16):
        code.add(spu.lqx(quad, r_zero, lsa))
        quad.v = quad + quad
        code.add(spu.stqx(quad, r_zero, lsa))

    # Local store -> main memory under the outbound tag, then wait on it
    code.add(spu.ai(r_tag, r_zero, put_tag))
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << put_tag)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_zero, r_ea_data, r_ls_data, r_size, r_tag):
        code.release_register(reg)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    proc = env.Processor()
    proc.execute(code)

    for i in range(size):
        assert data[i] == i + i
def TestTanimotoBlock(n_vecs = 4): code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) tb = TanimotoBlock() ls_save = LocalSave() mm_save = MemorySave() code.set_debug(True) # Input block parameters m = 128 n = 64 # n_vecs = 9 n_bits = 128 * n_vecs # Main memory results buffer # max_results = 2**16 max_results = 16384 words_per_result = 4 mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)]) #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I') # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data)) mm_results = spuiter.memory_desc('I') #mm_results.from_array(mm_results_buffer) mm_results.from_array(mm_results_data) mm_save.set_md_save_buffer(mm_results) # Local Results buffer buffer_size = var.SignedWord(16384) buffer_addr = var.SignedWord(m * n * n_vecs * 4) ls_results = spuiter.memory_desc('B') ls_results.set_size_reg(buffer_size) ls_results.set_addr_reg(buffer_addr) ls_save.set_md_results(ls_results) ls_save.set_mm_save_op(mm_save) # Setup the TanimotoBlock class tb.set_n_bits(n_bits) tb.set_block_size(m, n) tb.set_x_addr(0) tb.set_y_addr(m * n_vecs * 16) tb.set_save_op(ls_save) # Main test loop n_samples = 10000 for samples in spuiter.syn_iter(code, n_samples): tb.synthesize(code) spu.wrch(buffer_size, dma.SPU_WrOutMbox) spu.stop(0x2000) # "Function" Calls ls_save.block() mm_save.block() # code.print_code() start = time.time() spe_id = proc.execute(code, async=True) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) stop = time.time() # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data)) proc.join(spe_id) total = stop - start bits_sec = (m * n * n_bits * n_samples) / total / 1e9 ops_per_compare = 48 * 4 + 8 # 48 SIMD instructions, 8 scalar insts_per_compare = 56 gops = (m * n * n_vecs * n_samples * 
ops_per_compare ) / total / 1e9 ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9 print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % ( total, bits_sec, gops, ginsts, code.size()) return
def TestMFC():
    """DMA 32 words into local store, add 1 to each with unrolled code, and DMA them back.

    Both transfers use tag 2.  The array address and the register
    assignments are printed while the program is being built.
    """
    size = 32
    n_bytes = size * 4
    tag = 2

    # The array is used directly, without an aligned_memory staging copy
    data = extarray.extarray('I', range(size))
    code = synspu.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    print('array ea: %X' % (data.buffer_info()[0]))
    print('r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
        str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag)))

    # Effective address of the array
    print('test ea: %X' % data.buffer_info()[0])
    util.load_word(code, r_ea_data, data.buffer_info()[0])

    # Byte count, tag and local-store address 0, each as zero + immediate
    code.add(spu.ai(r_size, r_zero, n_bytes))
    code.add(spu.ai(r_tag, r_zero, tag))
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Main memory -> local store, then wait on the tag
    mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    mfc_write_tag_mask(code, 1 << tag)
    mfc_read_tag_status_all(code)

    # Add 1 to every quadword using an unrolled loop (no branches)
    r_current = code.acquire_register()
    for lsa in range(0, n_bytes, 16):
        word_addr = lsa >> 2
        code.add(spu.lqa(r_current, word_addr))
        code.add(spu.ai(r_current, r_current, 1))
        code.add(spu.stqa(r_current, word_addr))
    code.release_register(r_current)

    # Local store -> main memory, then wait on the tag
    mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    mfc_write_tag_mask(code, 1 << tag)
    mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_zero, r_ea_data, r_ls_data, r_size, r_tag):
        code.release_register(reg)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    proc = synspu.Processor()
    # code.print_code()
    proc.execute(code)

    for i in range(size):
        assert data[i] == i + 1
def generate(self, results, patterns, r1_range, r2_range, max_init, max_n, size, n_spus=6): # Connect to the framebuffer #fb = cell_fb.framebuffer() #cell_fb.fb_open(fb) buffer = extarray.extarray('B', size[0] * size[1] * 4) buffer.clear() # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = [0 for i in range(n_spus)] #a_ranges = [0 for i in range(n_spus)] # Slice and dice for parallel execution spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)] spu_slices[-1][1] += size[1] % n_spus offset = 0.0 for ispu in range(n_spus): ranges[ispu] = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[ispu][ i] = r1_range[0] + float(i) * r1_inc # horizontal is simd ranges[ispu][4 + i] = r2_range[0] + offset ranges[ispu][8 + i] = r1_inc * 4.0 ranges[ispu][12 + i] = r2_inc # print ranges # Copy the paramters to aligned buffers #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I') #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu])) offset += r2_inc * spu_slices[ispu][1] # Setup the pattern vector for pattern in patterns: if len(pattern) != len(patterns[0]): raise Exception('All patterns must be the same length') bits = [_pattern2vector(pattern) for pattern in patterns] #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I') pattern = extarray.extarray('I', len(bits[0])) # Create the instruction streams codes = [] n = len(patterns) * 10 offset = 0 for ispu in range(n_spus): renderer = FBRenderer() renderer.set_lsa(0x100) #renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset) renderer.set_addr(buffer.buffer_info()[0] + offset) renderer.set_width(size[0]) #renderer.set_stride(fb.stride) renderer.set_stride(size[0]) ly_block = LyapunovBlock() ly_block.set_size(*spu_slices[i]) #ly_block.set_range(a_ranges[ispu]) ly_block.set_range(ranges[ispu]) #ly_block.set_pattern(a_pattern) ly_block.set_pattern(pattern) 
ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) code = synspu.InstructionStream() # code.set_debug(True) codes.append(code) #offset += spu_slices[i][1] * fb.stride * 4 offset += spu_slices[i][1] * size[0] * 4 # for i in spuiter.syn_range(code, n): ly_block.synthesize(code) # code.print_code() proc = synspu.Processor() #cell_fb.fb_clear(fb, 0) buffer.clear() import time ids = [0 for i in range(n_spus)] start = time.time() ipattern = 0 n_patterns = len(patterns) len_bits = len(bits[0]) pattern_inc = 1 for i in range(n): #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits) # TODO - better/faster for j in xrange(0, len_bits): pattern[j] = bits[ipattern][j] for ispu in range(n_spus): ids[ispu] = proc.execute(codes[ispu], async=True) for ispu in range(n_spus): proc.join(ids[ispu]) #cell_fb.fb_wait_vsync(fb) #cell_fb.fb_flip(fb, 0) # TODO - write buffer to image file #im = Image.frombuffer("RGBA", size, buffer.tostring(), "raw", "RGBA", 0, 1) imgbuf = Image.new("RGBA", size) arr = [(buffer[i + 3], buffer[i + 2], buffer[i + 1], 0xFF) for i in xrange(0, len(buffer), 4)] imgbuf.putdata(arr) imgbuf.save("lyapunov_%d.png" % ipattern) ipattern += pattern_inc if (ipattern == (n_patterns - 1)) or (ipattern == 0): pattern_inc *= -1 print ipattern stop = time.time() print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start)) #cell_fb.fb_close(fb) return