def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async=True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert (result == (i + 0x10)) proc.join(spe_id) return
def setup(self):
    """Create this object's working variables and set up the chained save op.

    Creates `_count` and `_save_value` as zero-initialized words and
    `_word_mask` as a vector whose first word is all ones and whose other
    three words are zero. If `_save_op` is set, its own `setup()` is called.
    """
    self._count = var.SignedWord(0)
    self._save_value = var.SignedWord(0)
    self._word_mask = var.SignedWord(array.array('I', [0xFFFFFFFF, 0, 0, 0]))

    # Nothing further to do without a chained save operation.
    if self._save_op is None:
        return

    self._save_op.setup()
    return
def TestStreamBufferDouble(n_spus = 1):
    """Double every word of a 2048-element array via a double-buffered stream.

    Streams the array through a `stream_buffer` created with
    `buffer_mode='double'` and `save=True`, doubling each quadword in the
    current buffer. After execution the host asserts that every element
    equals twice its index. With `n_spus > 1` a parallel program is used
    and the stream is wrapped in `parallel`.
    """
    n = 2048
    a = extarray.extarray('I', range(n))
    buffer_size = 32
    run_parallel = n_spus > 1

    if run_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                           buffer_mode='double', save=True)
    if run_parallel:
        stream = parallel(stream)

    for buf in stream:
        # Walk the buffer one 16-byte quadword at a time: load, double, store.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buf))
            current.v = current + current
            code.add(spu.stqx(current, lsa, buf))

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for idx in range(len(a)):
        assert(a[idx] == idx + idx)
    return
def TestVecIter(n_spus = 1):
    """Double every word of a 1024-element array using spu_vec_iter.

    Streams the array through a saving `stream_buffer`. For each buffer it
    iterates over the vectors described by a memory descriptor at local
    store address 0 and doubles each one in place. The host then asserts
    that every element equals twice its index.
    """
    n = 1024
    a = extarray.extarray('I', range(n))
    buffer_size = 16
    run_parallel = n_spus > 1

    if run_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if run_parallel:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)

    for buf in stream:
        # The loop variable deliberately rebinds `current`, as in the
        # original test.
        for current in spu_vec_iter(code, md):
            current.v = current + current

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for idx in range(n):
        assert(a[idx] == idx + idx)
    return
def synthesize(self, code):
    """Synthesize code that fills one line with a color and copies it out.

    The generated code stores a constant color quadword across a
    256-pixel line (1024 bytes, starting at the address held in a
    zero-initialized variable), then issues 128 `put` operations through a
    memory descriptor. The first targets `self.buffers[0]`, and the target
    address advances by `self.stride` after each one.

    Args:
        code: instruction stream to synthesize into. It is made the active
            stream for the duration of the call.

    Raises:
        Exception: if `self.buffers` or `self.stride` has not been set.
    """
    # Validate configuration before touching the global active-code state,
    # so a failed call leaves the previously active stream in place.
    if self.buffers is None:
        raise Exception('Please set buffers')
    if self.stride is None:
        raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    try:
        # Draw a square
        color = var.SignedWord(0x0F0F0FFF)

        fb0 = var.Word(self.buffers[0])
        # fb1 is not referenced below. It is still created because removing
        # it could change register allocation and the emitted code.
        fb1 = var.Word(self.buffers[1])
        stride = var.Word(self.stride)
        addr = var.Word(0)

        # Draw one line
        line_pixels = 256
        for i in spuiter.syn_iter(code, line_pixels * 4, step=16):
            spu.stqx(color, addr, i)

        # Transfer the line to the frame buffer
        md_fb = spuiter.memory_desc('I', size=line_pixels)
        md_fb.set_addr_reg(addr.reg)

        addr.v = fb0

        for i in spuiter.syn_iter(code, 128):
            md_fb.put(code, 0)
            addr.v = addr + stride
    finally:
        # Always restore the caller's active stream, even if synthesis fails.
        spu.set_active_code(old_code)

    return
def TestContinueLabel(n_spus=1):
    """Exercise branching to an iterator's continue label.

    Each vector is doubled, then compared against 4. If no word matched, the
    generated code branches to the vector iterator's `continue_label` and
    skips the second doubling. The host asserts that elements 0-3 ended up
    as `i * 4` and all later elements as `i + i`.
    """
    n = 1024
    a = extarray.extarray('I', range(n))
    buffer_size = 16
    run_parallel = n_spus > 1

    if run_parallel:
        code = env.ParallelInstructionStream()
    else:
        code = env.InstructionStream()

    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if run_parallel:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buf in stream:
        for current in lsa_iter:
            current.v = current + current

            # Gather the comparison result, then skip the second doubling
            # when it is zero.
            test.v = (current == four)
            code.add(spu.gbb(test, test))
            code.add(spu.brz(test.reg, lsa_iter.continue_label))

            current.v = current + current

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for idx in range(n):
        if idx < 4:
            # The first vector contains a match, so it is doubled twice.
            assert (a[idx] == idx * 4)
        else:
            assert (a[idx] == idx + idx)
    return
def _inc_ea(self):
    """Advance the ea/count variable by one step (used for double buffering).

    When a step register has been allocated (`self.r_step` is not None) the
    increment is taken from that register. Otherwise the value returned by
    `self.step_size()` is added directly.
    """
    if self.r_step is None:
        increment = self.step_size()
    else:
        # Wrap the existing step register in a variable so it can be added.
        increment = var.SignedWord(code=self.code, reg=self.r_step)

    self.current_count.v = self.current_count + increment
    return
def _start_post(self):
    """Create the buffer-size, local-store and tag variables for the stream.

    In 'single' buffer mode `ls` and `tag` are scalars (`self.lsa` and 1).
    In any other mode they are four-word vectors alternating between the
    two buffers (`lsa`/`lsb`) and the two tags (1/2).

    NOTE(review): the indentation of this method was lost in the source
    under review. The first-buffer load and the start-label update are
    placed inside the non-single branch because the accompanying comment
    says "For double buffering" -- confirm against the original file.
    """
    # Initialize the buffer size
    self.buffer_size = var.SignedWord(self.ibuffer_size, self.code)

    # Initialize the ls and tag vectors with (optionally) alternating values
    if self.buffer_mode == 'single':
        self.ls = var.SignedWord(self.lsa, self.code)
        self.tag = var.SignedWord(1, self.code)
    else:
        self.ls = var.SignedWord(array.array('i', [self.lsa, self.lsb, self.lsa, self.lsb]), self.code)
        self.tag = var.SignedWord(array.array('i', [1, 2, 1, 2]), self.code)

        # For double buffering, load the first buffer
        self._load_buffer()

        # Update the start label (make a new one and add it)
        self.start_label = self.code.prgm.get_unique_label("STREAM_BUFFER_START")
        self.code.add(self.start_label)

    return
def TestStreamBufferSingle(n_spus = 1):
    """Double every word of a 1024-element array via a default stream_buffer.

    Streams the array through a saving `stream_buffer` created without an
    explicit `buffer_mode`, doubling each quadword of the current buffer.
    The host then asserts that every element equals twice its index.
    """
    n = 1024
    a = extarray.extarray('I', range(n))
    buffer_size = 128
    run_parallel = n_spus > 1

    if run_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save=True)
    if run_parallel:
        stream = parallel(stream)

    for buf in stream:
        # Load, double and store one 16-byte quadword per inner iteration.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buf))
            current.v = current + current
            code.add(spu.stqx(current, lsa, buf))

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for idx in range(n):
        assert(a[idx] == idx + idx)
    return
def start(self, align=True, branch=True):
    """Emit the pre-loop initialization for this iterator.

    Acquires the count register if needed and initializes it, either by
    copying `r_start` when an external start register is in use or by
    loading a constant. In INC mode it also acquires and loads the stop
    register when `branch` is true. It then wraps the count register in
    `current_count`, loads the step into a register when it lies outside the
    open interval (-512, 511), adds the start label to the code, and creates
    the branch and continue labels without adding them.

    Args:
        align: accepted but not used by this method.
        branch: when true (INC mode only), set up the stop register.
    """
    code = self.code

    if self.r_count is None:
        self.r_count = code.acquire_register()

    mode = self.mode
    if mode == DEC:
        if self._external_start:
            # Adding an immediate 0 copies r_start into r_count.
            code.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(code, self.r_count, self.get_count())
    elif mode == INC:
        if branch and self.r_stop is None:
            self.r_stop = code.acquire_register()

        if self._external_start:
            code.add(spu.ai(self.r_count, self.r_start, 0))
        else:
            util.load_word(code, self.r_count, self.get_start())

        if branch and not self._external_stop:
            util.load_word(code, self.r_stop, self.get_count())

    if self.r_count is not None:
        self.current_count = var.SignedWord(code=code, reg=self.r_count)

    # If the step size doesn't fit in an immediate value, store it in a
    # register (-512 < word < 511).
    step = self.step_size()
    if step <= -512 or step >= 511:
        self.r_step = code.acquire_register()
        util.load_word(code, self.r_step, step)

    def _fresh_label(template):
        # Labels are made unique by a random numeric suffix.
        return code.get_label(template % random.randint(0, 2**32))

    # The start label is placed in the code immediately.
    self.start_label = _fresh_label("SYN_ITER_START_%d")
    code.add(self.start_label)

    # The branch and continue labels are only created here so they can be
    # referenced; they are added to the code at their own locations later.
    self.branch_label = _fresh_label("SYN_ITER_BRANCH_%d")
    self.continue_label = _fresh_label("SYN_ITER_CONTINUE_%d")
    return
def TestSaveBuffer1():
    """Exercise SaveBuffer.save_register and print its mailbox diagnostics.

    Builds a SaveBuffer with a 128-unit local-store buffer at address 0 and
    a main-memory buffer backed by a 2**14-element array, then synthesizes
    a loop of n // 4 iterations that saves a register holding 0xCAFEBABE.
    The program runs asynchronously. For every iteration the host reads
    three mailbox words and prints them as size, offset and test. Finally
    the first ten array elements are printed.
    """
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()
    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    # Floor division keeps the count an int even under true division
    # (`from __future__ import division` or Python 3), where `n / 4` would
    # produce a float that range() rejects.
    n_saves = n // 4

    data = array.array('I', range(n))

    save_buffer = SaveBuffer()
    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for i in spuiter.syn_iter(code, n_saves):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    def read_mbox():
        # Spin until the SPU has posted a mailbox entry, then fetch it.
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        return synspu.spu_exec.read_out_mbox(spe_id)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid on Python 3.
    for i in range(n_saves):
        print('size: 0x%X' % read_mbox())
        print('offset: 0x%X' % read_mbox())
        print('test: 0x%X' % read_mbox())

    proc.join(spe_id)

    print(data[:10])
    return
def _wait_buffer(self):
    """Emit code that waits on the DMA tag(s) selected by `self.tag`.

    Builds the mask `1 << self.tag`, writes it as the tag mask, reads the
    tag status (all), and releases the register that read returns.
    """
    # TODO - BUG HERE!!
    # The mask variable's register is intentionally NOT released. Calling
    # release_register() on the variable sets its .reg to None, but the
    # variable is still referenced by instructions already added to
    # self.code. If those instructions are rendered again later (for example
    # by print_code()), the missing register makes them impossible to
    # render. Until that is resolved the register stays allocated.
    tag_mask = var.SignedWord(1, self.code)
    tag_mask.v = tag_mask << self.tag
    dma.mfc_write_tag_mask(self.code, tag_mask)

    status_reg = dma.mfc_read_tag_status_all(self.code)
    self.code.prgm.release_register(status_reg)
    return
def _transfer_data(self, code, kernel, lsa, tag): """ Load the data into the SPU memory """ # Check the types if not isinstance(code, spe.InstructionStream): raise Exception('Code must be an InstructionStream') if not (isinstance(lsa, int) or issubclass(type(lsa), (spe.Register, spe.Variable))): raise Exception('lsa must be an integer, Register, or Variable') old_code = spu.get_active_code() spu.set_active_code(code) # Acquire registers for address and size, if they were not supplied by the user if self.r_addr is None: r_ea_data = code.prgm.acquire_register() else: r_ea_data = self.r_addr if self.r_size is None: r_size = code.prgm.acquire_register() else: r_size = self.r_size # Create variables ea_addr = var.SignedWord(reg = r_ea_data) aligned_size = var.SignedWord(0) mod_16 = var.SignedWord(0xF) # Initialize the lsa_addr variable. if isinstance(lsa, int): # From a constant ls_addr = var.SignedWord(lsa) elif issubclass(type(lsa), (spe.Register, spe.Variable)): # From a variable ls_addr = var.SignedWord() ls_addr.v = lsa tag_var = var.SignedWord(tag) cmp = var.SignedWord(0) # Load the effective address if self.r_addr is None: if self.addr % 16 != 0: print '[get_memory] Misaligned data' util.load_word(code, ea_addr, self.addr) # Load the size, rounding up as required to be 16-byte aligned if self.r_size is None: rnd_size = self.size * var.INT_SIZES[self.typecode] if rnd_size < 16: rnd_size = 16 elif (rnd_size % 16) != 0: rnd_size += (16 - (rnd_size % 16)) util.load_word(code, aligned_size, rnd_size) else: # TODO: !!! UNIT TEST THIS !!! 
# Same as above, but using SPU arithemtic to round size = var.SignedWord(reg = r_size) sixteen = var.SignedWord(16) cmp.v = ((size & mod_16) == size) aligned_size.v = size + (sixteen - (size & mod_16)) spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp.reg, order = _mi(spu.selb)) code.release_register(sixteen.reg) # Use an auxillary register for the moving ea value if the # caller supplied the address register if self.r_addr is not None: ea_load = var.SignedWord(0) ea_load.v = ea_addr else: ea_load = ea_addr # note that this is reference, not .v assignment # Transfer parameters buffer_size = var.SignedWord(16384) remaining = var.SignedWord(0) transfer_size = var.SignedWord(0) remaining.v = aligned_size # Set up the iterators to transfer at most 16k at a time xfer_iter = syn_iter(code, 0, 16384) xfer_iter.set_stop_reg(aligned_size.reg) for offset in xfer_iter: cmp.v = buffer_size > remaining spu.selb(transfer_size, buffer_size, remaining, cmp) # Transfer the data kernel(code, ls_addr, ea_load, transfer_size, tag_var) ls_addr.v = ls_addr + buffer_size ea_load.v = ea_load + buffer_size remaining.v = remaining - buffer_size # Set the tag bit to tag dma.mfc_write_tag_mask(code, 1<<tag); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # Release the registers code.release_register(buffer_size.reg) code.release_register(remaining.reg) code.release_register(aligned_size.reg) code.release_register(transfer_size.reg) code.release_register(cmp.reg) code.release_register(ls_addr.reg) code.release_register(tag_var.reg) code.release_register(ea_load.reg) if old_code is not None: spu.set_active_code(old_code) return
def setup(self):
    """Create the zero-initialized `ls_buffer` and `mm_buffer` variables."""
    # Created in this order: local-store variable first, then main memory.
    for attr_name in ('ls_buffer', 'mm_buffer'):
        setattr(self, attr_name, var.SignedWord(0))
    return
def TestTanimotoBlock(n_vecs = 4): code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) tb = TanimotoBlock() ls_save = LocalSave() mm_save = MemorySave() code.set_debug(True) # Input block parameters m = 128 n = 64 # n_vecs = 9 n_bits = 128 * n_vecs # Main memory results buffer # max_results = 2**16 max_results = 16384 words_per_result = 4 mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)]) #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I') # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data)) mm_results = spuiter.memory_desc('I') #mm_results.from_array(mm_results_buffer) mm_results.from_array(mm_results_data) mm_save.set_md_save_buffer(mm_results) # Local Results buffer buffer_size = var.SignedWord(16384) buffer_addr = var.SignedWord(m * n * n_vecs * 4) ls_results = spuiter.memory_desc('B') ls_results.set_size_reg(buffer_size) ls_results.set_addr_reg(buffer_addr) ls_save.set_md_results(ls_results) ls_save.set_mm_save_op(mm_save) # Setup the TanimotoBlock class tb.set_n_bits(n_bits) tb.set_block_size(m, n) tb.set_x_addr(0) tb.set_y_addr(m * n_vecs * 16) tb.set_save_op(ls_save) # Main test loop n_samples = 10000 for samples in spuiter.syn_iter(code, n_samples): tb.synthesize(code) spu.wrch(buffer_size, dma.SPU_WrOutMbox) spu.stop(0x2000) # "Function" Calls ls_save.block() mm_save.block() # code.print_code() start = time.time() spe_id = proc.execute(code, async=True) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) stop = time.time() # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data)) proc.join(spe_id) total = stop - start bits_sec = (m * n * n_bits * n_samples) / total / 1e9 ops_per_compare = 48 * 4 + 8 # 48 SIMD instructions, 8 scalar insts_per_compare = 56 gops = (m * n * n_vecs * n_samples * 
ops_per_compare ) / total / 1e9 ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9 print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % ( total, bits_sec, gops, ginsts, code.size()) return
def TestSPUIter():
    """Double a 32-word array on the SPU using explicit DMA and syn_iter.

    The synthesized program gets the array into local store address 0 with
    tag 12, doubles every quadword in a syn_iter loop, and puts the result
    back to main memory with tag 13. The host then asserts that every
    element equals twice its index.
    """
    size = 32
    data = extarray.extarray('I', range(size))
    get_tag = 12
    put_tag = 13

    prgm = env.Program()
    code = prgm.get_stream()

    # Same acquisition order as before: ea, ls, size, tag.
    r_ea_data, r_ls_data, r_size, r_tag = [
        prgm.acquire_register() for _ in range(4)]

    # Effective address and byte count of the array
    util.load_word(code, r_ea_data, data.buffer_info()[0])
    util.load_word(code, r_size, size * 4)

    # Tag for the inbound transfer and local store address 0
    code.add(spu.ai(r_tag, code.r_zero, get_tag))
    code.add(spu.ai(r_ls_data, code.r_zero, 0))

    # Get the data into local store, then wait on the inbound tag
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << get_tag)
    dma.mfc_read_tag_status_all(code)

    # Double each quadword in place using an SPU iterator
    current = var.SignedWord(0, code)
    for lsa in syn_iter(code, size * 4, 16):
        code.add(spu.lqx(current, code.r_zero, lsa))
        current.v = current + current
        code.add(spu.stqx(current, code.r_zero, lsa))

    # Put the values back to main memory, then wait on the outbound tag
    code.add(spu.ai(r_tag, code.r_zero, put_tag))
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << put_tag)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_ea_data, r_ls_data, r_size, r_tag):
        prgm.release_register(reg)

    # Execute the code
    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm)

    for idx in range(size):
        assert(data[idx] == idx + idx)
    return
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True): import time # n_spus = 8 # buffer_size = 16 # 16 ints/buffer # n_buffers = 4 # 4 buffers/spu # n_buffers = size / buffer_size # size = buffer_size * n_buffers * n_spus # data = array.array('I', range(size + 2)) #data = env.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16) code = env.ParallelInstructionStream() # code = env.InstructionStream() r_zero = code.acquire_register() r_ea_data = code.acquire_register() r_ls_data = code.acquire_register() r_size = code.acquire_register() r_tag = code.acquire_register() # Load zero util.load_word(code, r_zero, 0) # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0])) # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % ( # r_zero, r_ea_data, r_ls_data, r_size, r_tag) # Load the effective address if data.buffer_info()[0] % 16 == 0: util.load_word(code, r_ea_data, data.buffer_info()[0]) else: util.load_word(code, r_ea_data, data.buffer_info()[0] + 8) ea_start = data.buffer_info()[0] # Iterate over each buffer for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)): # ea = var.SignedWord(code = code, reg = r_ea_data) # print 'n_iters:', size / buffer_size # for i in syn_range(code, size / buffer_size): # code.add(spu.stop(0xB)) # Load the size util.load_word(code, r_size, buffer_size * 4) # Load the tag code.add(spu.ai(r_tag, r_zero, 12)) # Load the lsa code.add(spu.ai(r_ls_data, r_zero, 0)) # Load the data into address 0 dma.mfc_get(code, r_ls_data, ea, r_size, r_tag) # Set the tag bit to 12 dma.mfc_write_tag_mask(code, 1<<12); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # Increment the data values by 1 using an unrolled loop (no branches) # r_current = code.acquire_register() current = var.SignedWord(0, code) 
count = var.SignedWord(0, code) # Use an SPU iter for lsa in syn_iter(code, buffer_size * 4, 16): code.add(spu.lqx(current, r_zero, lsa)) # code.add(spu.ai(1, r_current, r_current)) current.v = current + current code.add(spu.stqx(current, r_zero, lsa)) count.v = count + 1 code.add(spu.stqx(count, r_zero, 0)) # code.release_register(r_current) current.release_registers(code) # Store the values back to main memory # Load the tag code.add(spu.ai(r_tag, r_zero, 13)) # Load the data into address 0 dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag) # Set the tag bit to 13 dma.mfc_write_tag_mask(code, 1<<13); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # code.add(spu.stop(0xB)) # Update ea # ea.v = ea + (buffer_size * 4) # /for ea address # Cleanup code.release_register(r_zero) code.release_register(r_ea_data) code.release_register(r_ls_data) code.release_register(r_size) code.release_register(r_tag) if not run_code: return code # Stop for debugging # code.add(spu.stop(0xA)) # Execute the code proc = env.Processor() #data.copy_from(data_array.buffer_info()[0], len(data_array)) def print_blocks(): for i in range(0, size, buffer_size): # print data[i:(i + buffer_size)] print data[i + buffer_size], print '' # print_blocks() s = time.time() r = proc.execute(code, n_spus = n_spus) # r = proc.execute(code) t = time.time() - s # print_blocks() return t