def _wait_buffer(self):
    """
    Emit, into self.code, the instructions that wait on DMA tag self.tag.

    A mask with only bit self.tag set is written as the MFC tag mask and the
    tag status is then read with the "all" variant.  The register returned
    by that read is handed straight back to the program's allocator.
    """
    # Build the mask (1 << tag) in a fresh variable
    tag_mask = var.SignedWord(1, self.code)
    tag_mask.v = tag_mask << self.tag

    dma.mfc_write_tag_mask(self.code, tag_mask)

    # The status value itself is not needed, only the wait
    status_reg = dma.mfc_read_tag_status_all(self.code)
    self.code.prgm.release_register(status_reg)

    # TODO - BUG HERE!!
    # The register behind tag_mask is intentionally NOT released.  Once this
    # code is finished with the variable it would normally call
    # tag_mask.release_register() to give back the underlying register,
    # which is no longer needed.  But release_register() sets the variable's
    # reg to None.  Although it looks like tag_mask goes out of scope here
    # and is garbage collected, it is not: self.code still refers to it,
    # because instructions that reference it have been added.  If those
    # instructions ever need to be rendered again -- say, for print_code() --
    # the register is None, which makes it impossible to render the
    # instruction.
    #tag_mask.release_register()
    return
def TestSPUIter():
    """
    Round-trip test for syn_iter.

    An array holding 0..31 (typecode 'I') is transferred into local store
    with a DMA GET, every value is added to itself on the SPU, and the
    result is written back with a DMA PUT.  The test passes when
    data[i] == i + i for every element.
    """
    size = 32
    n_bytes = size * 4
    get_tag, put_tag = 12, 13

    data = extarray.extarray('I', range(size))
    prgm = env.Program()
    code = prgm.get_stream()

    # Registers, acquired in this order: effective address, local store
    # address, transfer size, DMA tag
    regs = [prgm.acquire_register() for _ in range(4)]
    r_ea_data, r_ls_data, r_size, r_tag = regs

    # Set up the GET parameters
    util.load_word(code, r_ea_data, data.buffer_info()[0])  # effective address
    util.load_word(code, r_size, n_bytes)                   # size in bytes
    code.add(spu.ai(r_tag, code.r_zero, get_tag))           # tag
    code.add(spu.ai(r_ls_data, code.r_zero, 0))             # local store address 0

    # Pull the data into local store and wait for the transfer to complete
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << get_tag)
    dma.mfc_read_tag_status_all(code)

    # Double the values in place: load 16 bytes at each offset produced by
    # the SPU iterator, add the register to itself, store it back
    current = var.SignedWord(0, code)
    for lsa in syn_iter(code, n_bytes, 16):
        code.add(spu.lqx(current, code.r_zero, lsa))
        current.v = current + current
        code.add(spu.stqx(current, code.r_zero, lsa))

    # Push the values back to main memory under a different tag and wait
    code.add(spu.ai(r_tag, code.r_zero, put_tag))
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << put_tag)
    dma.mfc_read_tag_status_all(code)

    # Cleanup, in acquisition order
    for reg in regs:
        prgm.release_register(reg)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm)

    for i in range(size):
        assert data[i] == i + i
    return
def _transfer_data(self, code, kernel, lsa, tag):
    """
    Load the data into the SPU memory

    Synthesizes into `code` a loop that hands the region described by this
    object to `kernel` in pieces of at most 16384 bytes, then writes the tag
    mask for `tag` and reads the tag status to wait for completion.

    code   -- spe.InstructionStream that receives the instructions
    kernel -- callable, invoked once inside the synthesized loop as
              kernel(code, ls_addr, ea, transfer_size, tag_var)
    lsa    -- local store address: an int, or an spe.Register / spe.Variable
    tag    -- integer DMA tag; used as a value and as a bit index (1 << tag)

    The address comes from self.r_addr when set, otherwise from self.addr.
    The size comes from self.r_size when set, otherwise from
    self.size * var.INT_SIZES[self.typecode].

    Raises Exception when `code` or `lsa` has an unsupported type.
    """
    # Check the types
    if not isinstance(code, spe.InstructionStream):
        raise Exception('Code must be an InstructionStream')

    lsa_is_const = isinstance(lsa, int)
    if not (lsa_is_const or
            issubclass(type(lsa), (spe.Register, spe.Variable))):
        raise Exception('lsa must be an integer, Register, or Variable')

    # Registers are acquired from code.prgm, so release them there as well
    release = code.prgm.release_register

    old_code = spu.get_active_code()
    spu.set_active_code(code)
    try:
        # Acquire a register for the address if the user did not supply one.
        # No register is acquired for the size: when self.r_size is None the
        # size is a host-side constant loaded directly into aligned_size.
        if self.r_addr is None:
            r_ea_data = code.prgm.acquire_register()
        else:
            r_ea_data = self.r_addr

        # Create variables
        ea_addr = var.SignedWord(reg = r_ea_data)
        aligned_size = var.SignedWord(0)

        # Initialize the ls_addr variable
        if lsa_is_const:
            # From a constant
            ls_addr = var.SignedWord(lsa)
        else:
            # From a register or variable
            ls_addr = var.SignedWord()
            ls_addr.v = lsa

        tag_var = var.SignedWord(tag)
        cmp_mask = var.SignedWord(0)

        # Load the effective address
        if self.r_addr is None:
            if self.addr % 16 != 0:
                print('[get_memory] Misaligned data')
            util.load_word(code, ea_addr, self.addr)

        # Load the size, rounding up as required to be 16-byte aligned
        if self.r_size is None:
            rnd_size = self.size * var.INT_SIZES[self.typecode]
            if rnd_size < 16:
                rnd_size = 16
            elif (rnd_size % 16) != 0:
                rnd_size += (16 - (rnd_size % 16))
            util.load_word(code, aligned_size, rnd_size)
        else:
            # TODO: !!! UNIT TEST THIS !!!
            # Same as above, but using SPU arithmetic to round
            size = var.SignedWord(reg = self.r_size)
            sixteen = var.SignedWord(16)
            mod_16 = var.SignedWord(0xF)

            # NOTE(review): the host-side branch leaves an already aligned
            # size untouched, which would correspond to testing
            # (size & 0xF) == 0; this compares against size itself.  Left
            # unchanged -- confirm the intent before touching it.
            cmp_mask.v = ((size & mod_16) == size)
            aligned_size.v = size + (sixteen - (size & mod_16))
            spu.selb(aligned_size.reg, size.reg, aligned_size.reg,
                     cmp_mask.reg, order = _mi(spu.selb))

            # Temporaries of this branch only; size wraps the caller's
            # register and is therefore not released
            release(sixteen.reg)
            release(mod_16.reg)

        # Use an auxiliary register for the moving ea value if the
        # caller supplied the address register
        if self.r_addr is not None:
            ea_load = var.SignedWord(0)
            ea_load.v = ea_addr
        else:
            ea_load = ea_addr  # note that this is a reference, not .v assignment

        # Transfer parameters
        buffer_size = var.SignedWord(16384)
        remaining = var.SignedWord(0)
        transfer_size = var.SignedWord(0)
        remaining.v = aligned_size

        # Set up the iterator to transfer at most 16k at a time
        xfer_iter = syn_iter(code, 0, 16384)
        xfer_iter.set_stop_reg(aligned_size.reg)

        for _offset in xfer_iter:
            # transfer_size = remaining if buffer_size > remaining,
            # else buffer_size
            cmp_mask.v = buffer_size > remaining
            spu.selb(transfer_size, buffer_size, remaining, cmp_mask)

            # Transfer the data, then advance both addresses
            kernel(code, ls_addr, ea_load, transfer_size, tag_var)
            ls_addr.v = ls_addr + buffer_size
            ea_load.v = ea_load + buffer_size
            remaining.v = remaining - buffer_size

        # Set the tag bit to tag
        dma.mfc_write_tag_mask(code, 1 << tag)

        # Wait for the transfer to complete
        dma.mfc_read_tag_status_all(code)

        # Release the registers.  ea_load is either the auxiliary register
        # or the address register acquired above, never the caller's.
        for tmp in (buffer_size, remaining, aligned_size, transfer_size,
                    cmp_mask, ls_addr, tag_var, ea_load):
            release(tmp.reg)
    finally:
        # Restore the previously active stream even if synthesis failed
        if old_code is not None:
            spu.set_active_code(old_code)

    return
def TestSPUParallelIter(data, size, n_spus=6, buffer_size=16, run_code=True):
    """
    Build (and optionally run) a parallel SPU program that doubles `data`.

    The address range of `data` is split into buffers that are walked with
    parallel(syn_range(...)).  For each buffer the program issues a DMA GET
    into local store address 0, adds every loaded value to itself, stores an
    iteration counter at local store address 0, and issues a DMA PUT back to
    the same effective address.

    data        -- object whose buffer_info()[0] is the main memory address
    size        -- number of elements; size * 4 is used as the byte length
    n_spus      -- forwarded to Processor.execute
    buffer_size -- elements per buffer; buffer_size * 4 bytes per transfer
    run_code    -- when false, return the instruction stream unexecuted

    Returns the wall-clock seconds spent in proc.execute, or the instruction
    stream when run_code is false.
    """
    import time

    get_tag, put_tag = 12, 13
    buffer_bytes = buffer_size * 4
    ea_start = data.buffer_info()[0]

    code = env.ParallelInstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    # Load the effective address, moved up by 8 when it is not a multiple
    # of 16.
    # NOTE(review): nothing below reads r_ea_data; the transfers use the
    # iterator value `ea`, which starts at the unadjusted ea_start.
    if ea_start % 16 == 0:
        util.load_word(code, r_ea_data, ea_start)
    else:
        util.load_word(code, r_ea_data, ea_start + 8)

    # Iterate over each buffer
    for ea in parallel(syn_range(code, ea_start, ea_start + size * 4,
                                 buffer_bytes)):
        # GET parameters: size, tag, local store address 0
        util.load_word(code, r_size, buffer_bytes)
        code.add(spu.ai(r_tag, r_zero, get_tag))
        code.add(spu.ai(r_ls_data, r_zero, 0))

        # Load the data into address 0 and wait for the transfer to complete
        dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)
        dma.mfc_write_tag_mask(code, 1 << get_tag)
        dma.mfc_read_tag_status_all(code)

        current = var.SignedWord(0, code)
        count = var.SignedWord(0, code)

        # Double each 16-byte line of the buffer and count the iterations
        for lsa in syn_iter(code, buffer_bytes, 16):
            code.add(spu.lqx(current, r_zero, lsa))
            current.v = current + current
            code.add(spu.stqx(current, r_zero, lsa))
            count.v = count + 1

        # Overwrite local store address 0 with the iteration count
        code.add(spu.stqx(count, r_zero, 0))

        # Both temporaries are finished; release count as well as current
        current.release_registers(code)
        count.release_registers(code)

        # Store the values back to main memory under a second tag and wait
        code.add(spu.ai(r_tag, r_zero, put_tag))
        dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)
        dma.mfc_write_tag_mask(code, 1 << put_tag)
        dma.mfc_read_tag_status_all(code)
    # /for ea address

    # Cleanup
    for reg in (r_zero, r_ea_data, r_ls_data, r_size, r_tag):
        code.release_register(reg)

    if not run_code:
        return code

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code and time it
    proc = env.Processor()

    start = time.time()
    proc.execute(code, n_spus = n_spus)
    return time.time() - start
# ---- DMA GET: array a (main memory) -> local store 0x1000 ----

# Set the parameters for a GET command
abi = a.buffer_info()
a_nbytes = a.itemsize * abi[1]

# Local Store address 0x1000
spu.il(r_lsa, 0x1000)
# Main Memory address of array a
load_word(code, r_mma, abi[0])
# Size of array a in bytes
spu.il(r_size, a_nbytes)
# DMA tag 12
spu.il(r_tag, 12)

# Issue a DMA GET command
dma.mfc_get(code, r_lsa, r_mma, r_size, r_tag)

# Wait for completion: r_tag is reused to hold the completion mask,
# here bit 12 for tag 12
spu.il(r_tag, 1 << 12)
dma.mfc_write_tag_mask(code, r_tag)
dma.mfc_read_tag_status_all(code)

# ---- DMA PUT: local store 0x1000 -> array b (main memory) ----

# Set the parameters for a PUT command
bbi = b.buffer_info()
b_nbytes = b.itemsize * bbi[1]

# Local Store address 0x1000
spu.il(r_lsa, 0x1000)
# Main Memory address of array b
load_word(code, r_mma, bbi[0])
# Size of array b in bytes
spu.il(r_size, b_nbytes)
# DMA tag 12 (r_tag held the mask, so it is reloaded)
spu.il(r_tag, 12)

# Issue a DMA PUT command
dma.mfc_put(code, r_lsa, r_mma, r_size, r_tag)

# Wait for completion
# Set the completion mask; here we complete tag 12
# ---- DMA GET: array a (main memory) -> local store 0x1000 ----

# Set the parameters for a GET command
abi = a.buffer_info()
a_nbytes = a.itemsize * abi[1]

# Local Store address 0x1000
spu.il(r_lsa, 0x1000)
# Main Memory address of array a
load_word(code, r_mma, abi[0])
# Size of array a in bytes
spu.il(r_size, a_nbytes)
# DMA tag 12
spu.il(r_tag, 12)

# Issue a DMA GET command
dma.mfc_get(code, r_lsa, r_mma, r_size, r_tag)

# Wait for completion: r_tag is reused to hold the completion mask,
# here bit 12 for tag 12
spu.il(r_tag, 1 << 12)
dma.mfc_write_tag_mask(code, r_tag)
dma.mfc_read_tag_status_all(code)

# ---- DMA PUT: local store 0x1000 -> array b (main memory) ----

# Set the parameters for a PUT command
bbi = b.buffer_info()
b_nbytes = b.itemsize * bbi[1]

# Local Store address 0x1000
spu.il(r_lsa, 0x1000)
# Main Memory address of array b
load_word(code, r_mma, bbi[0])
# Size of array b in bytes
spu.il(r_size, b_nbytes)
# DMA tag 12 (r_tag held the mask, so it is reloaded)
spu.il(r_tag, 12)

# Issue a DMA PUT command
dma.mfc_put(code, r_lsa, r_mma, r_size, r_tag)

# Wait for completion