def popc(self, count, x):
    """
    Accumulate into count the number of 1 bits in each word of x.

    A scratch register is borrowed from the active instruction stream for
    the intermediate values and handed back once the add has been emitted.
    """
    stream = spu.get_active_code()
    scratch = stream.acquire_register()

    spu.cntb(scratch, x)
    spu.sumb(scratch, scratch, 0)
    spu.a(count, count, scratch)

    stream.release_register(scratch)
def synthesize(self, code):
    """
    Emit a population count of the vector in register 5 into code.

    The emitted sequence copies register 5 into a working register, runs the
    inlined popc and reduce_word operations on it and writes the reduced
    result to the outbound mailbox.

    code -- the instruction stream that receives the instructions.  It is
            made the active stream for the duration of the call and the
            previously active stream is restored afterwards.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Reserve two variable registers
    count = code.acquire_register()
    result = code.acquire_register()

    # 'Load' the input vector x from register 5
    x = code.acquire_register()
    spu.ai(x, 5, 0)

    # Zero count and result
    spu.xor(count, count, count)
    spu.xor(result, result, result)

    # Inline the popc and reduce operations
    self.popc(count, x)
    self.reduce_word(result, count)

    # Send the result to the caller
    spu.wrch(result, dma.SPU_WrOutMbox)

    # Hand back every register acquired above.  Previously only x was
    # released, so count and result stayed reserved on the stream.
    code.release_register(x)
    code.release_register(result)
    code.release_register(count)

    spu.set_active_code(old_code)
    return
def init_mm_buffer(self, addr, size, offset=0):
    """
    Write addr, size and offset into slots 0, 1 and 2 of self.mm_buffer.

    The stores are emitted on the currently active instruction stream.
    """
    stream = spu.get_active_code()
    for slot, value in enumerate((addr, size, offset)):
        util.set_slot_value(stream, self.mm_buffer, slot, value)
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code that stores reg into the local-store buffer and calls
    save_ls_buffer() behind a conditional branch.

    reg -- the register whose contents are stored with stqx at
           self.ls_buffer + offset.

    Three scratch registers (offset, size, test) are acquired from the
    active instruction stream and released before returning.

    NOTE(review): the original author flagged this routine as unfinished
    (see the STOPPED HERE marker below); the three mailbox writes look like
    debugging output -- confirm before relying on this code.
    """
    code = spu.get_active_code()

    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and 8 bytes to obtain the working offset
    # and size values (presumably slots 1 and 2 of ls_buffer -- TODO confirm)
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, offset)

    # Advance the offset by one quadword and compare it against the size
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)

    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Emit a placeholder instruction and remember its position; it is
    # overwritten below once the length of the save sequence is known.
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size = size)

    spu.nop(0)
    # Replace the placeholder with a brz on test whose displacement is the
    # number of instructions emitted since the placeholder.
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active = True)

    code.release_registers(regs)
    return
def synthesize(self, code):
    """
    Render a vector with 4 pixels.

    Emits code that transforms self.result into RGBA pixel data, stores it
    with stqd relative to self.x_offset and advances self.x_offset by 16.

    code -- the instruction stream that receives the instructions.

    Raises Exception if x_offset, result or one has not been set.  The
    checks run before the active instruction stream is switched, so a
    failed check no longer leaves code installed as the active stream.
    """
    if self.x_offset is None:
        raise Exception('Please call setup')
    if self.result is None:
        raise Exception('Please set result')
    if self.one is None:
        raise Exception('Please set one')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Make the part of the result positive and subtract 1
    # to transform (-1,-oo) into (0,oo)
    self.result.v = spu.fs.ex(0, self.result)
    self.result.v = spu.fs.ex(self.result, self.one)

    # Convert the result to an unsigned int, scaling by 2^4 to put
    # values between 0 and 16 in the gradient.  Values outside [0,16]
    # are 0 or FF
    self.result.v = spu.cfltu.ex(self.result, 169)  # 173 - 169 == 4

    # Extract the first two bytes from the result into the RGB positions
    # and set alpha to 0xFF
    self.result.v = spu.shufb.ex(self.result, self.ff, self.uint2rgba)

    # Save the result and increment the offset
    spu.stqd(self.result, self.x_offset, self.lsa >> 4)
    spu.ai(self.x_offset, self.x_offset, 16)

    spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Emit code that fills one line with a constant colour in local store and
    puts that line to the first frame buffer 128 times, advancing the
    target address by the stride after each transfer.

    code -- the instruction stream that receives the instructions.

    Raises Exception if buffers or stride has not been set.  The checks run
    before the active instruction stream is switched, so a failed check no
    longer leaves code installed as the active stream.
    """
    if self.buffers is None:
        raise Exception('Please set buffers')
    if self.stride is None:
        raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Draw a square
    color = var.SignedWord(0x0F0F0FFF)
    fb0 = var.Word(self.buffers[0])
    # NOTE(review): fb1 is loaded but never used below; kept so register
    # allocation and the emitted code stay unchanged.
    fb1 = var.Word(self.buffers[1])
    stride = var.Word(self.stride)
    addr = var.Word(0)

    # Draw one line
    line_pixels = 256
    for i in spuiter.syn_iter(code, line_pixels * 4, step = 16):
        spu.stqx(color, addr, i)

    # Transfer the line to the frame buffer
    md_fb = spuiter.memory_desc('I', size = line_pixels)
    md_fb.set_addr_reg(addr.reg)

    addr.v = fb0

    for i in spuiter.syn_iter(code, 128):
        md_fb.put(code, 0)
        addr.v = addr + stride

    spu.set_active_code(old_code)
    return
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit code that stores reg into the local-store buffer and calls
    save_ls_buffer() behind a conditional branch.

    reg -- the register whose contents are stored with stqx at
           self.ls_buffer + offset.

    Three scratch registers (offset, size, test) are acquired from the
    active instruction stream and released before returning.

    NOTE(review): the original author flagged this routine as unfinished
    (see the STOPPED HERE marker below); the three mailbox writes look like
    debugging output -- confirm before relying on this code.
    """
    code = spu.get_active_code()

    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and 8 bytes to obtain the working offset
    # and size values (presumably slots 1 and 2 of ls_buffer -- TODO confirm)
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, offset)

    # Advance the offset by one quadword and compare it against the size
    spu.ai(offset, offset, 16)
    spu.ceq(test, offset, size)

    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Emit a placeholder instruction and remember its position; it is
    # overwritten below once the length of the save sequence is known.
    lbl_ls_full = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=size)

    spu.nop(0)
    # Replace the placeholder with a brz on test whose displacement is the
    # number of instructions emitted since the placeholder.
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active=True)

    code.release_registers(regs)
    return
def init_mm_buffer(self, addr, size, offset = 0):
    """
    Record the main-memory buffer description in self.mm_buffer.

    Slot 0 receives addr, slot 1 receives size and slot 2 receives offset;
    the stores are emitted on the currently active instruction stream.
    """
    active = spu.get_active_code()
    slot_values = (addr, size, offset)
    for index, value in enumerate(slot_values):
        util.set_slot_value(active, self.mm_buffer, index, value)
def synthesize(self, code):
    """
    Emit a streaming population count into code.

    Every vector of every buffer delivered by the stream iterator is fed to
    popc, the accumulated count is reduced into total, and total is written
    to the outbound mailbox.  The previously active instruction stream is
    restored before returning.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    chunks = spuiter.stream_buffer(code, self.stream_addr,
                                   self.stream_size * 4,
                                   self.buffer_size, self.lsa)
    ls_data = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    counter = syn_popc_var()

    # Allocation order is kept as-is: x, count, total
    x = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for chunk_addr in chunks:
        for x in spuiter.spu_vec_iter(code, ls_data, addr_reg = chunk_addr):
            counter.popc(count, x)

    counter.reduce_word(total, count)

    # Report the total through the outbound mailbox
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(previous)
def synthesize(self, code):
    """
    Emit a streaming population count into code.

    Walks the stream buffer by buffer and vector by vector, accumulating
    bit counts with popc; the reduced total is then sent to the outbound
    mailbox.  The active instruction stream is switched to code for the
    duration of the call and switched back at the end.
    """
    saved_stream = spu.get_active_code()
    spu.set_active_code(code)

    byte_count = self.stream_size * 4
    buffers = spuiter.stream_buffer(code, self.stream_addr, byte_count,
                                    self.buffer_size, self.lsa)
    ls_data = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    popcount = syn_popc_var()

    # Allocation order is kept as-is: x, count, total
    x = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for buf_addr in buffers:
        for x in spuiter.spu_vec_iter(code, ls_data, addr_reg=buf_addr):
            popcount.popc(count, x)

    popcount.reduce_word(total, count)

    # Hand the total to the caller via the outbound mailbox
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_stream)
def test(self, cmp, count_var):
    """
    Emit a placeholder stop instruction and remember where it sits.

    The index of the placeholder is kept in self._branch_idx; cmp and
    count_var are kept in self._cmp and self._count.
    """
    stream = spu.get_active_code()

    # The index must be taken before the placeholder is appended
    self._branch_idx = len(stream)
    spu.stop(0xB)
    # spu.nop(0)

    self._cmp, self._count = cmp, count_var
def setup(self, code):
    """
    Create one var.Word per entry of the module-level constants table.

    The variables are created while code is the active instruction stream
    and collected in self.consts under the same keys; the previously active
    stream is restored afterwards.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    table = self.consts = {}
    for name in constants.keys():
        value = constants[name]
        table[name] = var.Word(value)

    spu.set_active_code(previous)
def test(self, cmp, score, x_off, y_off):
    """
    Emit a placeholder stop instruction and remember where it sits.

    The index of the placeholder is kept in self._branch_idx; the four
    arguments are kept in self._cmp, self._score, self._x_off and
    self._y_off.
    """
    stream = spu.get_active_code()

    # The index must be taken before the placeholder is appended
    self._branch_idx = len(stream)
    spu.stop(0xB)
    # spu.nop(0)

    self._cmp, self._score = cmp, score
    self._x_off, self._y_off = x_off, y_off
def synthesize(self, code):
    """
    Emit a write of self.result to the outbound mailbox into code.

    code -- the instruction stream that receives the instruction.

    Raises Exception if result has not been set.  The check runs before the
    active instruction stream is switched, so a failed check no longer
    leaves code installed as the active stream.
    """
    if self.result is None:
        raise Exception('Please set result')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    spu.wrch(self.result, dma.SPU_WrOutMbox)

    spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Emit the nested row/column loops that evaluate self.ly_point for every
    group of four columns and, when a renderer is attached, render each
    result.

    r1 is reloaded at the start of every row and advanced by r1_inc after
    each point; r2 is advanced by r2_inc after each row.  The previously
    active instruction stream is restored before returning.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    self._load_parameters(code)
    log = spu_log.SPULog()
    log.setup(code)

    if self.renderer is not None:
        self.renderer.setup(code)
        self.renderer.set_one(log.consts['ONE'])

    # Allocation order: r1_inc, r2_inc, r1, r2, result, then pattern
    r1_inc, r2_inc, r1, r2, result = [var.SingleFloat() for _ in range(5)]
    pattern = var.Word(0)

    point = self.ly_point
    point.set_pattern_reg(pattern)
    point.set_result_reg(result)
    point.set_r_regs(r1, r2)
    point.set_log(log)
    point.setup(code)

    # Load the parameters from local store addresses 0, 4, 8, 12 and 16
    for target, address in ((r1, 0), (r2, 4), (r1_inc, 8),
                            (r2_inc, 12), (pattern, 16)):
        spu.lqa(target, address)

    for y in spuiter.syn_iter(code, self.h):
        spu.lqa(r1, 0)

        for x in spuiter.syn_iter(code, self.w / 4):
            self.ly_point.synthesize(code)
            r1.v = spu.fa.ex(r1, r1_inc)

            if self.renderer is None:
                continue
            # result.v = spu.fm.ex(r1, r2)
            self.renderer.set_result_reg(result)
            self.renderer.synthesize(code)

        if self.renderer is not None:
            self.renderer.row_complete(code)
        r2.v = spu.fa.ex(r2, r2_inc)

    # return Numeric.where(Numeric.less(results, 0), results, 0)

    spu.set_active_code(previous)
def synthesize_constants(self, code):
    """
    Acquire a register, keep it as self._one and emit the code that fills it.

    The register is zeroed with xor, incremented by 1 and then converted
    with cuflt (scale argument 155).  The previously active instruction
    stream is restored unless it was None.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    one = self._one = code.acquire_register()
    spu.xor(one, one, one)
    spu.ai(one, one, 1)
    spu.cuflt(one, one, 155)

    if previous is None:
        return
    spu.set_active_code(previous)
def _compute_ratio(self, ab, c, result):
    """
    Emit code computing c / (ab + c) into result.

    Both inputs are converted in place with cuflt first, and ab is
    overwritten with the sum ab + c before the division is emitted.
    """
    # Convert ab and c to float
    for operand in (ab, c):
        spu.cuflt(operand, operand, 155)

    # ab now holds the denominator ab + c
    spu.fa(ab, ab, c)

    # result = c / (ab + c)
    stream = spu.get_active_code()
    fdiv(stream, result, c, ab, self._one)
def block(self):
    """
    Emit the save block: pack x, y and score into one vector, store it at
    the current result position and branch back to the loop.

    The placeholder instruction recorded in self._branch_idx is overwritten
    here; at the moment it is replaced by a nop, the conditional branch
    variants are commented out.
    """
    code = spu.get_active_code()
    self._block_idx = len(code)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    code[self._branch_idx] = spu.nop(0, ignore_active=True)
    # code[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # code[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(self._save_value, self._save_value, self._save_value)

    # Copy the score
    spu.selb(self._save_value, self._save_value, self._score, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the y value
    spu.selb(self._save_value, self._save_value, self._y_off, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the x value
    spu.selb(self._save_value, self._save_value, self._x_off, self._word_mask)

    # Save value to local store
    spu.stqx(self._save_value, self._count, self._md_results.r_addr)

    # Advance the result position by one quadword
    self._count.v = self._count.v + 16

    # --> MemorySave test
    cmp = self._save_value  # reuse the save register
    spu.ceq.ex(cmp, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(cmp, self._count)

    # Just reset for now
    # NOTE(review): the third selb operand is the literal 0 -- confirm whether
    # that is meant as register 0 or as a zero value.
    spu.selb(self._count, self._count, 0, cmp)

    # Return to the loop: branch back to the instruction after the placeholder
    idx = len(code)
    spu.br(-(idx - self._branch_idx - 1))

    return
def block(self):
    """
    Patch the recorded placeholder with a brnz on self._cmp that jumps to
    the start of this block, then emit the branch back to the loop.

    The body of the block itself is still to be written (FILL IN HERE).
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)
    branch_at = self._branch_idx

    # --> add the branch instruction
    stream[branch_at] = spu.nop(0, ignore_active = True)
    distance = self._block_idx - branch_at
    stream[branch_at] = spu.brnz(self._cmp, distance, ignore_active = True)

    # FILL IN HERE

    # Return to the loop, i.e. to the instruction after the patched branch
    here = len(stream)
    spu.br(-(here - branch_at - 1))
def block(self):
    """
    Emit the save block: pack x, y and score into one vector, store it at
    the current result position and branch back to the loop.

    The placeholder instruction recorded in self._branch_idx is overwritten
    here; at the moment it is replaced by a nop, the conditional branch
    variants are commented out.
    """
    code = spu.get_active_code()
    self._block_idx = len(code)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    code[self._branch_idx] = spu.nop(0, ignore_active = True)
    # code[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # code[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(self._save_value, self._save_value, self._save_value)

    # Copy the score
    spu.selb(self._save_value, self._save_value, self._score, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the y value
    spu.selb(self._save_value, self._save_value, self._y_off, self._word_mask)
    spu.rotqbyi(self._save_value, self._save_value, 12)

    # Copy the x value
    spu.selb(self._save_value, self._save_value, self._x_off, self._word_mask)

    # Save value to local store
    spu.stqx(self._save_value, self._count, self._md_results.r_addr)

    # Advance the result position by one quadword
    self._count.v = self._count.v + 16

    # --> MemorySave test
    cmp = self._save_value  # reuse the save register
    spu.ceq.ex(cmp, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(cmp, self._count)

    # Just reset for now
    # NOTE(review): the third selb operand is the literal 0 -- confirm whether
    # that is meant as register 0 or as a zero value.
    spu.selb(self._count, self._count, 0, cmp)

    # Return to the loop: branch back to the instruction after the placeholder
    idx = len(code)
    spu.br(- (idx - self._branch_idx - 1))

    return
def synthesize(self, code): if self._x_regs is None: raise Exception("Please set x_regs") if self._y_regs is None: raise Exception("Please set y_regs") if self._result is None: raise Exception("Please set result register") old_code = spu.get_active_code() spu.set_active_code(code) regs = [] if self._one is None: self.synthesize_constants(code) regs.append(self._one) ab = code.acquire_register() c = code.acquire_register() ab_temp = code.acquire_register() c_temp = code.acquire_register() result = code.acquire_register() regs = regs + [ab, c, ab_temp, c_temp] nregs = self._n_bits / 128 for i in range(nregs): # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp) # self._c( self._x_regs[i], self._y_regs[i], c, c_temp) self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp, c_temp) self._reduce_word(ab, ab_temp) self._reduce_word(c, c_temp) self._compute_ratio(ab_temp, c_temp, result) print '%d registers,' % (len(regs) + len(self._x_regs) + len(self._y_regs)), code.release_registers(regs) if old_code is not None: spu.set_active_code(old_code) return
def synthesize(self, code):
    """
    Emit a population count of the vector in register 5 into code, using
    variables instead of raw registers.

    The reduced result is written to the outbound mailbox; the previously
    active instruction stream is restored before returning.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    # Create and initialize the variables (count, result, x -- in that order)
    count, result, x = var.Word(0), var.Word(0), var.Word(0)

    # 'Load' the input vector x from register 5
    x.v = spu.ai.ex(5, 0)

    # Inline the popc and reduce operations
    self.popc(count, x)
    self.reduce_word(result, count)

    # Send the result to the caller
    spu.wrch(result, dma.SPU_WrOutMbox)

    spu.set_active_code(previous)
def synthesize(self, code): if self._x_regs is None: raise Exception("Please set x_regs") if self._y_regs is None: raise Exception("Please set y_regs") if self._result is None: raise Exception("Please set result register") old_code = spu.get_active_code() spu.set_active_code(code) regs = [] if self._one is None: self.synthesize_constants(code) regs.append(self._one) ab = code.acquire_register() c = code.acquire_register() ab_temp = code.acquire_register() c_temp = code.acquire_register() result = code.acquire_register() regs = regs + [ab, c, ab_temp, c_temp] nregs = self._n_bits / 128 for i in range(nregs): # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp) # self._c( self._x_regs[i], self._y_regs[i], c, c_temp) self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp, c_temp) self._reduce_word(ab, ab_temp) self._reduce_word( c, c_temp) self._compute_ratio(ab_temp, c_temp, result) print '%d registers,' % (len(regs) + len(self._x_regs) + len(self._y_regs)), code.release_registers(regs) if old_code is not None: spu.set_active_code(old_code) return
def save_ls_buffer(self, ls_size=None, branch=False):
    """
    Emit code that puts the local-store buffer to main memory and then
    updates the bookkeeping slots of both buffers.

    ls_size -- register holding the transfer size; when None a fresh
               register is acquired for it.
    branch  -- accepted but not used by this method.

    Every register acquired here is released before returning.
    """
    stream = spu.get_active_code()
    borrowed = []

    def borrow():
        # Acquire a register and remember it for the final release
        reg = stream.acquire_register()
        borrowed.append(reg)
        return reg

    if ls_size is None:
        ls_size = borrow()

    # Set the main memory address
    mm_offset = borrow()
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, self.mm_buffer)

    # Transfer the buffer
    desc = spuiter.memory_desc('b')
    desc.set_size_reg(ls_size)
    desc.set_addr_reg(mm_offset)
    desc.put(stream, self.ls_buffer)

    # Increment the main memory offset
    mm_size = borrow()
    spu.rotqbyi(mm_size, self.mm_buffer, 8)
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, mm_size)
    util.set_slot_value(stream, self.mm_buffer, 2, mm_offset)

    # Reset the ls offset
    util.set_slot_value(stream, self.ls_buffer, 2, 0)

    stream.release_registers(borrowed)
def save_ls_buffer(self, ls_size = None, branch = False):
    """
    Emit code that puts the local-store buffer to main memory and then
    updates the bookkeeping slots of both buffers.

    ls_size -- register holding the transfer size; when None a fresh
               register is acquired for it.
    branch  -- accepted but not used by this method.

    Every register acquired here is released before returning.
    """
    active = spu.get_active_code()
    scratch = []

    def take_register():
        # Acquire a register and remember it for the final release
        reg = active.acquire_register()
        scratch.append(reg)
        return reg

    if ls_size is None:
        ls_size = take_register()

    # Set the main memory address
    mm_offset = take_register()
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, self.mm_buffer)

    # Transfer the buffer
    transfer = spuiter.memory_desc('b')
    transfer.set_size_reg(ls_size)
    transfer.set_addr_reg(mm_offset)
    transfer.put(active, self.ls_buffer)

    # Increment the main memory offset
    mm_size = take_register()
    spu.rotqbyi(mm_size, self.mm_buffer, 8)
    spu.rotqbyi(mm_offset, self.mm_buffer, 4)
    spu.a(mm_offset, mm_offset, mm_size)
    util.set_slot_value(active, self.mm_buffer, 2, mm_offset)

    # Reset the ls offset
    util.set_slot_value(active, self.ls_buffer, 2, 0)

    active.release_registers(scratch)
def _transfer_data(self, code, kernel, lsa, tag): """ Load the data into the SPU memory """ # Check the types if not isinstance(code, spe.InstructionStream): raise Exception('Code must be an InstructionStream') if not (isinstance(lsa, int) or issubclass(type(lsa), (spe.Register, spe.Variable))): raise Exception('lsa must be an integer, Register, or Variable') old_code = spu.get_active_code() spu.set_active_code(code) # Acquire registers for address and size, if they were not supplied by the user if self.r_addr is None: r_ea_data = code.prgm.acquire_register() else: r_ea_data = self.r_addr if self.r_size is None: r_size = code.prgm.acquire_register() else: r_size = self.r_size # Create variables ea_addr = var.SignedWord(reg = r_ea_data) aligned_size = var.SignedWord(0) mod_16 = var.SignedWord(0xF) # Initialize the lsa_addr variable. if isinstance(lsa, int): # From a constant ls_addr = var.SignedWord(lsa) elif issubclass(type(lsa), (spe.Register, spe.Variable)): # From a variable ls_addr = var.SignedWord() ls_addr.v = lsa tag_var = var.SignedWord(tag) cmp = var.SignedWord(0) # Load the effective address if self.r_addr is None: if self.addr % 16 != 0: print '[get_memory] Misaligned data' util.load_word(code, ea_addr, self.addr) # Load the size, rounding up as required to be 16-byte aligned if self.r_size is None: rnd_size = self.size * var.INT_SIZES[self.typecode] if rnd_size < 16: rnd_size = 16 elif (rnd_size % 16) != 0: rnd_size += (16 - (rnd_size % 16)) util.load_word(code, aligned_size, rnd_size) else: # TODO: !!! UNIT TEST THIS !!! 
# Same as above, but using SPU arithemtic to round size = var.SignedWord(reg = r_size) sixteen = var.SignedWord(16) cmp.v = ((size & mod_16) == size) aligned_size.v = size + (sixteen - (size & mod_16)) spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp.reg, order = _mi(spu.selb)) code.release_register(sixteen.reg) # Use an auxillary register for the moving ea value if the # caller supplied the address register if self.r_addr is not None: ea_load = var.SignedWord(0) ea_load.v = ea_addr else: ea_load = ea_addr # note that this is reference, not .v assignment # Transfer parameters buffer_size = var.SignedWord(16384) remaining = var.SignedWord(0) transfer_size = var.SignedWord(0) remaining.v = aligned_size # Set up the iterators to transfer at most 16k at a time xfer_iter = syn_iter(code, 0, 16384) xfer_iter.set_stop_reg(aligned_size.reg) for offset in xfer_iter: cmp.v = buffer_size > remaining spu.selb(transfer_size, buffer_size, remaining, cmp) # Transfer the data kernel(code, ls_addr, ea_load, transfer_size, tag_var) ls_addr.v = ls_addr + buffer_size ea_load.v = ea_load + buffer_size remaining.v = remaining - buffer_size # Set the tag bit to tag dma.mfc_write_tag_mask(code, 1<<tag); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # Release the registers code.release_register(buffer_size.reg) code.release_register(remaining.reg) code.release_register(aligned_size.reg) code.release_register(transfer_size.reg) code.release_register(cmp.reg) code.release_register(ls_addr.reg) code.release_register(tag_var.reg) code.release_register(ea_load.reg) if old_code is not None: spu.set_active_code(old_code) return
def synthesize(self, code):
    """
    Emit a polynomial logarithm approximation of self.x into self.result.

    The emitted code splits x into exponent and mantissa, normalizes the
    mantissa around SQRTHF, evaluates the polynomial with the constants in
    self.consts and finally adds the exponent back in.

    code -- the instruction stream that receives the instructions.

    Raises Exception if x or result has not been set.  The checks run
    before the active instruction stream is switched, so a failed check no
    longer leaves code installed as the active stream.
    """
    if self.x is None:
        raise Exception("Please set x")
    if self.result is None:
        raise Exception("Please set result")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # exponent
    e = var.Word()

    # Working values
    x = var.Word()
    y = var.Word()
    z = var.Word()

    cmp = var.Bits()
    tmp = var.Word()

    spu.xor(cmp, cmp, cmp)
    spu.xor(tmp, tmp, tmp)

    # Set the working x
    x.v = self.x

    # Extract the exponent
    #   int e = (((*(unsigned int *) &x) >> 23) & 0xff) - 0x7e;
    e.v = x >> self.consts['_23']
    e.v = spu.andi.ex(e, 0xff)
    e.v = spu.ai.ex(e, 0x382)  # 0x382 == (- 0x7E) using 10 bits
    # 0b 111 1110

    # Extract the mantissa
    x.v = x & self.consts['M1']  # *(unsigned int*)&x &= 0x807fffff;
    x.v = x | self.consts['M2']  # *(unsigned int*)&x |= 0x3f000000;

    # Normalize
    x1, x2, e1 = y, z, tmp

    # if (x < SQRTHF)
    cmp.v = spu.fcgt.ex(self.consts['SQRTHF'], x)

    # (True) { ... }
    e1.v = spu.ai.ex(e, -1)                    # e -= 1;
    x1.v = spu.fa.ex(x, x)                     # x = x + x - 1.0;
    x1.v = spu.fs.ex(x1, self.consts['ONE'])   # "" ""

    # (False) { ... }
    x2.v = spu.fs.ex(x, self.consts['ONE'])    # x = x - 1.0;

    # Select the True/False values based on cmp
    e.v = spu.selb.ex(e, e1, cmp)
    x.v = spu.selb.ex(x2, x1, cmp)

    # Compute polynomial
    z.v = spu.fm.ex(x, x)                      # z = x * x;

    y.v = spu.fms.ex(self.consts['C1'], x,     # y = (((((((( 7.0376836292E-2 * x
                     self.consts['C2'])        # - 1.1514610310E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C3'])  # + 1.1676998740E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C4'])  # - 1.2420140846E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C5'])  # + 1.4249322787E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C6'])  # - 1.6668057665E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C7'])  # + 2.0000714765E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C8'])  # - 2.4999993993E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C9'])  # + 3.3333331174E-1)
    y.v = spu.fm.ex(y, x)                      # * x
    y.v = spu.fm.ex(y, z)                      # * z;

    y.v = spu.fma.ex(self.consts['C10'], z, y)  # y += -0.5 * z;

    # Convert to log base 2
    z.v = spu.fm.ex(y, self.consts['LOG2EA'])      # z = y * LOG2EA;
    z.v = spu.fma.ex(x, self.consts['LOG2EA'], z)  # z += x * LOG2EA;
    z.v = spu.fa.ex(z, y)                          # z += y;
    z.v = spu.fa.ex(z, x)                          # z += x;
    e.v = spu.csflt.ex(e, 155)                     # z += (float) e;
    z.v = spu.fa.ex(z, e)                          # "" ""

    spu.ai(self.result, z, 0)  # return z

    spu.set_active_code(old_code)
    return
def isched(scode):
    """
    Build a rescheduled copy of the instruction stream scode.

    The stream is split into blocks by isched_gen_blocks(); within each
    block instructions are picked one at a time from the set of ready
    instructions (block.start) using heurcompare_block(), which is given
    the computed stall time, the current pipe, the critical-path data and
    the block list.  Block labels and branches are emitted before and after
    each block's instructions.

    scode -- the instruction stream to schedule; the result is obtained
             from scode.prgm.get_stream().

    Returns the newly built stream.  The active instruction stream is set
    to None while scheduling and restored afterwards.
    """
    old_active_code = spu.get_active_code()
    spu.set_active_code(None)

    # Generate the instruction dependence DAG(s)
    blocks = isched_gen_blocks(scode)

    # For each instruction, compute the max cycles to the end of the code
    g_critpath = critpath_block(blocks)

    # Apply heuristics to build an optimized InstructionStream
    fcode = scode.prgm.get_stream()
    inst_cycle = {}  # For each inst, the cycle number it has in the code
    lastpos = -1     # Index of last instruction in the stream (excludes labels!)
    pipe = 0         # Current pipeline (0 = even, 1 = odd)
    cycle = 0        # Current cycle number

    for (ind, block) in enumerate(blocks):
        if block.label is not None:
            fcode.add(block.label)

        start = block.start
        g_in = block.g_in
        g_incnt = block.g_incnt
        g_out = block.g_out

        while len(start) > 0:
            # Apply heuristics to find the best instruction in the queue.

            # For each inst in start, compute the minimum stall time
            # TODO - cache this instead of computing each time?
            #  Do this by computing the stall time when an inst is added to start.
            #  Each time the cycle number is moved forward, reduce the stall time
            #  by that number of cycles for each inst in start.
            # TODO - idea from I think Muchnick -- keep a start Q of no-stall nodes,
            #  and a Q of nodes that would stall.  Then just pull from no-stall nodes
            #  unless empty, in which case fall back to the stall Q
            #  would make it easy(er) to do cached stall counts
            best = (None, 999)
            for s in start:
                # Find the stall time of s, or maximum delay for all its deps
                maxstall = 0
                for d in g_in[s]:
                    # Entries without a producing instruction contribute no stall
                    if d[0] == None:
                        continue

                    # Compute stall time for this dep
                    stall = d[1] - (cycle - inst_cycle[d[0]])
                    if stall > maxstall:
                        maxstall = stall

                best = heurcompare_block(best, (s, maxstall), pipe, g_critpath, blocks, ind)

            inst = best[0]
            start.remove(inst)

            # Account for the stall plus the issue cycle of the chosen inst
            cycle += best[1] + 1
            block.inst_cnt -= 1

            fcode.add(inst)

            # Dual issue? if so, adjust the cycle back one.
            # Careful, lastpos starts out as -1.  However the pipe also starts out
            # as 0, so the first part of the conditional will fail before lastpos
            # is used.
            # Ah - if a label occurs first in the stream, followed by say an ai,
            # this will fail
            previnst = fcode[lastpos]
            if (pipe == inst.cycles[0] == 1 and previnst.cycles[0] == 0 and
                    inst_cycle[previnst] == cycle - 1):
                cycle -= 1

            inst_cycle[inst] = cycle
            lastpos = len(fcode) - 1
            # Alternate between the two pipes
            pipe = (pipe + 1) & 1

            # Evaluate all the instructions that depend on this inst.
            # Can any be added to start?
            for d in g_out[inst]:
                # Skip this d if it's not in the current block
                if d not in g_incnt:
                    continue

                g_incnt[d] -= 1
                if g_incnt[d] == 0:
                    start.append(d)
                # Does d depend on any insts in start?
                # If so, move those to the front of start
                # Why does this still matter?  It affects ties in the heuristic..
                else:
                    # d depends on inst, and at least 1 other inst not in start.
                    # look at the insts d depends on.  if any are in start, move
                    # them to the front of the start set.
                    # how does this help?  helps insts closer to getting into start,
                    # get in sooner.  get a larger start set for choosing best inst
                    for e in g_in[d]:
                        if e[0] in start:
                            start.remove(e[0])
                            start.insert(0, e[0])
        # end while len(start) > 0

        if block.branch is not None:
            fcode.add(block.branch)
    # end for block in blocks

    spu.set_active_code(old_active_code)
    return fcode
def synthesize(self, code):
    """
    Emit the Lyapunov exponent computation for one point into code.

    A first loop of self.max_init iterations advances x with the logistic
    map; a second loop of self.max_n iterations keeps advancing x and
    accumulates the logarithm of |r - 2*r*x| in total.  The total divided
    by fmax is written to self.result.

    code -- the instruction stream that receives the instructions.

    The previously active instruction stream is restored on exit; earlier
    this method re-installed code instead, leaving the caller's active
    stream replaced.
    """
    self._check_inputs()
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    zero = var.Word(reg = code.r_zero)
    one = self.log.consts['ONE']
    two = self.consts['TWO']

    x = var.Word(self.x0)
    r = var.Word(0)
    cmp = var.Word(0)
    x_neg = var.Word(0)
    # NOTE(review): the divisor is built from max_init although the reference
    # formula quoted below divides by max_n -- confirm which is intended.
    fmax = var.Word(self.max_init)
    temp = var.SingleFloat()

    fmax.v = spu.cuflt.ex(fmax, 155)

    # Init
    for i in spuiter.syn_iter(code, self.max_init):
        # x = r[i % r_max] * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

    # if x == float('-infinity'):
    #   return -10.0

    # Derive Exponent
    total = var.Word(0)
    logx = var.SingleFloat()

    for i in spuiter.syn_iter(code, self.max_n):
        # x = ri * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

        # logx = ri - 2.0 * ri * x
        logx.v = spu.fm.ex(two, x)
        logx.v = spu.fm.ex(r, logx)
        logx.v = spu.fs.ex(r, logx)

        # abs(logx)
        x_neg.v = spu.fs.ex(zero, logx)
        cmp.v = spu.fcgt.ex(logx, zero)
        logx.v = spu.selb.ex(x_neg, logx, cmp)

        # log(logx)
        self.log.set_result(logx)
        self.log.set_x(logx)
        self.log.synthesize(code)

        # total = total + x
        total.v = spu.fa.ex(total, logx)

    # return total / float(max_n)
    fdiv(code, self.result, total, fmax, one)

    # Restore the stream that was active when we were called
    spu.set_active_code(old_code)
    return
def _get_active_code(self):
    """Return the instruction stream that spu currently reports as active."""
    active = spu.get_active_code()
    return active
def synthesize(self, code):
    """
    Emit the all-pairs Tanimoto comparison loops into code.

    The outer loop runs self._m times over the x vectors and the inner loop
    self._n times over the y vectors; for every pair the Tanimoto kernel is
    synthesized and, when a save op is configured, its test() is called
    with the threshold comparison, the result and both loop variables.

    code -- the instruction stream that receives the instructions.

    Raises Exception if x_addr, y_addr, n_bits, m or n has not been set.
    The checks run before the active instruction stream is switched, so a
    failed check no longer leaves code installed as the active stream.
    """
    # Sanity checks
    if self._x_addr is None:
        raise Exception("Please set x_addr")
    if self._y_addr is None:
        raise Exception("Please set y_addr")
    if self._n_bits is None:
        raise Exception("Please set n_bits")
    if self._m is None:
        raise Exception("Please set m")
    if self._n is None:
        raise Exception("Please set n")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for the bit vectors and result
    n_vecs = self._n_bits / 128
    x_regs = [code.acquire_register() for i in range(n_vecs)]
    y_regs = [code.acquire_register() for i in range(n_vecs)]
    result = code.acquire_register()

    x_addr = var.Word()
    y_addr = var.Word()

    # The threshold and comparison variables are only needed with a save op
    if self._save_op is not None:
        if self._threshold is not None:
            threshold = var.SingleFloat(self._threshold)
        else:
            threshold = var.SingleFloat(0.0)
        bcmp = var.Word(0)

    # Setup the Tanimoto kernel
    tan = Tanimoto()
    tan.set_n_bits(self._n_bits)
    tan.set_x_regs(x_regs)
    tan.set_y_regs(y_regs)
    tan.set_result(result)
    tan.synthesize_constants(code)

    # Setup the save op
    save_op = self._save_op
    if save_op is not None:
        save_op.setup()

    # Create the iterators
    xiter = spuiter.syn_iter(code, self._m)
    yiter = spuiter.syn_iter(code, self._n)

    # Synthesize the block comparison loops
    x_addr.v = self._x_addr

    for x_off in xiter:
        # NOTE(review): both addresses are advanced before the first load,
        # so the vector at the initial address is not read -- confirm.
        x_addr.v = x_addr + 16 * n_vecs
        y_addr.v = self._y_addr

        self._load_bit_vector(x_addr, x_regs)

        for y_off in yiter:
            y_addr.v = y_addr + 16 * n_vecs

            self._load_bit_vector(y_addr, y_regs)
            tan.synthesize(code)

            if save_op is not None:
                spu.fcgt(bcmp, result, threshold)
                save_op.test(bcmp, result, x_off, y_off)

    # /x_off

    if old_code is not None:
        spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Emit a polynomial logarithm approximation of self.x into self.result.

    The emitted code splits x into exponent and mantissa, normalizes the
    mantissa around SQRTHF, evaluates the polynomial with the constants in
    self.consts and finally adds the exponent back in.

    code -- the instruction stream that receives the instructions.

    Raises Exception if x or result has not been set.  The checks run
    before the active instruction stream is switched, so a failed check no
    longer leaves code installed as the active stream.
    """
    if self.x is None:
        raise Exception("Please set x")
    if self.result is None:
        raise Exception("Please set result")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # exponent
    e = var.Word()

    # Working values
    x = var.Word()
    y = var.Word()
    z = var.Word()

    cmp = var.Bits()
    tmp = var.Word()

    spu.xor(cmp, cmp, cmp)
    spu.xor(tmp, tmp, tmp)

    # Set the working x
    x.v = self.x

    # Extract the exponent
    #   int e = (((*(unsigned int *) &x) >> 23) & 0xff) - 0x7e;
    e.v = x >> self.consts['_23']
    e.v = spu.andi.ex(e, 0xff)
    e.v = spu.ai.ex(e, 0x382)  # 0x382 == (- 0x7E) using 10 bits
    # 0b 111 1110

    # Extract the mantissa
    x.v = x & self.consts['M1']  # *(unsigned int*)&x &= 0x807fffff;
    x.v = x | self.consts['M2']  # *(unsigned int*)&x |= 0x3f000000;

    # Normalize
    x1, x2, e1 = y, z, tmp

    # if (x < SQRTHF)
    cmp.v = spu.fcgt.ex(self.consts['SQRTHF'], x)

    # (True) { ... }
    e1.v = spu.ai.ex(e, -1)                    # e -= 1;
    x1.v = spu.fa.ex(x, x)                     # x = x + x - 1.0;
    x1.v = spu.fs.ex(x1, self.consts['ONE'])   # "" ""

    # (False) { ... }
    x2.v = spu.fs.ex(x, self.consts['ONE'])    # x = x - 1.0;

    # Select the True/False values based on cmp
    e.v = spu.selb.ex(e, e1, cmp)
    x.v = spu.selb.ex(x2, x1, cmp)

    # Compute polynomial
    z.v = spu.fm.ex(x, x)                      # z = x * x;

    y.v = spu.fms.ex(self.consts['C1'], x,     # y = (((((((( 7.0376836292E-2 * x
                     self.consts['C2'])        # - 1.1514610310E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C3'])  # + 1.1676998740E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C4'])  # - 1.2420140846E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C5'])  # + 1.4249322787E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C6'])  # - 1.6668057665E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C7'])  # + 2.0000714765E-1) * x
    y.v = spu.fms.ex(y, x, self.consts['C8'])  # - 2.4999993993E-1) * x
    y.v = spu.fma.ex(y, x, self.consts['C9'])  # + 3.3333331174E-1)
    y.v = spu.fm.ex(y, x)                      # * x
    y.v = spu.fm.ex(y, z)                      # * z;

    y.v = spu.fma.ex(self.consts['C10'], z, y)  # y += -0.5 * z;

    # Convert to log base 2
    z.v = spu.fm.ex(y, self.consts['LOG2EA'])      # z = y * LOG2EA;
    z.v = spu.fma.ex(x, self.consts['LOG2EA'], z)  # z += x * LOG2EA;
    z.v = spu.fa.ex(z, y)                          # z += y;
    z.v = spu.fa.ex(z, x)                          # z += x;
    e.v = spu.csflt.ex(e, 155)                     # z += (float) e;
    z.v = spu.fa.ex(z, e)                          # "" ""

    spu.ai(self.result, z, 0)  # return z

    spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Emit the Lyapunov exponent computation for one point into code.

    A first loop of self.max_init iterations advances x with the logistic
    map; a second loop of self.max_n iterations keeps advancing x and
    accumulates the logarithm of |r - 2*r*x| in total.  The total divided
    by fmax is written to self.result.

    code -- the instruction stream that receives the instructions.

    The previously active instruction stream is restored on exit; earlier
    this method re-installed code instead, leaving the caller's active
    stream replaced.
    """
    self._check_inputs()
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    zero = var.Word(reg=code.r_zero)
    one = self.log.consts['ONE']
    two = self.consts['TWO']

    x = var.Word(self.x0)
    r = var.Word(0)
    cmp = var.Word(0)
    x_neg = var.Word(0)
    # NOTE(review): the divisor is built from max_init although the reference
    # formula quoted below divides by max_n -- confirm which is intended.
    fmax = var.Word(self.max_init)
    temp = var.SingleFloat()

    fmax.v = spu.cuflt.ex(fmax, 155)

    # Init
    for i in spuiter.syn_iter(code, self.max_init):
        # x = r[i % r_max] * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

    # if x == float('-infinity'):
    #   return -10.0

    # Derive Exponent
    total = var.Word(0)
    logx = var.SingleFloat()

    for i in spuiter.syn_iter(code, self.max_n):
        # x = ri * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

        # logx = ri - 2.0 * ri * x
        logx.v = spu.fm.ex(two, x)
        logx.v = spu.fm.ex(r, logx)
        logx.v = spu.fs.ex(r, logx)

        # abs(logx)
        x_neg.v = spu.fs.ex(zero, logx)
        cmp.v = spu.fcgt.ex(logx, zero)
        logx.v = spu.selb.ex(x_neg, logx, cmp)

        # log(logx)
        self.log.set_result(logx)
        self.log.set_x(logx)
        self.log.synthesize(code)

        # total = total + x
        total.v = spu.fa.ex(total, logx)

    # return total / float(max_n)
    fdiv(code, self.result, total, fmax, one)

    # Restore the stream that was active when we were called
    spu.set_active_code(old_code)
    return