def synthesize(self, code):
    """
    Synthesize SPU code that stores a 256-pixel line of a constant color
    and then issues 128 memory-descriptor puts, advancing the target
    address by self.stride after each one, starting at self.buffers[0].

    The configuration is validated before the active instruction stream
    is switched, and the previously active stream is restored even when
    synthesis fails part way through.

    Parameters:
      code -- instruction stream that receives the generated instructions.

    Raises:
      Exception -- if self.buffers or self.stride has not been set.
    """
    # Validate first so a configuration error cannot leave the global
    # active code pointing at `code`.
    if self.buffers is None:
        raise Exception('Please set buffers')
    if self.stride is None:
        raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    try:
        # Draw a square
        color = var.SignedWord(0x0F0F0FFF)
        fb0 = var.Word(self.buffers[0])
        # fb1 is not referenced below; the variable is still created so the
        # generated code and register usage stay exactly as before.
        fb1 = var.Word(self.buffers[1])
        stride = var.Word(self.stride)
        addr = var.Word(0)

        # Draw one line: 4 bytes per pixel, written 16 bytes per store
        line_pixels = 256
        for i in spuiter.syn_iter(code, line_pixels * 4, step=16):
            spu.stqx(color, addr, i)

        # Transfer the line to the frame buffer
        md_fb = spuiter.memory_desc('I', size=line_pixels)
        md_fb.set_addr_reg(addr.reg)

        addr.v = fb0

        # One put per row; addr (the descriptor's address register) moves
        # down one stride each time.
        for i in spuiter.syn_iter(code, 128):
            md_fb.put(code, 0)
            addr.v = addr + stride
    finally:
        spu.set_active_code(old_code)

    return
def synthesize(self, code):
    """
    Emit the SPU code that runs self.ly_point once for every step of a
    self.h by (self.w / 4) grid, stepping r1 along a row and r2 between
    rows, and optionally hands each result to self.renderer.

    Parameters:
      code -- instruction stream that receives the generated instructions.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    self._load_parameters(code)

    spu_logger = spu_log.SPULog()
    spu_logger.setup(code)

    renderer = self.renderer
    has_renderer = renderer is not None
    if has_renderer:
        renderer.setup(code)
        renderer.set_one(spu_logger.consts['ONE'])

    # Registers: the two per-step increments, the current (r1, r2) pair,
    # the kernel output and the pattern word.
    r1_step = var.SingleFloat()
    r2_step = var.SingleFloat()
    r1 = var.SingleFloat()
    r2 = var.SingleFloat()
    result = var.SingleFloat()
    pattern = var.Word(0)

    point = self.ly_point
    point.set_pattern_reg(pattern)
    point.set_result_reg(result)
    point.set_r_regs(r1, r2)
    point.set_log(spu_logger)
    point.setup(code)

    # Initial values are loaded with lqa from addresses 0, 4, 8, 12 and 16
    spu.lqa(r1, 0)
    spu.lqa(r2, 4)
    spu.lqa(r1_step, 8)
    spu.lqa(r2_step, 12)
    spu.lqa(pattern, 16)

    for y in spuiter.syn_iter(code, self.h):
        # Start every row from the stored r1 value again
        spu.lqa(r1, 0)

        for x in spuiter.syn_iter(code, self.w / 4):
            point.synthesize(code)
            r1.v = spu.fa.ex(r1, r1_step)

            if has_renderer:
                # result.v = spu.fm.ex(r1, r2)
                renderer.set_result_reg(result)
                renderer.synthesize(code)

        if has_renderer:
            renderer.row_complete(code)
        r2.v = spu.fa.ex(r2, r2_step)

    # return Numeric.where(Numeric.less(results, 0), results, 0)
    spu.set_active_code(previous)
    return
def DoubleBufferExample(n_spus=6): """ stream_buffer is an iterator that streams data from main memory to SPU local store in blocked buffers. The buffers can be managed using single or double buffering semantics. The induction variable returned by the buffer returns the address of the current buffer. Note: stream_buffer was designed before memory descriptors and has not been updated to support them yet. The interface will change slightly when the memory classes are finalized. """ n = 30000 buffer_size = 16 # Create an array and align the data a = array.array('I', range(n)) addr = a.buffer_info()[0] n_bytes = n * 4 if n_spus > 1: code = ParallelInstructionStream() else: code = InstructionStream() current = SignedWord(0, code) two = SignedWord(2, code) # Create the stream buffer, parallelizing it if using more than 1 SPU stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save=True) if n_spus > 1: stream = parallel(stream) # Loop over the buffers for buffer in stream: # Create an iterators that computes the address offsets within the # buffer. Note: this will be supported by var/vec iters soon. for lsa in syn_iter(code, buffer_size, 16): code.add(spu.lqx(current, lsa, buffer)) current.v = current - two code.add(spu.stqx(current, lsa, buffer)) # Run the synthetic program and copy the results back to the array proc = Processor() r = proc.execute(code, n_spus=n_spus) for i in range(2, len(a)): try: assert (a[i] == i - 2) except: print 'DoubleBuffer error:', a[i], i - 2 return
def DoubleBufferExample(n_spus = 6): """ stream_buffer is an iterator that streams data from main memory to SPU local store in blocked buffers. The buffers can be managed using single or double buffering semantics. The induction variable returned by the buffer returns the address of the current buffer. Note: stream_buffer was designed before memory descriptors and has not been updated to support them yet. The interface will change slightly when the memory classes are finalized. """ n = 30000 buffer_size = 16 # Create an array and align the data a = extarray.extarray('I', range(n)) addr = a.buffer_info()[0] n_bytes = n * 4 if n_spus > 1: code = env.ParallelInstructionStream() else: code = env.InstructionStream() current = SignedWord(0, code) two = SignedWord(2, code) # Create the stream buffer, parallelizing it if using more than 1 SPU stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True) if n_spus > 1: stream = parallel(stream) # Loop over the buffers for buffer in stream: # Create an iterators that computes the address offsets within the # buffer. Note: this will be supported by var/vec iters soon. for lsa in syn_iter(code, buffer_size, 16): code.add(spu.lqx(current, lsa, buffer)) current.v = current - two code.add(spu.stqx(current, lsa, buffer)) # Run the synthetic program and copy the results back to the array proc = env.Processor() r = proc.execute(code, n_spus = n_spus) for i in range(2, len(a)): try: assert(a[i] == i - 2) except: print 'DoubleBuffer error:', a[i], i - 2 return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode='async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode = 'async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def TestSaveBuffer1(): import array code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) n = 2**14 data = array.array('I', range(n)) #data = synspu.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) save_buffer = SaveBuffer() save_buffer.setup() save_buffer.init_ls_buffer(0, 128) save_buffer.init_mm_buffer(data.buffer_info()[0], n) value = var.SignedWord(0xCAFEBABE) for i in spuiter.syn_iter(code, n / 4): save_buffer.save_register(value) code.print_code() spe_id = proc.execute(code, mode='async') for i in range(n / 4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) #data.copy_from(data_array.buffer_info()[0], len(data_array)) print data[:10] return
def TestSaveBuffer1(): import array code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) n = 2**14 data = array.array('I', range(n)) #data = synspu.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) save_buffer = SaveBuffer() save_buffer.setup() save_buffer.init_ls_buffer(0, 128) save_buffer.init_mm_buffer(data.buffer_info()[0], n) value = var.SignedWord(0xCAFEBABE) for i in spuiter.syn_iter(code, n / 4): save_buffer.save_register(value) code.print_code() spe_id = proc.execute(code, mode='async') for i in range(n/4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) #data.copy_from(data_array.buffer_info()[0], len(data_array)) print data[:10] return
def TestTanimotoBlock(n_vecs = 4):
    """
    Synthesize a TanimotoBlock comparison benchmark, run it on one SPU
    and print timing / throughput figures.

    The block compares m = 128 by n = 64 bit vectors of 128 * n_vecs bits
    each; tb.synthesize() is emitted inside a synthetic loop of
    n_samples = 10000 iterations.  Results go through a LocalSave
    operation that is chained to a MemorySave operation.

    Parameters:
      n_vecs -- number of 128-bit vectors per bit vector (n_bits is
                128 * n_vecs).
    """
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    tb = TanimotoBlock()
    ls_save = LocalSave()
    mm_save = MemorySave()

    # NOTE(review): set_debug(True) was already called above; this second
    # call looks redundant.
    code.set_debug(True)

    # Input block parameters
    m = 128
    n = 64
    # n_vecs = 9
    n_bits = 128 * n_vecs

    # Main memory results buffer
    # max_results = 2**16
    max_results = 16384
    words_per_result = 4

    # Every word of the results array starts out as 12
    mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)])
    #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I')
    # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data))

    mm_results = spuiter.memory_desc('I')
    #mm_results.from_array(mm_results_buffer)
    mm_results.from_array(mm_results_data)

    mm_save.set_md_save_buffer(mm_results)

    # Local Results buffer
    buffer_size = var.SignedWord(16384)
    # presumably the first local store address past the x/y input blocks --
    # TODO confirm against the TanimotoBlock memory layout
    buffer_addr = var.SignedWord(m * n * n_vecs * 4)
    ls_results = spuiter.memory_desc('B')
    ls_results.set_size_reg(buffer_size)
    ls_results.set_addr_reg(buffer_addr)

    ls_save.set_md_results(ls_results)
    ls_save.set_mm_save_op(mm_save)

    # Setup the TanimotoBlock class
    tb.set_n_bits(n_bits)
    tb.set_block_size(m, n)

    # x vectors start at address 0, y vectors at m * n_vecs * 16
    tb.set_x_addr(0)
    tb.set_y_addr(m * n_vecs * 16)
    tb.set_save_op(ls_save)

    # Main test loop
    n_samples = 10000
    for samples in spuiter.syn_iter(code, n_samples):
        tb.synthesize(code)

    # Post buffer_size to the outbound mailbox; the host loop below waits
    # for this value before it stops the timer.
    spu.wrch(buffer_size, dma.SPU_WrOutMbox)

    spu.stop(0x2000)

    # "Function" Calls
    # (emitted after the stop instruction above)
    ls_save.block()
    mm_save.block()

    # code.print_code()
    start = time.time()
    # NOTE(review): other tests in this file pass mode='async' instead, and
    # `async` is a reserved word from Python 3.7 on -- confirm which keyword
    # this Processor.execute expects.
    spe_id = proc.execute(code, async=True)

    # Spin until the SPU has written to its outbound mailbox
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
    stop = time.time()

    # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data))

    proc.join(spe_id)

    # Throughput figures, all scaled by 1e9
    total = stop - start
    bits_sec = (m * n * n_bits * n_samples) / total / 1e9
    ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
    insts_per_compare = 56
    gops = (m * n * n_vecs * n_samples *
            ops_per_compare ) / total / 1e9
    ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9
    print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
        total, bits_sec, gops, ginsts, code.size())
    return
def synthesize(self, code):
    """
    Synthesize the block comparison: for each of the self._m x vectors,
    load it and run the Tanimoto kernel against each of the self._n y
    vectors, optionally passing every result to the configured save
    operation.

    The configuration is validated before the active instruction stream
    is switched, and a previously active stream (if any) is restored even
    when synthesis fails part way through.

    Parameters:
      code -- instruction stream that receives the generated instructions.

    Raises:
      Exception -- if x_addr, y_addr, n_bits, m or n has not been set.
    """
    # Sanity checks (done first so a configuration error cannot leave the
    # global active code pointing at `code`)
    if self._x_addr is None:
        raise Exception("Please set x_addr")
    if self._y_addr is None:
        raise Exception("Please set y_addr")
    if self._n_bits is None:
        raise Exception("Please set n_bits")
    if self._m is None:
        raise Exception("Please set m")
    if self._n is None:
        raise Exception("Please set n")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    try:
        # Acquire registers for the bit vectors and result.  Floor division
        # keeps the register count an integer on every Python version.
        # NOTE(review): these registers are not released in this method --
        # confirm the caller or the stream owns their lifetime.
        n_vecs = self._n_bits // 128
        x_regs = [code.acquire_register() for i in range(n_vecs)]
        y_regs = [code.acquire_register() for i in range(n_vecs)]
        result = code.acquire_register()

        x_addr = var.Word()
        y_addr = var.Word()

        # The threshold and compare registers are only needed when results
        # are saved; the threshold defaults to 0.0.
        if self._save_op is not None:
            if self._threshold is not None:
                threshold = var.SingleFloat(self._threshold)
            else:
                threshold = var.SingleFloat(0.0)
            bcmp = var.Word(0)

        # Setup the Tanimoto kernel
        tan = Tanimoto()
        tan.set_n_bits(self._n_bits)
        tan.set_x_regs(x_regs)
        tan.set_y_regs(y_regs)
        tan.set_result(result)
        tan.synthesize_constants(code)

        # Setup the save op
        save_op = self._save_op
        if save_op is not None:
            save_op.setup()

        # Create the iterators
        xiter = spuiter.syn_iter(code, self._m)
        yiter = spuiter.syn_iter(code, self._n)

        # Synthesize the block comparison loops
        x_addr.v = self._x_addr

        for x_off in xiter:
            # NOTE(review): the address is advanced before the load, in both
            # loops -- confirm the first vector is meant to start one stride
            # past the base address.
            x_addr.v = x_addr + 16 * n_vecs
            y_addr.v = self._y_addr

            self._load_bit_vector(x_addr, x_regs)

            for y_off in yiter:
                y_addr.v = y_addr + 16 * n_vecs

                self._load_bit_vector(y_addr, y_regs)
                tan.synthesize(code)

                if save_op is not None:
                    # bcmp = (result > threshold); the save op receives the
                    # comparison and the loop offsets
                    spu.fcgt(bcmp, result, threshold)
                    save_op.test(bcmp, result, x_off, y_off)
        # /x_off
    finally:
        if old_code is not None:
            spu.set_active_code(old_code)

    return
def synthesize(self, code):
    """
    Synthesize the SPU code for one point: iterate x = r * x * (1 - x)
    for self.max_init warm-up steps, then for self.max_n steps accumulate
    log(abs(r - 2 * r * x)) and finally divide the total, storing the
    quotient in self.result.

    The previously active instruction stream is restored on exit (the
    earlier version re-activated `code` instead, leaving the caller's
    stream replaced).

    Parameters:
      code -- instruction stream that receives the generated instructions.
    """
    self._check_inputs()

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    zero = var.Word(reg=code.r_zero)
    one = self.log.consts['ONE']
    two = self.consts['TWO']

    x = var.Word(self.x0)
    r = var.Word(0)
    is_positive = var.Word(0)  # comparison mask (was `cmp`, a builtin name)
    x_neg = var.Word(0)
    # NOTE(review): the divisor is built from self.max_init although the
    # comment at the final division says max_n -- confirm which is intended.
    fmax = var.Word(self.max_init)
    temp = var.SingleFloat()

    # Convert the integer count in fmax to floating point
    # (presumably 155 encodes a scale of zero -- TODO confirm).
    fmax.v = spu.cuflt.ex(fmax, 155)

    # Init
    for i in spuiter.syn_iter(code, self.max_init):
        # x = r[i % r_max] * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

    # if x == float('-infinity'):
    #   return -10.0

    # Derive Exponent
    total = var.Word(0)
    logx = var.SingleFloat()

    for i in spuiter.syn_iter(code, self.max_n):
        # x = ri * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

        # logx = ri - 2.0 * ri * x
        logx.v = spu.fm.ex(two, x)
        logx.v = spu.fm.ex(r, logx)
        logx.v = spu.fs.ex(r, logx)

        # abs(logx): keep logx where it is > 0, otherwise take 0 - logx
        x_neg.v = spu.fs.ex(zero, logx)
        is_positive.v = spu.fcgt.ex(logx, zero)
        logx.v = spu.selb.ex(x_neg, logx, is_positive)
        # logx.v = spu.selb.ex(logx, x_neg, cmp)

        # log(logx), computed in place
        self.log.set_result(logx)
        self.log.set_x(logx)
        self.log.synthesize(code)

        # total = total + logx
        total.v = spu.fa.ex(total, logx)

    # return total / float(max_n)
    fdiv(code, self.result, total, fmax, one)

    # Restore the stream that was active on entry
    spu.set_active_code(old_code)
    return
def SpeedTest(n_spus=6, n_floats=6): """ Get a rough estimate of the maximum flop count. On a PS3 using all 6 spus, this is 152 GFlops. """ if n_spus > 1: code = ParallelInstructionStream() else: code = InstructionStream() spu.set_active_code(code) f_range = range(n_floats) a = [SingleFloat(0.0) for i in f_range] b = [SingleFloat(0.0) for i in f_range] c = [SingleFloat(0.0) for i in f_range] t = [SingleFloat(0.0) for i in f_range] outer = 2**12 inner = 2**16 unroll = 128 fuse = 2 simd = 4 for x in syn_iter(code, outer): for y in syn_iter(code, inner): for u in range(unroll): for i in f_range: t[i].v = spu.fma.ex(a[i], b[i], c[i]) # Run the synthetic program and copy the results back to the array # TODO - AWF - use the SPU decrementers to time this proc = Processor() start = time.time() r = proc.execute(code, n_spus=n_spus) stop = time.time() total = stop - start n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long( fuse) * long(simd) * long(n_spus) print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) # # Run the native program and copy the results back to the array # outer = 2**14 # inner = 2**16 # unroll = 1 # fuse = 1 # simd = 1 # proc = Processor() # # ncode = NativeInstructionStream("a.out") # start = time.time() # r = proc.execute(ncode, n_spus = n_spus) # stop = time.time() # total = stop - start # n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus) # print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) results = """ --> No optimizations Executing native code: a.out 14.805322 sec, 20.89 GFlops --> Synthetic Platform: linux.spre_linux_spu no raw data 65.023350 sec, 152.19 GFlops --> -O3 (fuse: 2, simd: 4) Executing native code: a.out 7.407939 sec, 41.74 GFlops --> -O3 (fuse: 1, simd: 1) Executing native code: a.out 7.403702 sec, 5.22 GFlops """ return
def SpeedTest(n_spus = 6, n_floats = 6): """ Get a rough estimate of the maximum flop count. On a PS3 using all 6 spus, this is 152 GFlops. """ if n_spus > 1: prgm = env.ParallelProgram() else: prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) f_range = range(n_floats) a = [SingleFloat(0.0) for i in f_range] b = [SingleFloat(0.0) for i in f_range] c = [SingleFloat(0.0) for i in f_range] t = [SingleFloat(0.0) for i in f_range] outer = 2**12 inner = 2**16 unroll = 128 fuse = 2 simd = 4 for x in syn_iter(code, outer): for y in syn_iter(code, inner): for u in xrange(unroll): for i in f_range: t[i].v = spu.fma.ex(a[i], b[i], c[i]) # Run the synthetic program and copy the results back to the array # TODO - AWF - use the SPU decrementers to time this proc = env.Processor() prgm += code start = time.time() r = proc.execute(prgm, n_spus = n_spus) stop = time.time() total = stop - start n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus) print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) # # Run the native program and copy the results back to the array # outer = 2**14 # inner = 2**16 # unroll = 1 # fuse = 1 # simd = 1 # proc = Processor() # # ncode = NativeInstructionStream("a.out") # start = time.time() # r = proc.execute(ncode, n_spus = n_spus) # stop = time.time() # total = stop - start # n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus) # print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) results = """ --> No optimizations Executing native code: a.out 14.805322 sec, 20.89 GFlops --> Synthetic Platform: linux.spre_linux_spu no raw data 65.023350 sec, 152.19 GFlops --> -O3 (fuse: 2, simd: 4) Executing native code: a.out 7.407939 sec, 41.74 GFlops --> -O3 (fuse: 1, simd: 1) Executing native code: a.out 7.403702 sec, 5.22 GFlops """ return
def synthesize(self, code):
    """
    Synthesize the SPU code for one point: iterate x = r * x * (1 - x)
    for self.max_init warm-up steps, then for self.max_n steps accumulate
    log(abs(r - 2 * r * x)) and finally divide the total, storing the
    quotient in self.result.

    The previously active instruction stream is restored on exit (the
    earlier version re-activated `code` instead, leaving the caller's
    stream replaced).

    Parameters:
      code -- instruction stream that receives the generated instructions.
    """
    self._check_inputs()

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    zero = var.Word(reg = code.r_zero)
    one = self.log.consts['ONE']
    two = self.consts['TWO']

    x = var.Word(self.x0)
    r = var.Word(0)
    is_positive = var.Word(0)  # comparison mask (was `cmp`, a builtin name)
    x_neg = var.Word(0)
    # NOTE(review): the divisor is built from self.max_init although the
    # comment at the final division says max_n -- confirm which is intended.
    fmax = var.Word(self.max_init)
    temp = var.SingleFloat()

    # Convert the integer count in fmax to floating point
    # (presumably 155 encodes a scale of zero -- TODO confirm).
    fmax.v = spu.cuflt.ex(fmax, 155)

    # Init
    for i in spuiter.syn_iter(code, self.max_init):
        # x = r[i % r_max] * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

    # if x == float('-infinity'):
    #   return -10.0

    # Derive Exponent
    total = var.Word(0)
    logx = var.SingleFloat()

    for i in spuiter.syn_iter(code, self.max_n):
        # x = ri * x * (1.0 - x)
        self._next_r(r)
        temp.v = spu.fs.ex(one, x)
        x.v = spu.fm.ex(x, temp)
        x.v = spu.fm.ex(r, x)

        # logx = ri - 2.0 * ri * x
        logx.v = spu.fm.ex(two, x)
        logx.v = spu.fm.ex(r, logx)
        logx.v = spu.fs.ex(r, logx)

        # abs(logx): keep logx where it is > 0, otherwise take 0 - logx
        x_neg.v = spu.fs.ex(zero, logx)
        is_positive.v = spu.fcgt.ex(logx, zero)
        logx.v = spu.selb.ex(x_neg, logx, is_positive)
        # logx.v = spu.selb.ex(logx, x_neg, cmp)

        # log(logx), computed in place
        self.log.set_result(logx)
        self.log.set_x(logx)
        self.log.synthesize(code)

        # total = total + logx
        total.v = spu.fa.ex(total, logx)

    # return total / float(max_n)
    fdiv(code, self.result, total, fmax, one)

    # Restore the stream that was active on entry
    spu.set_active_code(old_code)
    return