def RunTest(test):
    """Build, print, and execute the SPU program produced by ``test``.

    A fresh InstructionStream is installed as the active code stream for
    ``spu``, ``test`` is called with no arguments, the stream is printed
    with print_code(), and it is then executed on a new Processor.

    Args:
        test: zero-argument callable; presumably it emits instructions
            into the active stream (confirm against the callers).
    """
    from corepy.arch.spu.platform import InstructionStream, Processor

    stream = InstructionStream()
    spu.set_active_code(stream)

    # Let the test populate the stream before it is dumped and run.
    test()
    stream.print_code()

    Processor().execute(stream)
def TestFloatArray():
    """Check vector SingleFloat addition followed by a horizontal sum.

    Two four-element SingleFloat values are added with spu.fa, then the
    result is accumulated into the fp_return register four times while
    being rotated by 4 bytes with rotqbyi after each step.  The value
    returned by execute(mode='fp') must equal the sum of all eight input
    floats as computed on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    # Not referenced below; the import is kept exactly as the test had it.
    import corepy.arch.spu.lib.dma as dma

    lhs = (1.0, 2.0, 3.0, 4.0)
    rhs = (0.5, 1.5, 2.5, 3.5)

    stream = InstructionStream()
    spu.set_active_code(stream)

    x = SingleFloat(list(lhs))
    y = SingleFloat(list(rhs))
    pair_sum = SingleFloat(0.0)
    pair_sum.v = spu.fa.ex(x, y)

    acc = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=stream.fp_return)

    # One accumulate + rotate per 4-byte element of the quadword.
    for _ in range(4):
        acc.v = spu.fa.ex(pair_sum, acc)
        spu.rotqbyi(pair_sum, pair_sum, 4)

    result = Processor().execute(stream, mode='fp')

    # Host-side reference, going through single-precision arrays just like
    # the original check did.
    expected = 0.0
    for a, b in zip(array.array('f', lhs), array.array('f', rhs)):
        expected += a + b

    assert result == expected
def DoubleBufferExample(n_spus=6):
    """
    Subtract 2 from each word of a 30000-element array via stream_buffer.

    stream_buffer is an iterator that streams data from main memory to SPU
    local store in blocked buffers.  The buffers can be managed using single
    or double buffering semantics; this example asks for
    buffer_mode='double'.  The induction variable produced by the iterator
    is the address of the current buffer.

    Note: stream_buffer was designed before memory descriptors and has not
    been updated to support them yet.  The interface will change slightly
    when the memory classes are finalized.

    Args:
        n_spus: number of SPUs passed to Processor.execute().  A value
            greater than 1 selects a ParallelInstructionStream and wraps
            the stream iterator in parallel().

    Returns:
        None.  Every element that does not hold the expected value after
        the run is reported on stdout; no exception is raised for a
        mismatch.
    """
    n = 30000
    buffer_size = 16

    # Source data: the unsigned words 0 .. n-1, addressed directly through
    # the array's underlying memory.
    a = array.array('I', range(n))
    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1:
        code = ParallelInstructionStream()
    else:
        code = InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                           buffer_mode='double', save=True)
    if n_spus > 1:
        stream = parallel(stream)

    # Loop over the buffers
    for buf_addr in stream:
        # Iterator that computes the address offsets within the buffer.
        # Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buf_addr))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buf_addr))

    # Run the synthetic program; it updates the array in place.
    proc = Processor()
    proc.execute(code, n_spus=n_spus)

    # Elements 0 and 1 are skipped: i - 2 is negative there and can never
    # equal a value read from an unsigned array.  The comparison is explicit
    # (not an assert inside a bare except) so that it still runs under
    # ``python -O`` and does not swallow unrelated exceptions such as
    # KeyboardInterrupt.
    for i in range(2, len(a)):
        expected = i - 2
        if a[i] != expected:
            # Single parenthesised argument: prints the same text as the
            # Python 2 statement ``print 'DoubleBuffer error:', a[i], i - 2``.
            print('%s %s %s' % ('DoubleBuffer error:', a[i], expected))
def SimpleSPU():
    """
    Run two tiny SPU programs and check their results.

    Program 1 zeroes the gp_return register, adds 11 and then 31, compares
    the result with 42, and stops with 0x100A when the comparison holds
    (0x100B otherwise).  The run must report 42 as the integer result and
    0x100A as the stop code.

    Program 2 loads 3.14 into the fp_return register with util.load_float
    and prints whatever execute(mode='fp') returns.
    """
    stream = InstructionStream()
    proc = Processor()
    spu.set_active_code(stream)

    # Work directly in the integer return register; only the comparison
    # flag needs a freshly acquired register.
    acc = stream.gp_return
    flag = stream.acquire_register()

    spu.xor(acc, acc, acc)     # acc = 0
    spu.ai(acc, acc, 11)       # acc += 11
    spu.ai(acc, acc, 31)       # acc += 31
    spu.ceqi(flag, acc, 42)    # flag = (acc == 42)

    # When flag is all zeros the branch skips over stop(0x100A).
    spu.brz(flag, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    stream.print_code(hex=True)
    outcome = proc.execute(stream, mode='int', stop=True, debug=True)
    assert outcome[0] == 42
    assert outcome[1] == 0x100A

    # Second program: float load into the fp return register.
    stream = InstructionStream()
    spu.set_active_code(stream)
    util.load_float(stream, stream.fp_return, 3.14)

    stream.print_code(hex=True)
    outcome = proc.execute(stream, mode='fp')
    print(outcome)
def TestFloatScalar():
    """Check that adding SingleFloat(1.0) and SingleFloat(2.0) yields 3.0.

    The sum is produced with spu.fa into a SingleFloat bound to the
    fp_return register, and the value returned by execute(mode='fp') is
    compared with the host-side result of 1.0 + 2.0.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    # Not referenced below; the import is kept exactly as the test had it.
    import corepy.arch.spu.lib.dma as dma

    stream = InstructionStream()
    spu.set_active_code(stream)

    lhs = SingleFloat(1.0)
    rhs = SingleFloat(2.0)
    out = SingleFloat(0.0, reg=stream.fp_return)
    out.v = spu.fa.ex(lhs, rhs)

    result = Processor().execute(stream, mode='fp')

    expected = 1.0 + 2.0
    assert result == expected
def bi_bug():
    """
    Exercise spu.bi by branching to a word stored at address 0x0.

    The program holds 0x200D in one SignedWord and 0x0 in another, stores
    the first with stqa at address 0x0, branches through the second with
    bi, and ends with stop(0x200A).  The run passes when execute() returns
    0xD.

    NOTE(review): presumably 0x200D is meant to act as a stop instruction
    once stored, so a return of 0xD shows the branch was taken rather than
    falling through to stop(0x200A) -- confirm against the SPU ISA.
    """
    stream = InstructionStream()
    proc = Processor()
    spu.set_active_code(stream)

    # Both constants are created before any store/branch is emitted; the
    # emission order below is intentionally left untouched.
    stored_word = SignedWord(0x200D)
    target = SignedWord(0x0)

    spu.stqa(stored_word, 0x0)
    spu.bi(target)
    spu.stop(0x200A)

    status = proc.execute(stream)
    assert status == 0xD
def bi_bug():
    """
    Store a word at address 0x0, branch to it with bi, expect 0xD back.

    Steps emitted into the instruction stream:
      1. a SignedWord holding 0x200D and a SignedWord holding 0x0;
      2. stqa of the first value to address 0x0;
      3. bi through the second value;
      4. stop(0x200A) as the fall-through instruction.

    The check is that execute() returns 0xD.

    NOTE(review): the names used by the original author (stop_inst,
    stop_addr) suggest 0x200D is intended to be executed as a stop
    instruction at the branch target -- confirm against the SPU ISA.
    """
    program = InstructionStream()
    proc = Processor()
    spu.set_active_code(program)

    # Keep the original creation and emission order exactly.
    word_to_store = SignedWord(0x200D)
    branch_target = SignedWord(0x0)

    spu.stqa(word_to_store, 0x0)
    spu.bi(branch_target)
    spu.stop(0x200A)

    returned = proc.execute(program)
    assert (returned == 0xD)
def SimpleSPU():
    """
    Run two small label-based SPU programs and check their results.

    Program 1 computes 11 + 31 in the gp_return register, compares it with
    42, and uses a labelled brz (preceded by an hbrr naming the branch and
    its target) to choose between stop(0x100A) and stop(0x100B).  The run
    must report 42 as the integer result and 0x100A as the stop code.

    Program 2 is a counting loop: a counter is incremented from 0 until it
    equals 10, and after each increment it is added to the gp_return
    register.  The run must report 55.
    """
    stream = InstructionStream()
    proc = Processor()
    spu.set_active_code(stream)

    # Work directly in the integer return register; only the comparison
    # flag needs a freshly acquired register.
    acc = stream.gp_return
    flag = stream.acquire_register()

    branch_label = stream.get_label("BRZ")
    skip_label = stream.get_label("SKIP")

    spu.hbrr(branch_label, skip_label)
    spu.xor(acc, acc, acc)     # acc = 0
    spu.ai(acc, acc, 11)       # acc += 11
    spu.ai(acc, acc, 31)       # acc += 31
    spu.ceqi(flag, acc, 42)    # flag = (acc == 42)

    # When flag is all zeros the branch skips over stop(0x100A).
    stream.add(branch_label)
    spu.brz(flag, skip_label)
    spu.stop(0x100A)
    stream.add(skip_label)
    spu.stop(0x100B)

    stream.print_code(hex=True, pro=True, epi=True)
    outcome = proc.execute(stream, mode='int', stop=True)
    # Single parenthesised argument: same output as ``print "ret", r``.
    print("%s %s" % ("ret", outcome))
    assert outcome[0] == 42
    assert outcome[1] == 0x100A

    # Second program: sum the counter values 1..10 into the return register.
    stream = InstructionStream()
    spu.set_active_code(stream)

    loop_label = stream.get_label("LOOP")
    break_label = stream.get_label("BREAK")

    counter = stream.acquire_register()
    limit = stream.acquire_register()
    done = stream.acquire_register()
    total = stream.gp_return

    spu.ori(total, stream.r_zero, 0)
    spu.ori(counter, stream.r_zero, 0)
    util.load_word(stream, limit, 10)

    stream.add(loop_label)
    spu.ceq(done, counter, limit)
    spu.brnz(done, break_label)    # leave once counter == limit
    spu.ai(counter, counter, 1)
    spu.a(total, total, counter)
    spu.br(loop_label)
    stream.add(break_label)

    stream.print_code()
    outcome = proc.execute(stream, mode='int', stop=True)
    print("%s %s" % ("ret", outcome))
    assert outcome[0] == 55
def SpeedTest(n_spus=6, n_floats=6):
    """
    Get a rough estimate of the maximum flop count.  On a PS3 using all 6
    spus, this is 152 GFlops.

    The synthetic program is two nested syn_iter loops (2**12 outer, 2**16
    inner iterations) whose body is 128 unrolled copies of one spu.fma per
    float vector.  The wall-clock time of Processor.execute() is measured
    with time.time() and the derived rate is printed.

    Args:
        n_spus: number of SPUs passed to execute(); a value greater than 1
            selects a ParallelInstructionStream.
        n_floats: number of independent SingleFloat operand sets used in
            the loop body.
    """
    if n_spus > 1:
        stream = ParallelInstructionStream()
    else:
        stream = InstructionStream()
    spu.set_active_code(stream)

    def zero_vectors():
        # One zero-initialised SingleFloat per requested operand set.
        return [SingleFloat(0.0) for _ in range(n_floats)]

    # Creation order (a, b, c, then the targets) matches the original.
    a = zero_vectors()
    b = zero_vectors()
    c = zero_vectors()
    t = zero_vectors()

    outer = 2**12
    inner = 2**16
    unroll = 128
    fuse = 2    # counted as two flops per fma in the estimate
    simd = 4    # counted as four lanes per vector in the estimate

    operand_sets = list(zip(t, a, b, c))

    for _outer in syn_iter(stream, outer):
        for _inner in syn_iter(stream, inner):
            for _copy in range(unroll):
                for dst, fa, fb, fc in operand_sets:
                    dst.v = spu.fma.ex(fa, fb, fc)

    # Run the synthetic program and time it from the host.
    # TODO - AWF - use the SPU decrementers to time this
    proc = Processor()

    start = time.time()
    proc.execute(stream, n_spus=n_spus)
    stop = time.time()

    total = stop - start

    # Same left-to-right product as before, with the same long() conversions.
    n_ops = long(outer) * inner
    for factor in (unroll, n_floats, fuse, simd, n_spus):
        n_ops = n_ops * long(factor)

    print('%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9))

    # Disabled native-code comparison, kept for reference because the
    # numbers recorded below were measured with it:
    #
    # # Run the native program and copy the results back to the array
    # outer = 2**14
    # inner = 2**16
    # unroll = 1
    # fuse = 1
    # simd = 1
    # proc = Processor()
    # # ncode = NativeInstructionStream("a.out")
    # start = time.time()
    # r = proc.execute(ncode, n_spus = n_spus)
    # stop = time.time()
    # total = stop - start
    # n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
    # print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    # Recorded measurements; the string is not used by the code.
    results = (
        " --> No optimizations Executing native code: a.out "
        "14.805322 sec, 20.89 GFlops "
        "--> Synthetic Platform: linux.spre_linux_spu no raw data "
        "65.023350 sec, 152.19 GFlops "
        "--> -O3 (fuse: 2, simd: 4) Executing native code: a.out "
        "7.407939 sec, 41.74 GFlops "
        "--> -O3 (fuse: 1, simd: 1) Executing native code: a.out "
        "7.403702 sec, 5.22 GFlops "
    )