예제 #1
0
def RunTest(test):
  """
  Build, print and execute the SPU program emitted by `test`.

  A fresh InstructionStream is made the active code object of the
  module-level `spu` binding, `test` is called with no arguments, and the
  stream is then printed and run on a new Processor.  The value returned
  by the execution is discarded.

  NOTE(review): `test` is presumably expected to emit its instructions
  into the active stream -- confirm against the callers.
  """
  from corepy.arch.spu.platform import InstructionStream, Processor

  stream = InstructionStream()
  spu.set_active_code(stream)

  # The callback receives no arguments; it only sees the active stream.
  test()

  stream.print_code()
  Processor().execute(stream)
예제 #2
0
def TestFloatArray():
  """
  Add two 4-element float vectors on the SPU and check the reduced total.

  The synthetic program adds the two vectors, then four times adds the
  pairwise sums into the floating point return register and rotates the
  pairwise sums by 4 bytes.  The value returned by the processor is
  compared with the same total computed on the host.

  NOTE(review): the rotate is presumably by one float lane, so that the
  returned lane ends up holding the sum of all four pairwise sums --
  confirm against the SPU ISA.
  """
  from corepy.arch.spu.platform import InstructionStream, Processor
  import corepy.arch.spu.lib.dma as dma

  stream = InstructionStream()
  spu.set_active_code(stream)

  lhs = SingleFloat([1.0, 2.0, 3.0, 4.0])
  rhs = SingleFloat([0.5, 1.5, 2.5, 3.5])
  pair_sums = SingleFloat(0.0)

  pair_sums.v = spu.fa.ex(lhs, rhs)

  # The accumulator is bound to the register returned in 'fp' mode.
  acc = SingleFloat([0.0, 0.0, 0.0, 0.0], reg = stream.fp_return)

  for _ in range(4):
    acc.v = spu.fa.ex(pair_sums, acc)
    spu.rotqbyi(pair_sums, pair_sums, 4)

  result = Processor().execute(stream, mode='fp')

  # Host-side reference: the same element-wise sums, accumulated in order.
  lhs_ref = array.array('f', [1.0, 2.0, 3.0, 4.0])
  rhs_ref = array.array('f', [0.5, 1.5, 2.5, 3.5])
  expected = 0.0
  for left, right in zip(lhs_ref, rhs_ref):
    expected += left + right

  assert result == expected
예제 #3
0
파일: spu_basics.py 프로젝트: tmaone/efi
def DoubleBufferExample(n_spus=6):
    """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
    n = 30000
    buffer_size = 16

    # Create an array and align the data
    a = array.array('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1: code = ParallelInstructionStream()
    else: code = InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:

        # Create an iterators that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program and copy the results back to the array
    proc = Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(2, len(a)):
        try:
            assert (a[i] == i - 2)
        except:
            print 'DoubleBuffer error:', a[i], i - 2

    return
예제 #4
0
파일: spu_basics.py 프로젝트: tmaone/efi
def SimpleSPU():
    """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x    = code.acquire_register()
    x = code.gp_return
    test = code.acquire_register()

    spu.xor(x, x, x)  # zero x
    spu.ai(x, x, 11)  # x = x + 11
    spu.ai(x, x, 31)  # x = x + 31

    spu.ceqi(test, x, 42)  # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    spu.brz(test, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    code.print_code(hex=True)
    r = proc.execute(code, mode='int', stop=True, debug=True)
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    code = InstructionStream()
    spu.set_active_code(code)

    util.load_float(code, code.fp_return, 3.14)

    code.print_code(hex=True)
    r = proc.execute(code, mode='fp')
    print r
    return
예제 #5
0
def TestFloatScalar():
  """
  Add two scalar floats on the SPU and check the returned value.

  The synthetic program adds SingleFloat(1.0) and SingleFloat(2.0) into a
  variable bound to the floating point return register; the value returned
  by the processor in 'fp' mode must equal 1.0 + 2.0.
  """
  from corepy.arch.spu.platform import InstructionStream, Processor
  import corepy.arch.spu.lib.dma as dma

  stream = InstructionStream()
  spu.set_active_code(stream)

  lhs = SingleFloat(1.0)
  rhs = SingleFloat(2.0)
  total = SingleFloat(0.0, reg = stream.fp_return)

  total.v = spu.fa.ex(lhs, rhs)

  result = Processor().execute(stream, mode='fp')
  expected = 1.0 + 2.0
  assert result == expected
예제 #6
0
파일: bi.py 프로젝트: tmaone/efi
def bi_bug():
    """
    Exercise an indirect branch (bi) to address 0 after storing a word
    there.

    The program stores the word 0x200D to address 0x0 with stqa, branches
    indirectly through a register holding 0, and ends with stop(0x200A).
    The value returned by the processor is asserted to be 0xD.

    NOTE(review): 0x200D is presumably meant to act as a stop instruction
    placed at address 0, so that the branch lands on it and the trailing
    stop(0x200A) is never reached -- confirm against the SPU ISA.
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Word to plant at address 0, and the branch target (also 0).
    planted_word = SignedWord(0x200D)
    target = SignedWord(0x0)

    spu.stqa(planted_word, 0x0)
    spu.bi(target)
    spu.stop(0x200A)

    status = proc.execute(code)
    assert status == 0xD
예제 #7
0
파일: bi.py 프로젝트: KapilRijhwani/corepy
def bi_bug():
    """
    Check that an indirect branch to address 0 runs the word stored there.

    Emits three instructions: a stqa that stores the word 0x200D to address
    0x0, a bi through a register holding 0, and a stop(0x200A).  The
    processor's return value must be 0xD.

    NOTE(review): this presumably relies on 0x200D being a stop instruction
    once it sits at address 0, which would make the final stop(0x200A)
    unreachable on success -- confirm against the SPU ISA.
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Both values live in registers: the word to store and the jump target.
    injected = SignedWord(0x200D)
    jump_to = SignedWord(0x0)

    spu.stqa(injected, 0x0)
    spu.bi(jump_to)
    spu.stop(0x200A)

    ret = proc.execute(code)
    assert ret == 0xD
예제 #8
0
파일: spu_labels.py 프로젝트: tmaone/efi
def SimpleSPU():
    """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x    = code.acquire_register()
    x = code.gp_return
    test = code.acquire_register()

    lbl_brz = code.get_label("BRZ")
    lbl_skip = code.get_label("SKIP")

    spu.hbrr(lbl_brz, lbl_skip)
    spu.xor(x, x, x)  # zero x
    spu.ai(x, x, 11)  # x = x + 11
    spu.ai(x, x, 31)  # x = x + 31

    spu.ceqi(test, x, 42)  # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    code.add(lbl_brz)
    spu.brz(test, lbl_skip)
    spu.stop(0x100A)
    code.add(lbl_skip)
    spu.stop(0x100B)

    code.print_code(hex=True, pro=True, epi=True)
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    code = InstructionStream()
    spu.set_active_code(code)

    lbl_loop = code.get_label("LOOP")
    lbl_break = code.get_label("BREAK")

    r_cnt = code.acquire_register()
    r_stop = code.acquire_register()
    r_cmp = code.acquire_register()
    r_foo = code.gp_return

    spu.ori(r_foo, code.r_zero, 0)
    spu.ori(r_cnt, code.r_zero, 0)
    util.load_word(code, r_stop, 10)

    code.add(lbl_loop)

    spu.ceq(r_cmp, r_cnt, r_stop)
    spu.brnz(r_cmp, lbl_break)
    spu.ai(r_cnt, r_cnt, 1)

    spu.a(r_foo, r_foo, r_cnt)

    spu.br(lbl_loop)
    code.add(lbl_break)

    code.print_code()
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    assert (r[0] == 55)

    return
예제 #9
0
파일: spu_basics.py 프로젝트: tmaone/efi
def SpeedTest(n_spus=6, n_floats=6):
    """
  Get a rough estimate of the maximum flop count.
  On a PS3 using all 6 spus, this is 152 GFlops.
  """

    if n_spus > 1: code = ParallelInstructionStream()
    else: code = InstructionStream()

    spu.set_active_code(code)

    f_range = range(n_floats)
    a = [SingleFloat(0.0) for i in f_range]
    b = [SingleFloat(0.0) for i in f_range]
    c = [SingleFloat(0.0) for i in f_range]
    t = [SingleFloat(0.0) for i in f_range]

    outer = 2**12
    inner = 2**16
    unroll = 128
    fuse = 2
    simd = 4
    for x in syn_iter(code, outer):
        for y in syn_iter(code, inner):
            for u in range(unroll):
                for i in f_range:
                    t[i].v = spu.fma.ex(a[i], b[i], c[i])

    # Run the synthetic program and copy the results back to the array
    # TODO - AWF - use the SPU decrementers to time this
    proc = Processor()
    start = time.time()
    r = proc.execute(code, n_spus=n_spus)
    stop = time.time()
    total = stop - start
    n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(
        fuse) * long(simd) * long(n_spus)
    print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    #   # Run the native program and copy the results back to the array
    #   outer = 2**14
    #   inner = 2**16
    #   unroll = 1
    #   fuse = 1
    #   simd = 1

    #   proc = Processor()
    #   # ncode = NativeInstructionStream("a.out")
    #   start = time.time()
    #   r = proc.execute(ncode, n_spus = n_spus)
    #   stop = time.time()
    #   total = stop - start
    #   n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
    #   print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    results = """
  --> No optimizations
  Executing native code: a.out
  14.805322 sec, 20.89 GFlops

  --> Synthetic
  Platform: linux.spre_linux_spu
  no raw data
  65.023350 sec, 152.19 GFlops

  --> -O3 (fuse: 2, simd: 4)
  Executing native code: a.out
  7.407939 sec, 41.74 GFlops

  --> -O3 (fuse: 1, simd: 1)
  Executing native code: a.out
  7.403702 sec, 5.22 GFlops
  """
    return