예제 #1
0
def TestVecIter(n_spus=1):
    n = 1024
    a = extarray.extarray('I', range(n))

    buffer_size = 16

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = var.SignedWord(0, code)

    stream = stream_buffer(code,
                           a.buffer_info()[0],
                           n * 4,
                           buffer_size,
                           0,
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)

    for buffer in stream:
        for current in spu_vec_iter(code, md):
            current.v = current + current

    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(0, n):
        assert (a[i] == i + i)

    return
예제 #2
0
def DoubleBufferExample(n_spus=6):
    """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
    n = 30000
    buffer_size = 16

    # Create an array and align the data
    a = extarray.extarray('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:

        # Create an iterators that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program and copy the results back to the array
    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(2, len(a)):
        try:
            assert (a[i] == i - 2)
        except:
            print 'DoubleBuffer error:', a[i], i - 2

    return
예제 #3
0
def TestContinueLabel(n_spus=1):
    n = 1024
    a = extarray.extarray('I', range(n))

    buffer_size = 16

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code,
                           a.buffer_info()[0],
                           n * 4,
                           buffer_size,
                           0,
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buffer in stream:
        for current in lsa_iter:
            current.v = current + current

            test.v = (current == four)
            code.add(spu.gbb(test, test))
            #lbl_continue = code.add(spu.stop(0xC)) - 1 # Place holder for the continue
            #lsa_iter.add_continue(code, 0, lambda lbl, reg = test.reg: spu.brz(reg, lbl))
            code.add(spu.brz(test.reg, lsa_iter.continue_label))
            current.v = current + current

        #lsa_iter.add_continue(code, lbl_continue, lambda next, reg = test.reg: spu.brz(reg, next))

    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(0, n):
        if i >= 4:
            assert (a[i] == i + i)
        else:
            #print a[i]
            assert (a[i] == i * 4)
    return
예제 #4
0
def TestStreamBufferSingle(n_spus=1):
    n = 1024
    a = extarray.extarray('I', range(n))
    buffer_size = 128

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save=True)
    if n_spus > 1: stream = parallel(stream)

    #r_bufsize = code.acquire_register()
    #r_lsa = code.acquire_register()
    #r_current = code.acquire_register()

    for buffer in stream:
        #util.load_word(code, r_bufsize, buffer_size)
        #code.add(spu.il(r_lsa, 0))

        #loop = code.size()

        #code.add(spu.lqx(r_current, buffer, r_lsa))
        #code.add(spu.a(r_current, r_current, r_current))
        #code.add(spu.stqx(r_current, buffer, r_lsa))

        #code.add(spu.ai(r_bufsize, r_bufsize, -16))
        #code.add(spu.ai(r_lsa, r_lsa, 16))
        #code.add(spu.brnz(r_bufsize, loop - code.size()))

        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current + current
            #current.v = 5
            code.add(spu.stqx(current, lsa, buffer))

    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(0, n):
        assert (a[i] == i + i)

    return
예제 #5
0
def TestStreamBufferDouble(n_spus=1):
    n = 2048
    a = extarray.extarray('I', range(n))

    buffer_size = 32

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    n_bytes = n * 4
    #print 'addr 0x%(addr)x %(addr)d' % {'addr':a.buffer_info()[0]}, n_bytes, buffer_size

    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    for buffer in stream:
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current + current
            code.add(spu.stqx(current, lsa, buffer))

    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(0, len(a)):
        assert (a[i] == i + i)

    return
예제 #6
0
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
  import time
  # n_spus = 8
  # buffer_size = 16 # 16 ints/buffer
  # n_buffers   = 4  # 4 buffers/spu
  # n_buffers = size / buffer_size
  # size = buffer_size * n_buffers * n_spus
  # data = array.array('I', range(size + 2))

  #data = env.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16)

  code = env.ParallelInstructionStream()
  # code = env.InstructionStream()

  r_zero    = code.acquire_register()
  r_ea_data = code.acquire_register()
  r_ls_data = code.acquire_register()
  r_size    = code.acquire_register()
  r_tag     = code.acquire_register()  

  # Load zero
  util.load_word(code, r_zero, 0)

  # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0]))
  # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % (
  #   r_zero, r_ea_data, r_ls_data, r_size, r_tag)

  # Load the effective address
  if data.buffer_info()[0] % 16 == 0:
    util.load_word(code, r_ea_data, data.buffer_info()[0])
  else: 
    util.load_word(code, r_ea_data, data.buffer_info()[0] + 8)

  ea_start = data.buffer_info()[0]
  # Iterate over each buffer
  for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)):
    # ea = var.SignedWord(code = code, reg = r_ea_data)
  
    # print 'n_iters:', size / buffer_size
    # for i in syn_range(code, size / buffer_size):

    # code.add(spu.stop(0xB))
  
    # Load the size
    util.load_word(code, r_size, buffer_size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1<<12);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Increment the data values by 1 using an unrolled loop (no branches)
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    count = var.SignedWord(0, code)
    # Use an SPU iter
    for lsa in syn_iter(code, buffer_size * 4, 16):
      code.add(spu.lqx(current, r_zero, lsa))
      # code.add(spu.ai(1, r_current, r_current))
      current.v = current + current
      code.add(spu.stqx(current, r_zero, lsa))    
      count.v = count + 1

    code.add(spu.stqx(count, r_zero, 0))
  
    # code.release_register(r_current)
    current.release_registers(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Load the data into address 0
    dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1<<13);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);


    # code.add(spu.stop(0xB))

    # Update ea
    # ea.v = ea + (buffer_size * 4)
  # /for ea address 


  # Cleanup
  code.release_register(r_zero)
  code.release_register(r_ea_data)
  code.release_register(r_ls_data)  
  code.release_register(r_size)
  code.release_register(r_tag)  

  if not run_code:
    return code

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  proc = env.Processor()
  #data.copy_from(data_array.buffer_info()[0], len(data_array))  
  def print_blocks():
    for i in range(0, size, buffer_size):
      # print data[i:(i + buffer_size)]
      print data[i + buffer_size],
    print '' 
  
  # print_blocks()
  s = time.time()
  r = proc.execute(code, n_spus = n_spus)
  # r = proc.execute(code)
  t = time.time() - s
  # print_blocks()

  return t