예제 #1
0
def TestStreamBufferDouble(n_spus = 1):
  n = 2048
  a = extarray.extarray('I', range(n))
  
  buffer_size = 32

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  n_bytes = n * 4
  #print 'addr 0x%(addr)x %(addr)d' % {'addr':a.buffer_info()[0]}, n_bytes, buffer_size

  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  for buffer in stream:
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buffer))

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for i in range(0, len(a)):
    assert(a[i] == i + i)
  
  return
예제 #2
0
파일: spu_basics.py 프로젝트: tmaone/efi
def DoubleBufferExample(n_spus=6):
    """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
    n = 30000
    buffer_size = 16

    # Create an array and align the data
    a = array.array('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1: code = ParallelInstructionStream()
    else: code = InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:

        # Create an iterators that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program and copy the results back to the array
    proc = Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(2, len(a)):
        try:
            assert (a[i] == i - 2)
        except:
            print 'DoubleBuffer error:', a[i], i - 2

    return
예제 #3
0
def DoubleBufferExample(n_spus = 6):
  """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
  n = 30000
  buffer_size = 16

  # Create an array and align the data
  a = extarray.extarray('I', range(n))

  addr = a.buffer_info()[0]  
  n_bytes = n * 4

  if n_spus > 1:  code = env.ParallelInstructionStream()
  else:           code = env.InstructionStream()

  current = SignedWord(0, code)
  two = SignedWord(2, code)

  # Create the stream buffer, parallelizing it if using more than 1 SPU
  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  # Loop over the buffers
  for buffer in stream:

    # Create an iterators that computes the address offsets within the
    # buffer.  Note: this will be supported by var/vec iters soon.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current - two
      code.add(spu.stqx(current, lsa, buffer))

  # Run the synthetic program and copy the results back to the array 
  proc = env.Processor()
  r = proc.execute(code, n_spus = n_spus)

  for i in range(2, len(a)):
    try:
      assert(a[i] == i - 2)
    except:
      print 'DoubleBuffer error:', a[i], i - 2
  
  return
예제 #4
0
def TestStreamBufferSingle(n_spus = 1):
  n = 1024
  a = extarray.extarray('I', range(n))
  buffer_size = 128

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()
  
  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save = True)  
  if n_spus > 1:  stream = parallel(stream)

  #r_bufsize = code.acquire_register()
  #r_lsa = code.acquire_register()
  #r_current = code.acquire_register()
  
  for buffer in stream:
    #util.load_word(code, r_bufsize, buffer_size)
    #code.add(spu.il(r_lsa, 0))

    #loop = code.size()
    
    #code.add(spu.lqx(r_current, buffer, r_lsa))
    #code.add(spu.a(r_current, r_current, r_current))
    #code.add(spu.stqx(r_current, buffer, r_lsa))

    #code.add(spu.ai(r_bufsize, r_bufsize, -16))
    #code.add(spu.ai(r_lsa, r_lsa, 16))
    #code.add(spu.brnz(r_bufsize, loop - code.size()))

    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      #current.v = 5
      code.add(spu.stqx(current, lsa, buffer))
      

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for i in range(0, n):
    assert(a[i] == i + i)
  
  return
예제 #5
0
  spu.ori(code.gp_return, reg, 0)


  spu.il(r_cnt, 0)
  spu.il(r_sum, 16 * 4)

  r_data = prgm.acquire_register()
  r_cmp = prgm.acquire_register()
  r_lsa = prgm.acquire_register()

  spu.il(r_lsa, 0x1000)

  lbl_incloop = prgm.get_label("incloop")
  code.add(lbl_incloop)

  spu.lqx(r_data, r_cnt, r_lsa)
  spu.ai(r_data, r_data, 2)
  spu.stqx(r_data, r_cnt, r_lsa)

  spu.ai(r_cnt, r_cnt, 16)
  spu.ceq(r_cmp, r_cnt, r_sum)
  spu.brz(r_cmp, lbl_incloop)

  dma.spu_write_out_mbox(code, code.r_zero)

  prgm += code

  t3 = time.time()
  id = proc.execute(prgm, async = True, mode = 'int')

예제 #6
0
    reg = dma.spu_read_signal1(code)
    spu.ori(code.gp_return, reg, 0)

    spu.il(r_cnt, 0)
    spu.il(r_sum, 16 * 4)

    r_data = prgm.acquire_register()
    r_cmp = prgm.acquire_register()
    r_lsa = prgm.acquire_register()

    spu.il(r_lsa, 0x1000)

    lbl_incloop = prgm.get_label("incloop")
    code.add(lbl_incloop)

    spu.lqx(r_data, r_cnt, r_lsa)
    spu.ai(r_data, r_data, 2)
    spu.stqx(r_data, r_cnt, r_lsa)

    spu.ai(r_cnt, r_cnt, 16)
    spu.ceq(r_cmp, r_cnt, r_sum)
    spu.brz(r_cmp, lbl_incloop)

    dma.spu_write_out_mbox(code, code.r_zero)

    prgm += code

    t3 = time.time()
    id = proc.execute(prgm, async=True, mode='int')

    t1 = time.time()
예제 #7
0
def TestSPUIter():
  size = 32
  data = extarray.extarray('I', range(size))
  prgm = env.Program()
  code = prgm.get_stream()

  r_ea_data = prgm.acquire_register()
  r_ls_data = prgm.acquire_register()
  r_size    = prgm.acquire_register()
  r_tag     = prgm.acquire_register()  

  #print 'array ea: %X' % (data.buffer_info()[0])
  #print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
  #  str(code.r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))
  
  # Load the effective address
  util.load_word(code, r_ea_data, data.buffer_info()[0])

  # Load the size
  util.load_word(code, r_size, size * 4)

  # Load the tag
  code.add(spu.ai(r_tag, code.r_zero, 12))

  # Load the lsa
  code.add(spu.ai(r_ls_data, code.r_zero, 0))

  # Load the data into address 0
  dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

  # Set the tag bit to 12
  dma.mfc_write_tag_mask(code, 1<<12);

  # Wait for the transfer to complete
  dma.mfc_read_tag_status_all(code);

  # Increment the data values by 1 using an unrolled loop (no branches)
  # r_current = code.acquire_register()
  current = var.SignedWord(0, code)
  
  # Use an SPU iter
  for lsa in syn_iter(code, size * 4, 16):
    code.add(spu.lqx(current, code.r_zero, lsa))
    # code.add(spu.ai(1, r_current, r_current))
    current.v = current + current
    code.add(spu.stqx(current, code.r_zero, lsa))    

  # code.prgm.release_register(r_current)
  #current.release_register(code)
  
  # Store the values back to main memory

  # Load the tag
  code.add(spu.ai(r_tag, code.r_zero, 13))

  # Load the data into address 0
  dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

  # Set the tag bit to 12
  dma.mfc_write_tag_mask(code, 1<<13);

  # Wait for the transfer to complete
  dma.mfc_read_tag_status_all(code);

  # Cleanup
  prgm.release_register(r_ea_data)
  prgm.release_register(r_ls_data)  
  prgm.release_register(r_size)
  prgm.release_register(r_tag)  

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm)

  for i in range(0, size):
    assert(data[i] == i + i)

  return
예제 #8
0
 def load_current(self):
   return self.code.add(spu.lqx(self.r_current, self.r_addr, self.r_count))
예제 #9
0
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
  import time
  # n_spus = 8
  # buffer_size = 16 # 16 ints/buffer
  # n_buffers   = 4  # 4 buffers/spu
  # n_buffers = size / buffer_size
  # size = buffer_size * n_buffers * n_spus
  # data = array.array('I', range(size + 2))

  #data = env.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16)

  code = env.ParallelInstructionStream()
  # code = env.InstructionStream()

  r_zero    = code.acquire_register()
  r_ea_data = code.acquire_register()
  r_ls_data = code.acquire_register()
  r_size    = code.acquire_register()
  r_tag     = code.acquire_register()  

  # Load zero
  util.load_word(code, r_zero, 0)

  # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0]))
  # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % (
  #   r_zero, r_ea_data, r_ls_data, r_size, r_tag)

  # Load the effective address
  if data.buffer_info()[0] % 16 == 0:
    util.load_word(code, r_ea_data, data.buffer_info()[0])
  else: 
    util.load_word(code, r_ea_data, data.buffer_info()[0] + 8)

  ea_start = data.buffer_info()[0]
  # Iterate over each buffer
  for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)):
    # ea = var.SignedWord(code = code, reg = r_ea_data)
  
    # print 'n_iters:', size / buffer_size
    # for i in syn_range(code, size / buffer_size):

    # code.add(spu.stop(0xB))
  
    # Load the size
    util.load_word(code, r_size, buffer_size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1<<12);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Increment the data values by 1 using an unrolled loop (no branches)
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    count = var.SignedWord(0, code)
    # Use an SPU iter
    for lsa in syn_iter(code, buffer_size * 4, 16):
      code.add(spu.lqx(current, r_zero, lsa))
      # code.add(spu.ai(1, r_current, r_current))
      current.v = current + current
      code.add(spu.stqx(current, r_zero, lsa))    
      count.v = count + 1

    code.add(spu.stqx(count, r_zero, 0))
  
    # code.release_register(r_current)
    current.release_registers(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Load the data into address 0
    dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1<<13);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);


    # code.add(spu.stop(0xB))

    # Update ea
    # ea.v = ea + (buffer_size * 4)
  # /for ea address 


  # Cleanup
  code.release_register(r_zero)
  code.release_register(r_ea_data)
  code.release_register(r_ls_data)  
  code.release_register(r_size)
  code.release_register(r_tag)  

  if not run_code:
    return code

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  proc = env.Processor()
  #data.copy_from(data_array.buffer_info()[0], len(data_array))  
  def print_blocks():
    for i in range(0, size, buffer_size):
      # print data[i:(i + buffer_size)]
      print data[i + buffer_size],
    print '' 
  
  # print_blocks()
  s = time.time()
  r = proc.execute(code, n_spus = n_spus)
  # r = proc.execute(code)
  t = time.time() - s
  # print_blocks()

  return t