Пример #1
0
def ParMD5Final(pardigest, parcontext):
  """
  Finalize the batch of MD5 computations held in one parallel context.

  The bit counters are encoded first, then padding and the encoded counters
  are fed through ParMD5Update.  Finally the four per-lane state arrays are
  interleaved (a, b, c, d per lane) and encoded into pardigest.

  pardigest  -- output buffer passed to ParEncode
  parcontext -- context exposing number, count, statea, stateb, statec, stated

  Note: the padding length is derived from parcontext.count[0] only, so every
  lane receives the same amount of padding.
  """
  lanes = parcontext.number

  # Encode the counters before any further ParMD5Update call is made.
  length_bytes = extarray.extarray('B', 8*lanes)
  ParEncode(lanes, length_bytes, parcontext.count, 8)

  # Pad to 56 bytes modulo 64 (always at least one padding byte).
  used = (parcontext.count[0] // 8) % 64
  padLen = (56 - used) if used < 56 else (120 - used)

  # Each lane's padding is a single 0x80 byte followed by zeros.
  padding = extarray.extarray('B', padLen*lanes)
  for lane in range(lanes):
    base = lane*padLen
    padding[base] = 128
    for offset in range(1, padLen):
      padding[base + offset] = 0

  ParMD5Update(parcontext, padding, padLen)
  ParMD5Update(parcontext, length_bytes, 8)

  # Interleave the state words so each lane's a, b, c, d are contiguous.
  components = (parcontext.statea, parcontext.stateb,
                parcontext.statec, parcontext.stated)
  words = extarray.extarray('I', 4*lanes)
  for lane in range(lanes):
    for slot, component in enumerate(components):
      words[lane*4 + slot] = component[lane]

  ParEncode(lanes, pardigest, words, 16)
Пример #2
0
def MD5Final(digest, context):
  PADDING = extarray.extarray('B', 64)
  for i in range(64):
    PADDING[i] = 0
  PADDING[0] = 128
  bits = extarray.extarray('B', 8)
  Encode(bits, context.count, 8)
  index = (context.count[0] // 8) % 64
  if index < 56:
    padLen = 56 - index
  else:
    padLen = 120 - index
  MD5Update(context, PADDING, padLen)
  MD5Update(context, bits, 8)
  print map(hex, context.state)
  Encode(digest, context.state, 16)
Пример #3
0
    def alloc_host(self, typecode, length, comps=1):
        """
        Allocate memory with ptx_exec.alloc_host and wrap it in an extarray.

        typecode -- PTX typecode: a kind letter ('f', 'u' or 's') followed by
                    the scalar width in bits (u32, s32, f32, u64, ...)
        length   -- number of elements
        comps    -- number of components per element (default 1)

        Returns the extarray with gpu_mem_handle and gpu_width attributes set.

        Raises Exception("Unable to convert type") when the typecode has no
        array equivalent, and ValueError when the width is not an integer.
        """
        # (kind, bit width) -> array typecode.  Unsigned 8-bit is 'B' and
        # signed 8-bit is 'b', matching the array module's typecodes.
        type_conversion_table = {
            ("f", 32): "f", ("f", 64): "d",
            ("u", 8): "B", ("u", 16): "H", ("u", 32): "I", ("u", 64): "L",
            ("s", 8): "b", ("s", 16): "h", ("s", 32): "i", ("s", 64): "l",
        }

        scalar_width = int(typecode[1:])
        array_typecode = type_conversion_table.get(
            (typecode[0], scalar_width), "")

        if array_typecode == "":
            raise Exception("Unable to convert type")

        # Bytes per scalar; this name was used below but never defined.
        scalar_byte_width = scalar_width // 8

        mem = ptx_exec.alloc_host(length * scalar_byte_width * comps)

        arr = extarray.extarray(array_typecode, 0)
        # NOTE(review): data_len is kept exactly as originally computed; it
        # is not obviously the number of allocated elements -- confirm
        # against what extarray.set_memory expects.
        arr.data_len = scalar_width // 4 * length * comps
        arr.set_memory(mem, arr.data_len * 4)
        arr.gpu_mem_handle = mem
        arr.gpu_width = length
        return arr
Пример #4
0
def ParallelTests():
  max_exp = 16
  max_size = pow(2, max_exp)
  print 'Creating data...'
  data = extarray.extarray('I', range(max_size))
  
  print 'Executing Tests...'
  # t = TestSPUParallelIter(data, 8192, n_spus = 1, buffer_size = 128)
  # return 

  i = 0
  for exponent in range(13, max_exp + 1):
    size = pow(2, exponent)
    for n_spus in [1, 2, 4]:

      # Increase the buffer size until to the largest possible factor for the
      # number of SPUs or 4096 (*4=16k), whichever is smaller
      for buffer_exp in range(2, min(exponent - LOG[n_spus] - 2, 12)):
        buffer_size = pow(2, buffer_exp)
        # for buffer_size in [4]:
        t = 0.0
        print 'try\t%d\t%d\t%d\t-.-' % (size, n_spus, buffer_size)
        # for i in range(10):
        t += TestSPUParallelIter(data, size, n_spus = n_spus, buffer_size = buffer_size)
        
        print 'test\t%d\t%d\t%d\t%.8f' % (size, n_spus, buffer_size, t / 10.0)
        # print 'count:', i
        i += 1
  return
Пример #5
0
def TestParams():
    import time

    # SIZE = 1024
    kernel = """
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  """

    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    a = 1.0
    b = 2.0

    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    # mem.set_memory(ptx_mem_addr, 4)
    mem[0] = 5.0

    print ptx_mem_addr, type(ptx_mem_addr)
    print mem.buffer_info()[0], type(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    # image, dev num, (x, y, w, h)

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "X", mem.buffer_info()[0], ptx_mem_addr
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem

    # ptx_exec.free(input)
    # ptx_exec.free(output)
    ##ptx_exec.free(glob)
    # ptx_exec.unload_module(image)
    return
Пример #6
0
def TestVecIter(n_spus = 1):
  """
  Double every element of a 1024-word array with spu_vec_iter.

  The array is streamed through stream_buffer (wrapped in parallel() when
  more than one SPU is requested), the program is executed, and each element
  is asserted to equal twice its index.
  """
  n = 1024
  buffer_size = 16
  a = extarray.extarray('I', range(n))

  if n_spus > 1:
    prgm = env.ParallelProgram()
  else:
    prgm = env.Program()
  code = prgm.get_stream()

  # Created before the stream as in the original ordering; the name is
  # rebound by the inner loop below.
  current = var.SignedWord(0, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0, save = True)
  if n_spus > 1:
    stream = parallel(stream)

  md = memory_desc('i', 0, buffer_size)

  for buffer in stream:
    for current in spu_vec_iter(code, md):
      current.v = current + current

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for idx in range(n):
    assert(a[idx] == idx + idx)

  return
Пример #7
0
def TestStreamBufferDouble(n_spus = 1):
  """
  Double every element of a 2048-word array using a double-buffered stream.

  Each buffer is walked in 16-byte steps with syn_iter; every quadword is
  loaded, added to itself and stored back.  After execution each element is
  asserted to equal twice its index.
  """
  n = 2048
  buffer_size = 32
  a = extarray.extarray('I', range(n))

  if n_spus > 1:
    prgm = env.ParallelProgram()
  else:
    prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  n_bytes = n * 4

  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                         buffer_mode='double', save = True)
  if n_spus > 1:
    stream = parallel(stream)

  for buffer in stream:
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buffer))

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for idx in range(len(a)):
    assert(a[idx] == idx + idx)

  return
Пример #8
0
def load_double(code, reg, val):
  """
  Emit code that loads the double `val` into `reg` by way of the stack.

  The double's bit pattern is pushed, shuffled into `reg` with pshufd
  (immediate 0x44) from the top of the stack, and the stack pointer is then
  advanced by 8.  Returns the result of the final code.add() call.

  `reg` is expected to be an mmx or xmm register; this is not checked.
  """
  # Reinterpret the double's storage as an integer to get an immediate.
  bits = extarray.extarray('d', (val,))
  bits.change_type('L')
  immediate = bits[0]

  code.add(x86.push(immediate))

  stack_top = mem.MemRef(regs.rsp, data_size = 128)
  code.add(x86.pshufd(reg, stack_top, 0x44))

  release_slot = x86.add(regs.rsp, 8)
  return code.add(release_slot)
Пример #9
0
def get_asm_glue(dest):
    """
    Build an executable machine-code trampoline that calls `dest`.

    dest -- object castable to ctypes.c_void_p (e.g. a ctypes function
            pointer); a reference to it is kept on the returned array.

    Returns an extarray of bytes that has been passed to make_executable().

    The pointer is encoded with the explicit "<Q" format because the
    leading opcode takes an 8-byte little-endian immediate.  The native "l"
    format used before is only 4 bytes on some platforms (making the "8B"
    unpack fail) and is signed (rejecting addresses >= 2**63).
    """
    address = ctypes.cast(dest, ctypes.c_void_p).value

    # 0x48 0xB8 is `mov rax, imm64`; the destination address follows.
    glue = [72, 184]
    glue.extend(struct.unpack("8B", struct.pack("<Q", address)))

    # Remaining bytes, decoded by hand (verify with a disassembler): save
    # r12, align rsp to 16 bytes, call rax, restore rsp/r12, then pop a
    # value into rax, subtract 13 and jump to it.
    glue.extend([73, 84, 73, 137, 228, 72, 131, 228, 240, 72, 255, 208, 76, 137, 228, 73, 92, 72, 88, 72, 131, 232, 13, 72, 255, 224])

    glue = extarray('B', glue)
    make_executable(*glue.buffer_info())

    # Keep the callee referenced for as long as the glue exists.
    glue.references.append(dest)
    return glue
Пример #10
0
def load_float(code, reg, val, clear = False):
  """
  Emit code that loads the float `val` into `reg` by way of the stack.

  The float's bit pattern is pushed, shuffled into `reg` with pshufd
  (immediate 0) from the top of the stack, and the stack pointer is then
  advanced by 8.  Returns the result of the final code.add() call.

  `reg` is expected to be an mmx or xmm register; this is not checked.
  The `clear` argument is accepted but not used.
  """
  # Reinterpret the float's storage as an integer to get an immediate.
  bits = extarray.extarray('f', (val,))
  bits.change_type('I')
  immediate = bits[0]

  code.add(x86.push(immediate))

  stack_top = mem.MemRef(regs.rsp, data_size = 128)
  code.add(x86.pshufd(reg, stack_top, 0))

  release_slot = x86.add(regs.rsp, 8)
  return code.add(release_slot)
Пример #11
0
  def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size):

    # Setup the range parameter array
    r1_inc = (r1_range[1] - r1_range[0]) / size[0]
    r2_inc = (r2_range[1] - r2_range[0]) / size[1]

    ranges = extarray.extarray('f', [0.0] * 16)
    for i in range(4):
      ranges[i]      = r1_range[0]
      ranges[4 + i]  = r2_range[0]
      ranges[8 + i]  = r1_inc
      ranges[12 + i] = r2_inc

    # Setup the pattern vector
    bits = _pattern2vector(pattern)

    # Copy the paramters to aligned buffers
    #a_ranges = synspu.aligned_memory(len(ranges), typecode='I')
    #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges))

    #a_pattern = synspu.aligned_memory(len(bits), typecode='I')
    #a_pattern.copy_to(bits.buffer_info()[0], len(bits))

    renderer = MailboxRenderer()
    ly_block = LyapunovBlock()

    ly_block.set_size(size[0], size[1])
    #ly_block.set_range(a_ranges)
    #ly_block.set_pattern(a_pattern)
    ly_block.set_range(ranges)
    ly_block.set_pattern(bits)
    ly_block.set_max_init(max_init)
    ly_block.set_max_n(max_n)
    ly_block.set_renderer(renderer)

    code = synspu.InstructionStream()
    ly_block.synthesize(code)

    proc = synspu.Processor()

    spe_id = proc.execute(code, async=True)

    for i in range(size[0] * size[1]):
      while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
      print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    # for x in range(size[0]):
    #   r2 = r2_range[0] + r2_inc
    #   print 'col:', x, r1, r2
    
    #   for y in range(size[1]):
    #     results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n)
    #     r2 += r2_inc      
    #   r1 += r1_inc      

    return
Пример #12
0
  def _set_literal_value(self, value):
    if type(value) is _array_type:
      if value.typecode not in self.array_typecodes:
        raise Exception("Array typecode '%s' is not supported" % (value.typecode,))

      if len(value) < INT_ARRAY_SIZES[self.array_typecode]:
        print 'Warning: Variable array initializer has fewer elements than the corresponding vector: %d < %d' % (
          len(value), INT_ARRAY_SIZES[self.array_typecode])
      util.load_vector(self.code, self.reg, value.buffer_info()[0])
      self.storage = value

    elif type(value) in (list, tuple):
      if len(value) < INT_ARRAY_SIZES[self.array_typecode]:
        print 'Warning: Variable %s initializer has fewer elements than the corresponding vector: %d < %d' % (
          type(value), len(value), INT_ARRAY_SIZES[self.array_typecode])
      
      storage = extarray.extarray(self.array_typecode, value)
      util.load_vector(self.code, self.reg, storage.buffer_info()[0])
      self.storage = storage
      
    elif type(value) in self.literal_types:
      if (value & 0x1F) == value and isinstance(self, (SignedByteType, SignedHalfwordType, SignedWordType)):
        # Use the splat instructions
        if isinstance(self, SignedByteType):
          self.code.add(vmx.vspltisb(self.reg, value))
        elif isinstance(self, SignedHalfwordType):
          self.code.add(vmx.vspltish(self.reg, value))
        elif isinstance(self, SignedWordType):
          self.code.add(vmx.vspltisw(self.reg, value))
        else:
          raise Exception('Unsupported typecode for vector literal splat: ' + str(type(self)))
      else:
        splat = [self.value for i in xrange(INT_ARRAY_SIZES[self.array_typecode])]
        vsplat = extarray.extarray(self.array_typecode, splat)

        util.load_vector(self.code, self.reg, vsplat.buffer_info()[0])
        self.code.prgm.add_storage(vsplat)
        self.storage = vsplat
        
    self.value = value

    if self.storage is not None:
      self.code.prgm.add_storage(self.storage)

    return
Пример #13
0
def TestParamsFull():
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()

    a = 1.0
    b = 2.0

    ptx_mem_addr = proc.alloc_device("f32", 1)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    param_list = [ptx_mem_addr.address, a, b]

    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    for i in range(20):
        t1 = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t2 = time.time()
        print "run time", t2 - t1
        print "#####"
    print "X", mem.buffer_info()[0], ptx_mem_addr.address
    proc.copy(mem, ptx_mem_addr)

    print param_list
    print mem

    return
Пример #14
0
def MD5(s):
  digest = extarray.extarray('B', 16)
  length = len(s)
  context = Context()

  MD5Init(context)
  MD5Update(context, s, length)
  MD5Final(digest, context)

  print map(hex, map(int, digest))
Пример #15
0
def py_nbody():
  """
  (Re)create the module-level body arrays x, y, vx, vy and m.

  Each array holds N_BODIES floats.  Positions are drawn uniformly from
  [-1, 1], velocities start at zero, and masses are drawn uniformly from
  [1e9, 1e10].
  """
  global x, y, vx, vy, m

  x = extarray.extarray('f', N_BODIES)
  y = extarray.extarray('f', N_BODIES)
  vx = extarray.extarray('f', N_BODIES)
  vy = extarray.extarray('f', N_BODIES)
  m = extarray.extarray('f', N_BODIES)

  body = 0
  while body < N_BODIES:
    # Draw order (x, y, mass) is kept so a seeded RNG gives the same bodies.
    x[body] = random.uniform(-1.0, 1.0)
    y[body] = random.uniform(-1.0, 1.0)
    vx[body] = 0.0
    vy[body] = 0.0
    m[body] = random.uniform(1e9, 1e10)
    body += 1

  return
Пример #16
0
  def join(self, ti):
    """
    Wait for the stream on `ti` to finish, then free its context.

    Returns element 4 of the SPU register dump as an int (MODE_INT) or a
    float (MODE_FP), or None for any other mode.  When ti.stop is non-zero
    the return value is the tuple (value, spu_exec.get_result(ti)).

    Raises Exception when `ti` is not a spu_exec.ThreadInfo.
    """
    if not isinstance(ti, spu_exec.ThreadInfo):
      raise Exception('Invalid thread handle: ' + str(ti))

    spu_exec.wait_stream(ti)

    # (mode, register-dump typecode, converter), checked in order.
    readers = ((self.MODE_INT, 'I', int), (self.MODE_FP, 'f', float))

    retval = None
    for mode, typecode, convert in readers:
      if ti.mode == mode:
        regs = extarray.extarray(typecode, 128 * 4)
        spu_exec.get_spu_registers(ti, regs.buffer_info()[0])
        retval = convert(regs[4])
        break

    if ti.stop != 0:
      retval = (retval, spu_exec.get_result(ti))

    spu_exec.free_context(ti)
    return retval
Пример #17
0
    def join(self, ti):
        """
        Wait for the stream on context `ti` to finish, then free the context.

        Returns element 4 of the SPU register dump as an int (MODE_INT) or a
        float (MODE_FP), or None for any other mode.  When ti.stop is
        non-zero the return value is (value, spu_exec.get_result(ti)).

        Raises TypeError when `ti` is not a spu_exec.Context.
        """
        if not isinstance(ti, spu_exec.Context):
            raise TypeError("Invalid context: " + str(ti))

        spu_exec.wait_stream(ti)

        # Choose how the register dump is typed and converted.
        if ti.mode == self.MODE_INT:
            typecode, convert = "I", int
        elif ti.mode == self.MODE_FP:
            typecode, convert = "f", float
        else:
            typecode, convert = None, None

        retval = None
        if typecode is not None:
            regs = extarray.extarray(typecode, 128 * 4)
            spu_exec.get_spu_registers(ti, regs.buffer_info()[0])
            retval = convert(regs[4])

        if ti.stop != 0:
            retval = (retval, spu_exec.get_result(ti))

        spu_exec.free_context(ti)
        return retval
Пример #18
0
def DoubleBufferExample(n_spus = 6):
  """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  This example subtracts 2 from every element of a 30000-word array and
  prints a 'DoubleBuffer error:' line for each element (from index 2 on)
  that does not hold the expected value.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
  n = 30000
  buffer_size = 16

  # Create an array and align the data
  a = extarray.extarray('I', range(n))

  addr = a.buffer_info()[0]
  n_bytes = n * 4

  if n_spus > 1:  code = env.ParallelInstructionStream()
  else:           code = env.InstructionStream()

  current = SignedWord(0, code)
  two = SignedWord(2, code)

  # Create the stream buffer, parallelizing it if using more than 1 SPU
  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  # Loop over the buffers
  for buffer in stream:

    # Create an iterator that computes the address offsets within the
    # buffer.  Note: this will be supported by var/vec iters soon.
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current - two
      code.add(spu.stqx(current, lsa, buffer))

  # Run the synthetic program and copy the results back to the array
  proc = env.Processor()
  r = proc.execute(code, n_spus = n_spus)

  # Compare explicitly instead of assert inside a bare except: an assert
  # disappears under `python -O`, and a bare except also hides unrelated
  # errors.  Indices 0 and 1 are skipped, as before.
  for i in range(2, len(a)):
    if a[i] != i - 2:
      print 'DoubleBuffer error:', a[i], i - 2

  return
Пример #19
0
 def __init__(self, number):
   """
   Allocate the arrays for `number` contexts processed side by side.

   Creates statea..stated (one 'I' element per context), count (two 'I'
   elements per context) and buffer (64 'B' elements per context).
   """
   self.number = number

   # Four state arrays, created in a-b-c-d order.
   for attr in ('statea', 'stateb', 'statec', 'stated'):
     setattr(self, attr, extarray.extarray('I', number))

   self.count = extarray.extarray('I', 2*number)
   self.buffer = extarray.extarray('B', 64*number)
Пример #20
0
  def setup(self, code):
    """
    Create the variables this object uses during synthesis.

    Requires self.addr and self._stride to have been set; raises Exception
    otherwise.  The `code` argument is not used here.
    """
    if self.addr is None:
      raise Exception('Please set addr')
    if self._stride is None:
      raise Exception('Please set stride')

    self.x_offset = var.Word(0)
    self.y_offset = var.Word(self.addr)
    self.stride = var.Word(self._stride * 4)

    # Mask to extract the lowest 2 bytes from each word in the first vector
    # into RGB and the first byte from the second vector into A
    shuffle_mask = extarray.extarray('I', [0x01030303, 0x10070707, 0x100B0B0B, 0x100F0F0F])
    self.uint2rgba = var.Word(shuffle_mask)
    self.ff = var.Word(0xFF000000)

    return
Пример #21
0
 def __init__(self):
   """
   Create the instruction stream and register buffer, then synthesize.

   The runtime handles (speid, reg_lsa, proc) start out as None.
   """
   # Code and memory buffers
   self.code = env.InstructionStream()
   register_file = extarray.extarray('I', 128 * 4)
   self.regs = register_file
   register_file.clear()

   # Runtime parameters, not yet known
   self.speid = None
   self.reg_lsa = None
   self.proc = None

   self.synthesize()

   return
Пример #22
0
def TestStreamBufferSingle(n_spus = 1):
  """
  Double every element of a 1024-word array using a single-buffered stream.

  Each 128-byte buffer is walked in 16-byte steps with syn_iter; every
  quadword is loaded, added to itself and stored back.  After execution each
  element is asserted to equal twice its index.
  """
  n = 1024
  buffer_size = 128
  a = extarray.extarray('I', range(n))

  if n_spus > 1:
    prgm = env.ParallelProgram()
  else:
    prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save = True)
  if n_spus > 1:
    stream = parallel(stream)

  for buffer in stream:
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buffer))

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for idx in range(n):
    assert(a[idx] == idx + idx)

  return
Пример #23
0
def TestVecIter():
  """
  Double 16-element arrays of every supported element type with vector_iter.

  For each typecode the array is iterated with vector_iter and every vector
  is added to itself using the matching VMX add instruction.  After
  execution every array is checked against [0, 2, ..., 30].
  """
  prgm = synppc.Program()
  code = prgm.get_stream()
  prgm.add(code)
  ppc.set_active_code(code)

  # (typecode, add instruction), processed in this order.
  integer_cases = (
    ('I', vmx.vadduws),
    ('i', vmx.vaddsws),
    ('H', vmx.vadduhs),
    ('h', vmx.vaddshs),
    ('B', vmx.vaddubs),
    ('b', vmx.vaddsbs),
  )

  arrays = []
  for typecode, add_inst in integer_cases:
    values = extarray.extarray(typecode, range(16))
    for vec in vector_iter(code, values):
      vec.v = add_inst.ex(vec, vec)
    arrays.append(values)

  # Created between the integer and float cases, as in the original order.
  ften = vmx_vars.BitType(10.0)
  f = extarray.extarray('f', range(16))
  for vec in vector_iter(code, f):
    vec.v = vmx.vaddfp.ex(vec, vec)
  arrays.append(f)

  proc = synppc.Processor()
  r = proc.execute(prgm)

  expected = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]

  for values in arrays:
    _array_check(values, expected)

  return
Пример #24
0
  def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
    """
    Allocate remote memory with cal_exec and wrap it in an extarray.

    typecode -- 'f', 'i' or 'I'
    comps    -- components per element: 1, 2 or 4
    width    -- forwarded to cal_exec.alloc_remote; stored as gpu_width
    height   -- forwarded to cal_exec.alloc_remote (default 1)
    globl    -- when true, cal_exec.GLOBAL_BUFFER is passed as the flag

    Returns the extarray with gpu_mem_handle, gpu_device, gpu_width and
    gpu_pitch attributes set.

    Raises Exception for an unsupported typecode or component count; the
    typecode is checked first.
    """
    # Typecode -> middle part of the cal_exec.FMT_* constant name.
    families = (('f', 'FLOAT32'), ('i', 'SIGNED_INT32'), ('I', 'UNSIGNED_INT32'))

    matches = [family for key, family in families if typecode == key]
    if not matches:
      raise Exception("Unsupported data type: " + str(typecode))

    counts = [count for count in (1, 2, 4) if comps == count]
    if not counts:
      raise Exception("Number of components must be 1, 2, or 4")

    fmt = getattr(cal_exec, 'FMT_%s_%d' % (matches[0], counts[0]))

    if globl:
      globl = cal_exec.GLOBAL_BUFFER

    # Allocate and initialize the memory
    # TODO - more operand error checking
    mem = cal_exec.alloc_remote(self.device, fmt, width, height, globl)
    arr = extarray.extarray(typecode, 0)

    pitch = mem[1]
    arr.data_len = pitch * height * comps
    arr.set_memory(mem[0], arr.data_len * 4)
    arr.gpu_mem_handle = mem
    arr.gpu_device = self.device
    arr.gpu_width = width
    arr.gpu_pitch = pitch
    return arr
Пример #25
0
def _pattern2vector(pattern):
  """
  Encode a string of 1's and 0's into a 128-bit bit vector.

  The pattern is repeated to fill 128 positions and packed into four 32-bit
  words, most significant bit first within each word.

  Raises Exception when the pattern length does not divide 128.
  """
  if 128 % len(pattern) != 0: raise Exception('Pattern length must be a factor of 128')

  tiled = pattern * (128 // len(pattern))
  word_bits = 128 // 4

  bv = extarray.extarray('I', [0,0,0,0])

  for word in range(4):
    chunk = tiled[word_bits * word : word_bits * (word + 1)]
    for pos, symbol in enumerate(chunk):
      if int(symbol) == 1:
        bv[word] = bv[word] | (1 << (word_bits - pos - 1))

  return bv
Пример #26
0
def TestContinueLabel(n_spus = 1):
  """
  Exercise the continue label of spu_vec_iter.

  Every vector is doubled, compared against four, and the loop branches to
  the iterator's continue label when the gathered comparison bits are zero;
  otherwise the vector is doubled a second time.  After execution, elements
  with index >= 4 are asserted to be doubled and the first four quadrupled.
  """
  n = 1024
  buffer_size = 16
  a = extarray.extarray('I', range(n))

  if n_spus > 1:
    prgm = env.ParallelProgram()
  else:
    prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)
  test    = var.SignedWord(0, code)
  four    = var.SignedWord(4, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0, save = True)
  if n_spus > 1:
    stream = parallel(stream)

  md = memory_desc('i', 0, buffer_size)
  lsa_iter = spu_vec_iter(code, md)

  for buffer in stream:
    for current in lsa_iter:
      current.v = current + current

      # Branch to the continue label when no element compared equal.
      test.v = (current == four)
      code.add(spu.gbb(test, test))
      code.add(spu.brz(test.reg, lsa_iter.continue_label))
      current.v = current + current

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for idx in range(n):
    if idx < 4:
      assert(a[idx] == idx * 4)
    else:
      assert(a[idx] == idx + idx)
  return
Пример #27
0
def TestSynIterDec():
  import corepy.arch.ptx.isa as ptx
  import corepy.arch.ptx.types.registers as regs

  SIZE = 64

  proc = env.Processor(0)

  # build and run the kernel
  prgm = env.Program()
  code = prgm.get_stream()

  _mem = prgm.add_parameter('u64', name='_mem')

  memp = prgm.acquire_register('u64')
  counter = prgm.acquire_register('u32')
  code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
  code.add(ptx.mov(counter, 0))
  for i in syn_iter(code, 5, step=1, mode=DEC):
    code.add(ptx.add(counter, counter, 1))
  code.add(ptx.st('global', regs.ptxAddress(memp), counter))
  prgm.add(code)

  ptx_mem_addr = proc.alloc_device('u32', 1)
  mem = extarray.extarray('I', 1)
  mem[0] = 5

  param_list = [ptx_mem_addr.address,]

  proc.copy(ptx_mem_addr, mem)
  prgm.cache_code()
  print prgm.render_string
  proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
  proc.copy(mem, ptx_mem_addr)

  print mem

  #passed = True
  #for i in xrange(0, SIZE):
  #  if ext_output[i] != 5:
  #    passed = False
  #print "Passed == ", passed

  return
Пример #28
0
  def __init__(self, app, parent, id, style, size = (-1, -1)):
    """
    Initialise the list control and its text-edit mixin.

    app    -- stored as self.app
    parent -- forwarded to wx.ListCtrl
    id     -- forwarded to wx.ListCtrl
    style  -- forwarded to wx.ListCtrl
    size   -- forwarded to wx.ListCtrl (default (-1, -1))
    """
    wx.ListCtrl.__init__(self, parent, id, size = size, style = style)
    listmix.TextEditMixin.__init__(self)

    # Item attribute carrying an 11-point teletype font.
    item_attr = wx.ListItemAttr()
    teletype = wx.Font(11, wx.FONTFAMILY_TELETYPE,
                       wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL)
    item_attr.SetFont(teletype)
    self.attr = item_attr

    self.Bind(wx.EVT_LIST_BEGIN_LABEL_EDIT, self.OnBeginEdit)

    self.app = app
    self._cur_regs = extarray.extarray('I', 128 * 4)
    return
Пример #29
0
  def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
    """
    Allocate an ExtArray backed by remote (main) memory.

    typecode -- element typecode, resolved to a format by self._get_fmt
    comps    -- components per element
    width    -- forwarded to cal_exec.alloc_remote; stored as gpu_width
    height   -- forwarded to cal_exec.alloc_remote; stored as gpu_height
    globl    -- when true, cal_exec.GLOBAL_BUFFER is passed as the flag

    Returns the extarray with the gpu_* attributes set.
    """
    fmt = self._get_fmt(typecode, comps)

    flag = globl
    if flag:
      flag = cal_exec.GLOBAL_BUFFER

    # Allocate and initialize the memory
    # TODO - more operand error checking
    mem = cal_exec.alloc_remote(self.device, fmt, width, height, flag)
    arr = extarray.extarray(typecode, 0)

    # mem[1] is handed to set_memory; mem[2] is used as the pitch.
    pitch = mem[2]
    arr.data_len = pitch * height * comps
    arr.set_memory(mem[1], arr.data_len * 4)

    arr.gpu_mem_handle = mem
    arr.gpu_device = self.device
    arr.gpu_width = width
    arr.gpu_pitch = pitch
    arr.gpu_height = height
    return arr
Пример #30
0
  def _startSPU(self):
    self.ctx = ctx = env.spu_exec.alloc_context()

    # Execute a no-op instruction stream so the prolog is executed
    prgm = env.Program()
    code = prgm.get_stream()
    code.add(spu.nop(code.r_zero))

    prgm.cache_code()
    itemsize = prgm.render_code.itemsize 
    code_len = len(prgm.render_code) * itemsize
    if code_len % 16 != 0:
      code_len += 16 - (code_len % 16)
    code_lsa = 0x40000 - code_len

    env.spu_exec.run_stream(ctx, prgm.inst_addr(), code_len, code_lsa, code_lsa)

    self.localstore = extarray.extarray('I', 262144 / 4)
    print "spuls %x" % (ctx.spuls), ctx.spuls, type(ctx.spuls)
    self.localstore.set_memory(ctx.spuls, 262144)
    return
Пример #31
0
    def generate(self, results, pattern, r1_range, r2_range, max_init, max_n,
                 size):

        # Setup the range parameter array
        r1_inc = (r1_range[1] - r1_range[0]) / size[0]
        r2_inc = (r2_range[1] - r2_range[0]) / size[1]

        ranges = extarray.extarray('f', [0.0] * 16)
        for i in range(4):
            ranges[i] = r1_range[0]
            ranges[4 + i] = r2_range[0]
            ranges[8 + i] = r1_inc
            ranges[12 + i] = r2_inc

        # Setup the pattern vector
        bits = _pattern2vector(pattern)

        # Copy the paramters to aligned buffers
        #a_ranges = synspu.aligned_memory(len(ranges), typecode='I')
        #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges))

        #a_pattern = synspu.aligned_memory(len(bits), typecode='I')
        #a_pattern.copy_to(bits.buffer_info()[0], len(bits))

        renderer = MailboxRenderer()
        ly_block = LyapunovBlock()

        ly_block.set_size(size[0], size[1])
        #ly_block.set_range(a_ranges)
        #ly_block.set_pattern(a_pattern)
        ly_block.set_range(ranges)
        ly_block.set_pattern(bits)
        ly_block.set_max_init(max_init)
        ly_block.set_max_n(max_n)
        ly_block.set_renderer(renderer)

        prgm = synspu.Program()
        code = prgm.get_stream()
        prgm += code

        ly_block.synthesize(code)

        proc = synspu.Processor()

        spe_id = proc.execute(prgm, async=True)

        for i in range(size[0] * size[1]):
            while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
                pass
            print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        proc.join(spe_id)

        # for x in range(size[0]):
        #   r2 = r2_range[0] + r2_inc
        #   print 'col:', x, r1, r2

        #   for y in range(size[1]):
        #     results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n)
        #     r2 += r2_inc
        #   r1 += r1_inc

        return
Пример #32
0
    def dump_regs(self):
        """
        Collect 128 register values from the stopped SPU via its mailbox.

        A small save program (one wrch + stop pair per register, followed
        by a stop and a branch back to self.debug_lsa) is transferred to
        local store address 0, the debug branch is redirected to it, and
        the SPU is resumed once per register while the values are read from
        the outbound mailbox.  The debug branch is then restored and the
        SPU resumed until the next debug stop.

        Returns the list of 128 values read from the mailbox.

        Note: the local store contents overwritten by the save program are
        not saved or restored (see the TODO markers below).
        """
        mbox = 28  # write out mbox channel

        # Pseudo-code:
        #  1) Build the save code (as an array, not an instruction stream)
        save_size = 128 * 2 + 4
        save_code = extarray.extarray('I', range(save_size))

        # One (wrch, stop) pair per register: write register i/2 to the
        # mailbox channel, then stop so the host can read it.
        for i in range(0, 128 * 2, 2):
            save_code[i] = spu.wrch(i / 2, mbox, ignore_active=True).render()
            save_code[i + 1] = spu.stop(0x6, ignore_active=True).render()

        # branch back to the debug stop
        save_code[128 * 2] = spu.stop(0x7, ignore_active=True).render()
        ret = spu.bra(self.debug_lsa, ignore_active=True)
        save_code[128 * 2 + 1] = ret.render()

        #aligned_save_code = aligned_memory(save_size, typecode = 'I')
        #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code))

        #  2) Save lsa[0:len(save_code)]
        # TODO: do this with putb

        #  3) Push save code to lsa[0:] and wait for the transfer tag
        tag = 2
        spu_exec.spu_getb(self.spe_id, 0,
                          save_code.buffer_info()[0], save_size * 4, tag, 0, 0)
        spu_exec.read_tag_status_all(self.spe_id, 1 << tag)

        #  4) Replace the debug branch with a branch to 0
        self.replace(self.debug_branch, spu.bra(0, ignore_active=True))
        self.get_instructions()

        #  5) Resume
        self.resume(self.spe_id)

        #  6) Read the register values and send the ok signal
        regs = []
        for i in range(128):
            # Busy-wait until the SPU has written a mailbox entry.
            while spu_exec.stat_out_mbox(self.spe_id) == 0:
                pass
            value = spu_exec.read_out_mbox(self.spe_id)
            regs.append(value)

            # Wait for the stop that follows each wrch, then continue.
            r = spu_exec.wait_stop_event(self.spe_id)
            self.resume(self.spe_id)

        r = spu_exec.wait_stop_event(self.spe_id)
        print 'next stop', r
        #  7) Restore code at original pc
        self.restore(self.debug_branch)
        self.get_instructions()

        #  8) Restore lsa[0:len(save_code)]
        # TODO: do this with putb

        #  9) Resume
        # self.resume(self.spe_id)
        # r = spu_exec.wait_stop_event(self.spe_id)
        self.resume(self.spe_id)
        r = self.wait_debug()

        return regs
Пример #33
0
    def _synthesize_prologue(self):
        """
        Create the prologue. (see PPC ABI p41)

        This manages the register preservation requirements from the ABI.
        self._prologue is rebuilt from scratch, starting with the prologue
        label.  For every used register that appears in the corresponding
        save list (gp_save, fp_save, vx_save) a store into a freshly
        allocated save array is appended.  When VMX is enabled and VMX
        registers are saved, VRSAVE is copied into self._vrsave and then
        set to 0xFFFFFFFF.

        TODO: CR2-4 need to be preserved.
        """

        # Reset the prologue
        self._prologue = [self.lbl_prologue]

        # Add the instructions to save the registers

        r_addr = GPRegister(13)  # Only available volatile register
        r_idx = GPRegister(14)  # Non-volatile; safe to use before restoring

        # TODO - AWF - don't want to push things on the stack, that changes the
        # relative location of the passed-in arguments
        # However, we could just use the stack to save all the registers, and use
        # a frame pointer to give access to the arguments

        save_gp = [r for r in self._used_registers[GPRegister] if r in gp_save]

        if len(save_gp) > 0:
            # Save GP registers: one word-sized slot per register; the
            # array's address is loaded into r_addr as the store base.
            self._saved_gp_registers = array.array('I', xrange(len(save_gp)))

            self._load_word(self._prologue, r_addr,
                            self._saved_gp_registers.buffer_info()[0])

            for i, reg in enumerate(save_gp):
                self._prologue.append(
                    ppc.stw(reg, r_addr, i * WORD_SIZE, ignore_active=True))

        save_fp = [r for r in self._used_registers[FPRegister] if r in fp_save]

        if len(save_fp) > 0:
            # Save FP registers: one double-sized slot per register, hence
            # the offset stride of two words.
            self._saved_fp_registers = array.array('d', xrange(len(save_fp)))

            self._load_word(self._prologue, r_addr,
                            self._saved_fp_registers.buffer_info()[0])

            for i, reg in enumerate(save_fp):
                self._prologue.append(
                    ppc.stfd(reg,
                             r_addr,
                             i * WORD_SIZE * 2,
                             ignore_active=True))

        if self._enable_vmx:
            save_vx = [
                r for r in self._used_registers[VMXRegister] if r in vx_save
            ]

            if len(save_vx) > 0:
                # Save VMX registers: four words per register.
                self._saved_vx_registers = extarray.extarray(
                    'I', range(len(save_vx) * 4))

                self._load_word(self._prologue, r_addr,
                                self._saved_vx_registers.buffer_info()[0])

                # stvx takes its offset in a register, so the offset is
                # loaded into r_idx before each store.
                for i, reg in enumerate(save_vx):
                    self._load_word(self._prologue, r_idx, i * WORD_SIZE * 4)
                    self._prologue.append(
                        vmx.stvx(reg, r_idx, r_addr, ignore_active=True))

                # Set up VRSAVE
                # Currently, we save the old value of VRSAVE in r31.
                # On the G4, someone stomps on registers < 20 ... save them all for now.

                # Save vrsave and put our value in it
                self._prologue.append(
                    ppc.mfvrsave(self._vrsave, ignore_active=True))
                self._load_word(self._prologue, r_addr, 0xFFFFFFFF)
                self._prologue.append(ppc.mtvrsave(r_addr, ignore_active=True))

        return
Пример #34
0
def TestLiterals():
    """Check VMX variable types initialised from scalar and array literals.

    Each unsigned variable is combined with a signed partner so that the
    result is expected to be zero: first by adding a value to its negation
    (scalar initialisers), then by subtracting an identical value (array
    initialisers).  All partial results are OR-ed into ``target``, which
    is stored to a host array and asserted to be all zeros after execution.
    """
    import corepy.arch.ppc.platform as env
    prgm = env.Program()
    code = prgm.get_stream()
    prgm += code
    proc = env.Processor()

    # The variable types below emit into whatever stream is active.
    ppc.set_active_code(code)
    vmx.set_active_code(code)

    # NOTE(review): `zero` is never read afterwards; it may only exist to
    # exercise Bits.cast -- confirm before removing.
    zero = Bits.cast(SignedByte(0))

    # Accumulator for the OR of every partial result.
    target = Bits()

    # Signed versions use splat, unsigned arrays
    b = Byte(2)
    sb = SignedByte(-2)
    vmx.vaddsbs(b, b, sb)

    h = Halfword(9999)
    sh = SignedHalfword(-9999)
    vmx.vaddshs(h, h, sh)

    w = Word(99999)
    sw = SignedWord(-99999)
    vmx.vaddsws(w, w, sw)

    # Combine the results (should be [0,0,0,0])
    vmx.vor(target, b, h)
    vmx.vor(target, target, w)

    # Array initializers: subtract an identical element-wise literal.
    b = Byte(range(16))
    sb = SignedByte(range(16))
    vmx.vsubsbs(b, b, sb)
    vmx.vor(target, target, b)

    h = Halfword([9999, 9998, 9997, 9996, 9995, 9994, 9993, 9992])
    sh = SignedHalfword([9999, 9998, 9997, 9996, 9995, 9994, 9993, 9992])
    vmx.vsubshs(h, h, sh)
    vmx.vor(target, target, h)

    w = Word([99999, 99998, 99997, 99996])
    sw = SignedWord([99999, 99998, 99997, 99996])
    vmx.vsubsws(w, w, sw)

    # Final OR goes through the `.ex` form and assignment to `.v` rather than
    # a direct call (presumably to exercise that code path as well).
    target.v = vmx.vor.ex(target, w)

    # Destination buffer, pre-filled with a non-zero sentinel (42) so that a
    # store that never happens is detected by the assertions below.
    result = extarray.extarray('I', [42, 42, 42, 42])
    r_addr = prgm.acquire_register()
    util.load_word(code, r_addr, result.buffer_info()[0])

    vmx.stvx(target, 0, r_addr)

    # Stop emitting into `code` before running the program.
    ppc.set_active_code(None)
    vmx.set_active_code(None)
    r = proc.execute(prgm)
    print result
    for i in result:
        assert (i == 0)
    # for i in result: print '%08X' % i,
    # print

    return
Пример #35
0
import corepy.arch.x86_64.isa as x86
from corepy.arch.x86_64.types.registers import *
import corepy.arch.x86_64.platform as env
from corepy.arch.x86_64.lib.memory import MemRef
import corepy.lib.extarray as extarray
import corepy.arch.x86_64.lib.util as util
import time

# Number of increments performed by the generated loop.
ITERS = 1000000
# NOTE(review): THREADS is not used in the visible part of this script;
# presumably the number of concurrent executions started further down.
THREADS = 4

# Single-element counter that the generated code updates in memory.
data = extarray.extarray('l', 1)
dbi = data.buffer_info()

# This first case is intentionally wrong to show what happens w/o locking.
data[0] = 0

prgm = env.Program()
code = prgm.get_stream()
x86.set_active_code(code)

# rax = increment, rcx = remaining iterations, rdi = dbi[0] (used as the
# memory operand's base address below).
x86.mov(rax, 1)
x86.mov(rcx, ITERS)
x86.mov(rdi, dbi[0])

lbl_loop = prgm.get_unique_label("loop")
code.add(lbl_loop)

# Plain (non-LOCK-prefixed) read-modify-write of the counter, repeated until
# rcx reaches zero.
x86.add(MemRef(rdi), rax)
x86.dec(rcx)
x86.jnz(lbl_loop)
Пример #36
0
def ParMD5Transform(parcontext, parblock, blocki):
    """Apply one MD5 block transform to every parallel context on the GPU.

    The four state words of each context and the 16 decoded message words
    are staged into N x N remote buffers with four components per element,
    the cached CAL kernel is executed over that domain, and the kernel
    outputs are added back into ``parcontext.statea`` .. ``stated``.

    Arguments:
      parcontext -- parallel MD5 context; ``number``, ``statea``,
                    ``stateb``, ``statec`` and ``stated`` are read, and the
                    four state arrays are updated in place.
      parblock   -- input data, handed to ParDecode unchanged.
      blocki     -- block index/offset, handed to ParDecode unchanged.

    Side effects: builds the module-level ``xcode`` instruction stream on
    first use and reuses it afterwards; adds the wall-clock duration of
    ``proc.execute`` to the module-level ``TIME``.

    All remote buffers are released even when staging, kernel construction
    or execution raises.
    """
    global xcode, TIME

    num = parcontext.number

    # 16 message words per context, decoded from 64 bytes each.
    temp_block = extarray.extarray('I', 16 * num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)

    # Four contexts are packed into each element, so the domain is N x N
    # with N = floor(sqrt(num / 4)).  Floor division keeps this integral.
    N = int(math.sqrt(num // 4))

    def address_4_2d(x, y, pitch=64):
        # Index of component 0 of element (x, y) in a remote buffer.
        # NOTE(review): the pitch is hard-coded to 64 elements; presumably
        # this matches the allocator's row pitch -- confirm for other sizes.
        return x * 4 + y * pitch * 4

    # Every remote buffer is recorded here so the finally clause can free
    # whatever was actually allocated, in allocation order.
    allocated = []

    def alloc_buffer():
        buf = proc.alloc_remote('I', 4, N, N)
        allocated.append(buf)
        return buf

    try:
        input_statea = alloc_buffer()
        input_stateb = alloc_buffer()
        input_statec = alloc_buffer()
        input_stated = alloc_buffer()
        input_block = [alloc_buffer() for i in range(16)]
        outputa = alloc_buffer()
        outputb = alloc_buffer()
        outputc = alloc_buffer()
        outputd = alloc_buffer()

        # Stage the state words: element (i, j) holds four consecutive
        # contexts, one per component k.
        for j in range(N):
            for i in range(N):
                dst = address_4_2d(i, j)
                src = (i + j * N) * 4
                for k in range(4):
                    input_statea[dst + k] = parcontext.statea[src + k]
                    input_stateb[dst + k] = parcontext.stateb[src + k]
                    input_statec[dst + k] = parcontext.statec[src + k]
                    input_stated[dst + k] = parcontext.stated[src + k]

        # Stage the message words: buffer i holds word i of every context.
        for k in range(N):
            for j in range(N):
                dst = address_4_2d(j, k)
                src = (j + k * N) * 4 * 16
                for l in range(4):
                    for i in range(16):
                        input_block[i][dst + l] = temp_block[i + src + l * 16]

        # The kernel is built once and cached in the module-level xcode.
        if xcode is None:
            xcode = env.InstructionStream()
            cal.set_active_code(xcode)

            # Per-round shift amounts, replicated across all 4 components.
            S11 = xcode.acquire_register((7, 7, 7, 7))
            S12 = xcode.acquire_register((12, 12, 12, 12))
            S13 = xcode.acquire_register((17, 17, 17, 17))
            S14 = xcode.acquire_register((22, 22, 22, 22))
            S21 = xcode.acquire_register((5, 5, 5, 5))
            S22 = xcode.acquire_register((9, 9, 9, 9))
            S23 = xcode.acquire_register((14, 14, 14, 14))
            S24 = xcode.acquire_register((20, 20, 20, 20))
            S31 = xcode.acquire_register((4, 4, 4, 4))
            S32 = xcode.acquire_register((11, 11, 11, 11))
            S33 = xcode.acquire_register((16, 16, 16, 16))
            S34 = xcode.acquire_register((23, 23, 23, 23))
            S41 = xcode.acquire_register((6, 6, 6, 6))
            S42 = xcode.acquire_register((10, 10, 10, 10))
            S43 = xcode.acquire_register((15, 15, 15, 15))
            S44 = xcode.acquire_register((21, 21, 21, 21))

            a = xcode.acquire_register()
            b = xcode.acquire_register()
            c = xcode.acquire_register()
            d = xcode.acquire_register()
            x = [xcode.acquire_register() for i in range(16)]
            # NOTE(review): r is acquired but never used or released here;
            # kept so register allocation matches the original kernel.
            r = xcode.acquire_register()

            cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint,
                             UNNORM=True)  # statea
            cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint,
                             UNNORM=True)  # stateb
            cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint,
                             UNNORM=True)  # statec
            cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint,
                             UNNORM=True)  # stated
            for i in range(16):
                cal.dcl_resource(i + 4,
                                 cal.pixtex_type.twod,
                                 cal.fmt.uint,
                                 UNNORM=True)  # message word i
            cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
            cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

            cal.sample(0, 0, a, reg.v0.xy)
            cal.sample(1, 0, b, reg.v0.xy)
            cal.sample(2, 0, c, reg.v0.xy)
            cal.sample(3, 0, d, reg.v0.xy)

            for i in range(16):
                cal.sample(i + 4, 0, x[i], reg.v0.xy)

            # Round 1
            FF(a, b, c, d, x[0], S11, 0xd76aa478)  # 1
            FF(d, a, b, c, x[1], S12, 0xe8c7b756)  # 2
            FF(c, d, a, b, x[2], S13, 0x242070db)  # 3
            FF(b, c, d, a, x[3], S14, 0xc1bdceee)  # 4
            FF(a, b, c, d, x[4], S11, 0xf57c0faf)  # 5
            FF(d, a, b, c, x[5], S12, 0x4787c62a)  # 6
            FF(c, d, a, b, x[6], S13, 0xa8304613)  # 7
            FF(b, c, d, a, x[7], S14, 0xfd469501)  # 8
            FF(a, b, c, d, x[8], S11, 0x698098d8)  # 9
            FF(d, a, b, c, x[9], S12, 0x8b44f7af)  # 10
            FF(c, d, a, b, x[10], S13, 0xffff5bb1)  # 11
            FF(b, c, d, a, x[11], S14, 0x895cd7be)  # 12
            FF(a, b, c, d, x[12], S11, 0x6b901122)  # 13
            FF(d, a, b, c, x[13], S12, 0xfd987193)  # 14
            FF(c, d, a, b, x[14], S13, 0xa679438e)  # 15
            FF(b, c, d, a, x[15], S14, 0x49b40821)  # 16

            # Round 2
            GG(a, b, c, d, x[1], S21, 0xf61e2562)  # 17
            GG(d, a, b, c, x[6], S22, 0xc040b340)  # 18
            GG(c, d, a, b, x[11], S23, 0x265e5a51)  # 19
            GG(b, c, d, a, x[0], S24, 0xe9b6c7aa)  # 20
            GG(a, b, c, d, x[5], S21, 0xd62f105d)  # 21
            GG(d, a, b, c, x[10], S22, 0x2441453)  # 22
            GG(c, d, a, b, x[15], S23, 0xd8a1e681)  # 23
            GG(b, c, d, a, x[4], S24, 0xe7d3fbc8)  # 24
            GG(a, b, c, d, x[9], S21, 0x21e1cde6)  # 25
            GG(d, a, b, c, x[14], S22, 0xc33707d6)  # 26
            GG(c, d, a, b, x[3], S23, 0xf4d50d87)  # 27
            GG(b, c, d, a, x[8], S24, 0x455a14ed)  # 28
            GG(a, b, c, d, x[13], S21, 0xa9e3e905)  # 29
            GG(d, a, b, c, x[2], S22, 0xfcefa3f8)  # 30
            GG(c, d, a, b, x[7], S23, 0x676f02d9)  # 31
            GG(b, c, d, a, x[12], S24, 0x8d2a4c8a)  # 32

            # Round 3
            HH(a, b, c, d, x[5], S31, 0xfffa3942)  # 33
            HH(d, a, b, c, x[8], S32, 0x8771f681)  # 34
            HH(c, d, a, b, x[11], S33, 0x6d9d6122)  # 35
            HH(b, c, d, a, x[14], S34, 0xfde5380c)  # 36
            HH(a, b, c, d, x[1], S31, 0xa4beea44)  # 37
            HH(d, a, b, c, x[4], S32, 0x4bdecfa9)  # 38
            HH(c, d, a, b, x[7], S33, 0xf6bb4b60)  # 39
            HH(b, c, d, a, x[10], S34, 0xbebfbc70)  # 40
            HH(a, b, c, d, x[13], S31, 0x289b7ec6)  # 41
            HH(d, a, b, c, x[0], S32, 0xeaa127fa)  # 42
            HH(c, d, a, b, x[3], S33, 0xd4ef3085)  # 43
            HH(b, c, d, a, x[6], S34, 0x4881d05)  # 44
            HH(a, b, c, d, x[9], S31, 0xd9d4d039)  # 45
            HH(d, a, b, c, x[12], S32, 0xe6db99e5)  # 46
            HH(c, d, a, b, x[15], S33, 0x1fa27cf8)  # 47
            HH(b, c, d, a, x[2], S34, 0xc4ac5665)  # 48

            # Round 4
            II(a, b, c, d, x[0], S41, 0xf4292244)  # 49
            II(d, a, b, c, x[7], S42, 0x432aff97)  # 50
            II(c, d, a, b, x[14], S43, 0xab9423a7)  # 51
            II(b, c, d, a, x[5], S44, 0xfc93a039)  # 52
            II(a, b, c, d, x[12], S41, 0x655b59c3)  # 53
            II(d, a, b, c, x[3], S42, 0x8f0ccc92)  # 54
            II(c, d, a, b, x[10], S43, 0xffeff47d)  # 55
            II(b, c, d, a, x[1], S44, 0x85845dd1)  # 56
            II(a, b, c, d, x[8], S41, 0x6fa87e4f)  # 57
            II(d, a, b, c, x[15], S42, 0xfe2ce6e0)  # 58
            II(c, d, a, b, x[6], S43, 0xa3014314)  # 59
            II(b, c, d, a, x[13], S44, 0x4e0811a1)  # 60
            II(a, b, c, d, x[4], S41, 0xf7537e82)  # 61
            II(d, a, b, c, x[11], S42, 0xbd3af235)  # 62
            II(c, d, a, b, x[2], S43, 0x2ad7d2bb)  # 63
            II(b, c, d, a, x[9], S44, 0xeb86d391)  # 64

            cal.mov('o0', a)
            cal.mov('o1', b)
            cal.mov('o2', c)
            cal.mov('o3', d)

            xcode.release_register(a)
            xcode.release_register(b)
            xcode.release_register(c)
            xcode.release_register(d)
            for xi in x:
                xcode.release_register(xi)

        # Bind this call's buffers to the (possibly cached) kernel.
        xcode.set_remote_binding('i0', input_statea)
        xcode.set_remote_binding('i1', input_stateb)
        xcode.set_remote_binding('i2', input_statec)
        xcode.set_remote_binding('i3', input_stated)
        for i in range(16):
            xcode.set_remote_binding('i' + str(i + 4), input_block[i])
        xcode.set_remote_binding('o0', outputa)
        xcode.set_remote_binding('o1', outputb)
        xcode.set_remote_binding('o2', outputc)
        xcode.set_remote_binding('o3', outputd)

        domain = (0, 0, N, N)
        start_time = time.time()
        proc.execute(xcode, domain)
        end_time = time.time()
        TIME += (end_time - start_time)

        # Add the kernel outputs back into the running state on the host.
        for j in range(N):
            for i in range(N):
                src = address_4_2d(i, j)
                dst = (i + j * N) * 4
                for k in range(4):
                    parcontext.statea[dst + k] += outputa[src + k]
                    parcontext.stateb[dst + k] += outputb[src + k]
                    parcontext.statec[dst + k] += outputc[src + k]
                    parcontext.stated[dst + k] += outputd[src + k]
    finally:
        # Free in allocation order: states, message words, outputs.
        for buf in allocated:
            proc.free_remote(buf)
Пример #37
0
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import array

import corepy.lib.extarray as extarray
import corepy.arch.ppc.isa as ppc
import corepy.arch.vmx.isa as vmx
import corepy.spre.spe as spe
import corepy.arch.ppc.lib.util as util
#from corepy.arch.ppc.types.ppc_types import make_user_type

from corepy.spre.syn_util import most_specific

# Concrete type of an extarray instance, captured from a throwaway array.
_array_type = type(extarray.extarray('I', [1]))
# Element count per integer array typecode: 16 for 'b'/'B', 8 for 'h'/'H',
# 4 for 'i'/'I' (presumably the number of elements in one 128-bit VMX
# vector -- confirm).
INT_ARRAY_SIZES = {'b': 16, 'h': 8, 'i': 4, 'B': 16, 'H': 8, 'I': 4}


class VMXType(spe.Type):
    """spe.Type subclass whose active-code accessors delegate to the ppc ISA."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to spe.Type and start with no storage."""
        super(VMXType, self).__init__(*args, **kwargs)
        self.storage = None

    def _get_active_code(self):
        """Return whatever the ppc ISA module reports as its active code."""
        active = ppc.get_active_code()
        return active

    def _set_active_code(self, code):
        """Make `code` the ppc ISA module's active code; return its result."""
        outcome = ppc.set_active_code(code)
        return outcome
Пример #38
0
def TestMFC():
    """Round-trip an array through the SPU local store using MFC get/put.

    32 unsigned ints are transferred to local store address 0 with
    ``mfc_get``, each quadword is incremented by 1 with an unrolled
    load/add/store sequence, and the data is written back with ``mfc_put``.
    After execution every element is asserted to equal its index plus one.
    """
    size = 32
    #data_array = array.array('I', range(size))
    #data = synspu.aligned_memory(size, typecode = 'I')
    #data.copy_to(data_array.buffer_info()[0], len(data_array))
    data = extarray.extarray('I', range(size))
    code = synspu.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    print 'array ea: %X' % (data.buffer_info()[0])
    print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
        str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))

    # Load the effective address
    print 'test ea: %X' % data.buffer_info()[0]
    util.load_word(code, r_ea_data, data.buffer_info()[0])

    # Load the size (size elements * 4, i.e. the transfer length for 'I' data)
    code.add(spu.ai(r_size, r_zero, size * 4))

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 2))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 2
    mfc_write_tag_mask(code, 1 << 2)

    # Wait for the transfer to complete
    mfc_read_tag_status_all(code)

    # Increment the data values by 1 using an unrolled loop (no branches)
    r_current = code.acquire_register()

    # One load/add/store triple is emitted per 16-byte step.
    # NOTE(review): the address operand is passed as lsa >> 2; presumably
    # lqa/stqa take a word-scaled address -- confirm against the SPU ISA module.
    for lsa in range(0, size * 4, 16):
        code.add(spu.lqa(r_current, (lsa >> 2)))
        code.add(spu.ai(r_current, r_current, 1))
        code.add(spu.stqa(r_current, (lsa >> 2)))

    code.release_register(r_current)

    # Store the values back to main memory

    # Write the data at local store address 0 back to the effective address
    mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 2
    mfc_write_tag_mask(code, 1 << 2)

    # Wait for the transfer to complete
    mfc_read_tag_status_all(code)

    # Cleanup
    code.release_register(r_zero)
    code.release_register(r_ea_data)
    code.release_register(r_ls_data)
    code.release_register(r_size)
    code.release_register(r_tag)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    proc = synspu.Processor()
    # code.print_code()
    #print data_array
    proc.execute(code)

    #data.copy_from(data_array.buffer_info()[0], len(data_array))

    # Every element must have been incremented exactly once.
    for i in range(size):
        assert (data[i] == i + 1)

    return
Пример #39
0
def MemoryDescExample(data_size=20000):
    """
    This example uses a memory descriptor to move 20k integers back and
    forth between main memory and the SPU local store. Each value is
    incremented by 1 while on the SPU.

    Memory descriptors are a general purpose method for describing a
    region of memory.  Memory is described by a typecode, address, and
    size.  Memory descriptors can be initialized by hand or from an
    array or buffer object.

    For main memory, memory descriptors are useful for transferring data
    between main memory and an SPU's local store.  The get/put methods
    on a memory descriptor generate the SPU code to move data of any
    size between main memory and local store.

    Memory descriptors can also be used with spu_vec_iters to describe
    the region of memory to iterate over.  The typecode in the memory
    descriptor is used to determine the type for the loop induction
    variable.

    Note that there is currently no difference between memory
    descriptors for main memory and local store.  It's up to the user to
    make sure the memory descriptor settings make sense in the current
    context.  (this will probably change in the near future)

    Note: get/put currently use loops rather than display lists for
          transferring data over 16k.

    Arguments:
      data_size -- number of integers to transfer (default 20000).
    """

    code = InstructionStream()
    proc = Processor()

    code.debug = True
    spu.set_active_code(code)

    # Create a python array
    data = extarray.extarray('I', range(data_size))

    # Align the data in the array
    #a_data = aligned_memory(data_size, typecode = 'I')
    #a_data.copy_to(data.buffer_info()[0], data_size)

    # Create memory descriptor for the data in main memory
    data_desc = memory_desc('I')
    #data_desc.from_array(a_data)
    data_desc.from_array(data)

    # Transfer the data to 0x0 in the local store
    data_desc.get(code, 0)

    # Create memory descriptor for the data in the local store for use
    # in the iterator
    # NOTE(review): this descriptor uses typecode 'i' while the main-memory
    # one uses 'I'; presumably chosen to select the iterator's variable
    # type -- confirm.
    lsa_data = memory_desc('i', 0, data_size)

    # Add one to each value
    for x in spu_vec_iter(code, lsa_data):
        x.v = x + 1

    # Transfer the data back to main memory
    data_desc.put(code, 0)

    dma.spu_write_out_mbox(code, 0xCAFE)

    # Execute the synthetic program
    # code.print_code()

    # Start asynchronously, then wait for the returned id to finish.
    spe_id = proc.execute(code, async=True)
    proc.join(spe_id)

    # Copy it back to the Python array
    #a_data.copy_from(data.buffer_info()[0], data_size)

    # Every element must have been incremented exactly once.
    for i in xrange(data_size):
        assert (data[i] == i + 1)
    return
Пример #40
0
def Test():
    """Run a series of x86_64 code-generation checks on one stream.

    A single InstructionStream is reset between cases.  Each case emits a
    short instruction sequence, executes it (most cases with ``mode='int'``
    or the default mode) and asserts on the returned value; a few cases
    only print the generated code or execute without asserting.
    """
    code = env.InstructionStream()
    proc = env.Processor()
    params = env.ExecParams()
    params.p1 = 3
    # Memory operands at rbp+16, 32 and 8 bits wide.
    # NOTE(review): presumably rbp+16 is where params.p1 is visible to the
    # generated code (the cases below expect to read 3 from it) -- confirm.
    # mr8 is only referenced by the commented-out case further down.
    mr32 = MemRef(rbp, 16, data_size=32)
    mr8 = MemRef(rbp, 16, data_size=8)

    # --- Case 1: conditional jumps to labels ---------------------------------
    lbl1 = code.get_label("lbl1")
    lbl2 = code.get_label("lbl2")

    code.add(x86.xor(rax, rax))

    # rax is 0, so jne is taken and the ud2 instructions are skipped.
    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(lbl1))

    code.add(x86.ud2())
    code.add(x86.ud2())

    # je is not taken, so 12 is added to rax.
    code.add(lbl1)
    code.add(x86.cmp(rax, 1))
    code.add(x86.je(lbl2))
    code.add(x86.add(rax, 12))
    code.add(lbl2)

    # printer.PrintInstructionStream(code, printer.x86_64_Nasm(function_name="foobar"))
    ret = proc.execute(code)
    print "ret", ret
    assert (ret == 12)

    print "W00T"

    code.reset()

    # --- Case 2: same flow with numeric jump targets instead of labels -------
    # NOTE(review): 28 and 37 are presumably byte offsets matching the label
    # positions of case 1 -- confirm how integer targets are interpreted.
    code.add(x86.xor(rax, rax))

    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(28))

    code.add(x86.ud2())
    code.add(x86.ud2())

    code.add(x86.cmp(eax, 1))
    code.add(x86.je(37))
    code.add(x86.add(rax, 12))

    code.print_code(hex=True, pro=True, epi=True)
    print "a"
    ret = proc.execute(code)
    print "b"
    print "ret", ret
    assert (ret == 12)

    print "w00t 2"

    code.reset()

    # --- Case 3: call/ret ----------------------------------------------------
    call_lbl = code.get_label("call_fn")

    # The callee sets rax to 15 and returns; the jmp to the epilogue label
    # then skips the two mov instructions that would overwrite rax.
    code.add(x86.xor(rax, rax))
    code.add(x86.call(call_lbl))
    code.add(x86.jmp(code.lbl_epilogue))
    code.add(x86.mov(rax, 75))
    code.add(x86.mov(rax, 42))
    code.add(call_lbl)
    code.add(x86.mov(rax, 15))
    code.add(x86.ret())

    code.print_code()
    ret = proc.execute(code)
    print "ret", ret
    assert (ret == 15)

    code.reset()

    # --- Case 4: forward jump over a long run of instructions ----------------
    fwd_lbl = code.get_label("FORWARD")
    bck_lbl = code.get_label("BACKWARD")

    # The jne is taken (rax is 0), skipping all 65 pops; bck_lbl is placed
    # but never jumped to.  Presumably the padding forces a jump displacement
    # that does not fit in 8 bits -- confirm.
    code.add(x86.xor(rax, rax))
    code.add(bck_lbl)
    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(fwd_lbl))
    for i in xrange(0, 65):
        code.add(x86.pop(r15))
    code.add(fwd_lbl)

    ret = proc.execute(code, mode='int')
    assert (ret == 0)

    code.reset()

    # --- Case 5: backward and forward jumps across padded loop bodies --------
    loop_lbl = code.get_label("LOOP")
    out_lbl = code.get_label("OUT")
    skip_lbl = code.get_label("SKIP")

    code.add(x86.xor(rax, rax))
    code.add(loop_lbl)
    # The outer Python loop runs once; the inner loops reuse the name `i`,
    # which is harmless here because the value is never read.
    for i in range(0, 1):
        for i in xrange(0, 24):
            code.add(x86.add(r15, MemRef(rsp, 4)))

        # rax advances by 4 per iteration; the loop exits when it reaches 20.
        code.add(x86.add(rax, 4))
        code.add(x86.cmp(rax, 20))
        code.add(x86.je(out_lbl))

        for i in xrange(0, 24):
            code.add(x86.add(r15, MemRef(rsp, 4)))

        code.add(x86.cmp(rax, 32))
        code.add(x86.jne(loop_lbl))

    code.add(out_lbl)

    # Short forward jump over two instructions.
    code.add(x86.jmp(skip_lbl))
    for i in xrange(0, 2):
        code.add(x86.add(r15, MemRef(rsp, 4)))
    code.add(skip_lbl)

    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 20)

    code.reset()

    # --- Case 6: loop with an if/else body -----------------------------------
    loop_lbl = code.get_label("LOOP")
    else_lbl = code.get_label("ELSE")
    finish_lbl = code.get_label("finish")

    code.add(x86.mov(rax, 0))
    code.add(x86.mov(rdx, 0))

    code.add(loop_lbl)

    # rax counts up; the loop ends once it reaches 16.
    code.add(x86.add(rax, 1))
    code.add(x86.cmp(rax, 16))
    code.add(x86.jge(finish_lbl))

    # Branch on the low bit of the running sum in rdx.
    code.add(x86.add(rdx, rax))
    code.add(x86.mov(r8, rdx))
    code.add(x86.and_(r8, 0x1))
    code.add(x86.jnz(else_lbl))

    code.add(x86.add(rdx, 1))
    code.add(x86.jmp(loop_lbl))

    code.add(else_lbl)
    code.add(x86.add(rdx, r8))
    code.add(x86.jmp(loop_lbl))

    code.add(finish_lbl)
    code.add(x86.mov(rax, rdx))

    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 135)

    code.reset()

    # --- Case 7: cmove + jrcxz -----------------------------------------------
    loop_lbl = code.get_label("LOOP")

    code.add(x86.xor(rax, rax))
    code.add(x86.xor(rcx, rcx))
    code.add(x86.mov(rdx, 1))

    # rcx stays 0 (so jrcxz loops) until rax == 7 makes cmove copy rdx into it.
    code.add(loop_lbl)
    code.add(x86.inc(rax))
    code.add(x86.cmp(rax, 7))
    code.add(x86.cmove(rcx, rdx))
    code.add(x86.jrcxz(loop_lbl))

    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 7)

    code.reset()

    # --- Case 8: shld with a count in cl -------------------------------------
    # With 3 loaded from rbp+16 and rbx all ones, shifting left by one and
    # filling from rbx gives (3 << 1) | 1 == 7.
    code.add(x86.mov(rax, MemRef(rbp, 16)))
    code.add(x86.xor(rbx, rbx))
    code.add(x86.mov(rbx, -1))
    code.add(x86.mov(cl, 1))
    code.add(x86.shld(rax, rbx, cl))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    print "ret", ret
    assert (ret == 7)

    # (Disabled case: mixed operand sizes and asynchronous execution.)
    # code.reset()

    # code.add(x86.add(eax, 200))
    # code.add(x86.xor(eax, eax))
    # code.add(x86.add(al, 32))
    # code.add(x86.add(bl, 32))
    # code.add(x86.xor(bl, bl))
    # code.add(x86.mov(mr8, al))
    # code.add(x86.add(mr32, 0))
    # code.add(x86.mov(eax, mr32))
    # code.add(x86.mov(al, mr8))
    #
    # code.add(x86.imul(ax, ax, 4))
    # code.add(x86.imul(eax, ebx, 10))
    # code.add(x86.mov(cx, 1232))
    # code.add(x86.sub(ax, cx))
    # code.add(x86.xor(eax,eax))
    # code.add(x86.mov(eax,ebx))
    # code.add(x86.clc())
    # code.add(x86.rcl(eax, 1))
    # code.add(x86.rcr(eax, 1))

    # #ret = proc.execute(code, debug = True, params = params)
    # id1 = proc.execute(code, params = params, mode = 'int', async = True)
    # id2 = proc.execute(code, params = params, mode = 'int', async = True)
    # ret = proc.execute(code, params = params, mode = 'int')
    # print "Return main thread: %d" % (ret)
    # assert(ret == 1280)
    # ret = proc.join(id1)
    # print "Return thread 1: %d" % (ret)
    # assert(ret == 1280)
    # ret = proc.join(id2)
    # print "Return thread 2: %d" % (ret)
    # assert(ret == 1280)

    code.reset()

    # --- Case 9: x87 floating point ------------------------------------------
    code.add(x86.fldpi())
    code.add(x86.pxor(xmm0, xmm0))
    code.add(x86.fld1())
    code.add(x86.fadd(st0, st0))
    code.add(x86.fmulp())
    code.add(x86.fsin())
    code.add(x86.fcos())
    code.add(x86.fld1())
    code.add(x86.fyl2xp1())

    # x86_64 now uses xmm0 to return floats, not st0.  So here, just make room
    # on the stack, convert the FP result to an int and store it on the stack,
    # then pop it into rax, the int return register.
    code.add(x86.push(rax))
    code.add(x86.fistp(MemRef(rsp)))
    code.add(x86.pop(rax))

    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    assert (ret == 1)
    print "Return main thread: %d" % (ret)

    code.reset()

    # --- Case 10: moves between GP, MMX and XMM registers --------------------
    lbl_ok = code.get_label("OK")
    code.add(x86.emms())
    code.add(x86.movd(xmm0, mr32))
    code.add(x86.mov(ebx, mr32))

    # Reach lbl_ok if either load produced 3; otherwise fall into ud2.
    code.add(x86.cmp(ebx, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.movd(eax, xmm0))
    code.add(x86.cmp(eax, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.ud2())

    # Add the two copies together and pass the low word through a chain of
    # register-to-register moves; the value arriving in eax is asserted to be 6.
    code.add(lbl_ok)
    code.add(x86.xor(eax, eax))
    code.add(x86.movd(xmm1, ebx))
    code.add(x86.paddq(xmm0, xmm1))
    code.add(x86.pextrw(ecx, xmm0, 0))
    code.add(x86.pinsrw(mm1, ecx, 0))
    code.add(x86.movq2dq(xmm0, mm1))
    code.add(x86.movdq2q(mm2, xmm0))
    code.add(x86.movd(edx, mm2))
    code.add(x86.movd(xmm5, edx))
    code.add(x86.movd(ecx, xmm5))
    code.add(x86.pinsrw(xmm6, ecx, 0))
    code.add(x86.movd(eax, xmm6))

    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 6)

    code.reset()

    # Test immediate size encodings
    # NOTE(review): these instructions are discarded by the reset() below
    # without being printed or executed, so only instruction construction is
    # exercised here.
    code.add(x86.add(eax, 300))
    code.add(x86.add(ax, 300))
    code.add(x86.add(ax, 30))
    code.add(x86.mov(eax, 16))
    code.add(x86.mov(eax, 300))

    code.reset()
    # 32-bit immediates with the high bit set; printed only, not executed.
    code.add(x86.add(eax, 0xDEADBEEF))
    code.add(x86.add(ebx, 0xDEADBEEF))
    code.print_code(hex=True)

    # Try the LOCK prefix
    code.reset()
    code.add(x86.xor(eax, eax))
    code.add(x86.add(mr32, eax))
    code.add(x86.add(mr32, eax, lock=True))
    #code.print_code(hex = True)

    # Executed for its side effects only; the return value is not checked.
    proc.execute(code, params=params)

    code.reset()

    # --- xchg: eax ends up holding the value originally placed in edx --------
    code.add(x86.mov(edx, 0x1234))
    code.add(x86.mov(eax, 0xFFFF))
    code.add(x86.xchg(edx, eax))

    code.print_code(hex=True)
    ret = proc.execute(code, params=params)
    print "ret:", ret
    assert (ret == 0x1234)

    code.reset()

    # --- pushfq: measure how far rsp moved, then restore it ------------------
    code.add(x86.mov(rax, rsp))
    code.add(x86.pushfq())
    code.add(x86.sub(rax, rsp))
    code.add(x86.add(rsp, rax))

    code.print_code(hex=True)
    ret = proc.execute(code, params=params)
    print "ret:", ret
    assert (ret == 8)

    code.reset()

    # --- 128-bit load and pextrw ---------------------------------------------
    # Sixteen halfwords 0..15; the first four are extracted and packed into
    # rax at 16-bit intervals.
    data = extarray.extarray('H', xrange(0, 16))

    # code.add(x86.push(rdi))
    code.add(x86.mov(rdi, data.buffer_info()[0]))
    code.add(x86.movaps(xmm1, MemRef(rdi, data_size=128)))
    code.add(x86.pextrw(rax, xmm1, 0))
    code.add(x86.pextrw(rbx, xmm1, 1))
    code.add(x86.pextrw(rcx, xmm1, 2))
    code.add(x86.pextrw(rdx, xmm1, 3))
    code.add(x86.shl(rbx, 16))
    code.add(x86.shl(rcx, 32))
    code.add(x86.shl(rdx, 48))
    code.add(x86.or_(rax, rbx))
    code.add(x86.or_(rax, rcx))
    code.add(x86.or_(rax, rdx))
    # code.add(x86.pop(rdi))

    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret %x" % ret
    assert (ret == 0x0003000200010000)

    code.reset()
    # --- loop instruction: rcx counts down from 3, rax counts iterations -----
    L1 = code.get_label("L1")
    code.add(x86.xor(rax, rax))
    code.add(x86.mov(rcx, 3))
    code.add(L1)
    code.add(x86.add(rax, 1))
    code.add(x86.loop(L1))

    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret %x" % ret
    assert (ret == 0x03)

    return
Пример #41
0
def TestSPUIter():
    """Double an array on the SPU using a synthetic iterator.

    32 unsigned ints are transferred to local store address 0 with
    ``dma.mfc_get`` (tag 12), each quadword is loaded, added to itself and
    stored back inside a ``syn_iter`` loop, and the data is written back
    with ``dma.mfc_put`` (tag 13).  After execution every element is
    asserted to equal twice its index.
    """
    size = 32
    data = extarray.extarray('I', range(size))
    code = env.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    #print 'array ea: %X' % (data.buffer_info()[0])
    #print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
    #  str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))

    # Load the effective address
    util.load_word(code, r_ea_data, data.buffer_info()[0])

    # Load the size
    util.load_word(code, r_size, size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1 << 12)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Double the data values in place, one quadword per iteration
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    # Use an SPU iter; `lsa` is used as the index operand of lqx/stqx
    for lsa in syn_iter(code, size * 4, 16):
        code.add(spu.lqx(current, r_zero, lsa))
        # code.add(spu.ai(1, r_current, r_current))
        current.v = current + current
        code.add(spu.stqx(current, r_zero, lsa))

    # code.release_register(r_current)
    #current.release_register(code)

    # Store the values back to main memory

    # Load the tag (a different tag, 13, is used for the write-back)
    code.add(spu.ai(r_tag, r_zero, 13))

    # Write the data at local store address 0 back to the effective address
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1 << 13)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    code.release_register(r_zero)
    code.release_register(r_ea_data)
    code.release_register(r_ls_data)
    code.release_register(r_size)
    code.release_register(r_tag)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code (the return value is not inspected)
    proc = env.Processor()
    r = proc.execute(code)

    # Every element must have been doubled.
    for i in range(0, size):
        assert (data[i] == i + i)

    return
Пример #42
0
  def alloc_host(self, typecode, length, comps = 1):
    """
    Allocate memory through ptx_exec.alloc_host and wrap it in an extarray.

    Arguments:
      typecode -- PTX typecode: a kind letter ('f', 'u' or 's') followed by
                  the scalar width in bits, e.g. 'u32', 's32', 'f32', 'u64'.
      length   -- number of elements.
      comps    -- number of components per element (default 1).

    Returns an extarray bound to the allocated memory, with the attributes
    gpu_mem_handle (the value returned by ptx_exec.alloc_host) and
    gpu_width (length) set.

    Raises ValueError if the width part of typecode is not an integer, and
    Exception('Unable to convert type') for an unsupported kind/width pair.
    """
    # Parse the width first so a malformed typecode fails with ValueError.
    scalar_width = int(typecode[1:])
    kind = typecode[0]

    # (PTX kind, width in bits) -> array typecode.  As in the array module,
    # lowercase integer codes are signed and uppercase ones are unsigned.
    conversions = {
      ('f', 32): 'f', ('f', 64): 'd',
      ('u', 8): 'B', ('u', 16): 'H', ('u', 32): 'I', ('u', 64): 'L',
      ('s', 8): 'b', ('s', 16): 'h', ('s', 32): 'i', ('s', 64): 'l',
    }
    array_typecode = conversions.get((kind, scalar_width), '')

    if array_typecode == '':
      raise Exception('Unable to convert type')

    # Size of one scalar in bytes; the allocation is sized in bytes.
    scalar_byte_width = scalar_width // 8
    mem = ptx_exec.alloc_host(length*scalar_byte_width*comps)

    arr = extarray.extarray(array_typecode, 0)
    # NOTE(review): data_len and the size passed to set_memory are kept as
    # originally written (scalar_width // 4 per scalar, times 4).  That is
    # larger than the byte count allocated above, so one of the two looks
    # wrong -- confirm against extarray.set_memory's contract.
    arr.data_len = scalar_width // 4 * length * comps
    arr.set_memory(mem, arr.data_len * 4)
    arr.gpu_mem_handle = mem
    arr.gpu_width = length
    return arr
Пример #43
0
def TestParams():
  import time
  #SIZE = 1024
  kernel = (
  '''
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  '''
  )

  t1 = time.time()
  module = ptx_exec.compile(kernel)
  t2 = time.time()
  print "compile time", t2 - t1

  a = 1.0
  b = 2.0

  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  #mem.set_memory(ptx_mem_addr, 4)
  mem[0] = 5.0

  print ptx_mem_addr, type(ptx_mem_addr)
  print mem.buffer_info()[0], type(mem.buffer_info()[0])
  param_list = [ptx_mem_addr, a, b]
  # image, dev num, (x, y, w, h)

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  t1 = time.time()
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  t2 = time.time()
  print "run time", t2 - t1
  print "X", mem.buffer_info()[0], ptx_mem_addr
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem

  #ptx_exec.free(input)
  #ptx_exec.free(output)
  ##ptx_exec.free(glob)
  #ptx_exec.unload_module(image)
  return
Пример #44
0
    def cache_code(self):
        """
        Render the prologue, instructions and epilogue into executable code.

        Synthesizes the prologue and epilogue, renders every instruction into
        a fresh extarray of type self.instruction_type, resolves label
        references, stores the result in self.render_code, calls
        make_executable() and finally marks the code as cached.  Returns
        immediately if the code is already cached.

        Per the original author's note, this call freezes the code; any
        subsequent call to acquire_register() or add() will unfreeze it, and
        the code should not be modified once it has been cached.
        """

        # Nothing to do if a previous call already rendered the code.
        if self._cached == True:
            return

        # HACK: Disable the current active code
        # NOTE: This may not work in the presence of multiple ISAs...
        # The callback is invoked with None here and with self again at the
        # end of this method.
        active_callback = None
        if self._active_callback is not None:
            active_callback = self._active_callback
            active_callback(None)

        self._synthesize_prologue()
        self._synthesize_epilogue()

        render_code = extarray.extarray(self.instruction_type)

        # Note - TRAC ticket #19 has some background info and reference links on
        # the algorithms used here. https://svn.osl.iu.edu/trac/corepy/ticket/19

        if self.instruction_type == 'I':
            # Fixed-width path: each rendered instruction is one array element,
            # so positions are simply element index * 4.
            fwd_ref_list = []

            # Assumed below that 'I' type is 4 bytes
            for arr in (self._prologue, self._instructions, self._epilogue):
                for val in arr:
                    if isinstance(val, (Instruction, ExtendedInstruction)):
                        # Does this instruction reference any labels?
                        # Only the first Label operand found is considered.
                        lbl = None
                        for k in val._operands.keys():
                            if isinstance(val._operands[k], Label):
                                lbl = val._operands[k]
                                break

                        if lbl == None:  # No label reference, render the inst
                            render_code.append(val.render())
                        else:  # Label reference
                            assert (lbl.code == self)
                            # The instruction's own position must be set before
                            # it is rendered.
                            val.set_position(len(render_code) * 4)

                            if lbl.position != None:  # Back reference, render the inst
                                render_code.append(val.render())
                            else:  # Fill in a dummy instruction and save info to render later
                                # Remember the instruction and the slot index so
                                # the placeholder can be overwritten below.
                                fwd_ref_list.append((val, len(render_code)))
                                render_code.append(0xFFFFFFFF)
                    elif isinstance(
                            val, Label):  # Label, fill in a zero-length slot
                        val.set_position(len(render_code) * 4)

            # Render the instructions with forward label references
            # (every label has been given a position by now).
            for rec in fwd_ref_list:
                render_code[rec[1]] = rec[0].render()

        elif self.instruction_type == 'B':
            # Variable-length path: positions are running byte offsets.
            # inst_list is a list of 3-element lists.  Each entry contains a bool
            # indicating presence of a label/relative reference, rendered code
            # ([] if label), and a label or instruction object.
            inst_list = []
            inst_len = 0

            for arr in (self._prologue, self._instructions, self._epilogue):
                for val in arr:
                    if isinstance(val, (Instruction, ExtendedInstruction)):
                        # Does this instruction reference any labels?
                        lbl = None
                        relref = False
                        #iop = 0
                        #for k in val._operands.keys():
                        sig = val.machine_inst.signature
                        #while val._operands.has_key(iop):
                        # Walk the operand signature; only operands whose
                        # signature entry is flagged relative_op are inspected.
                        for iop in xrange(0, len(sig)):
                            opsig = sig[iop]
                            #if isinstance(op, (int, long)):
                            #  print "ops", val._operands
                            #  print "op", op, iop, val.params, val.machine_inst.signature
                            #  print "opsig", opsig
                            if hasattr(opsig, "relative_op"
                                       ) and opsig.relative_op == True:
                                op = val._operands[iop]
                                if isinstance(op, Label):
                                    lbl = op
                                # This is a hack, but it works.  Some instructions can have
                                # a relative offset that is not a label.  These insts need to be
                                # re-rendered if instruction sizes change
                                relref = True
                            #iop += 1

                        if lbl == None:  # No label references
                            val.set_position(inst_len)
                            r = val.render()
                            inst_list.append([relref, r, val])
                            inst_len += len(r)
                        else:  # Instruction referencing a label.
                            assert (lbl.code == self)
                            val.set_position(inst_len)

                            if lbl.position != None:  # Back-reference, render the instruction
                                r = val.render()
                                inst_list.append([True, r, val])
                                inst_len += len(r)
                            else:  # Fill in a dummy instruction, assuming 2-byte best case
                                inst_list.append([True, [-1, -1], val])
                                inst_len += 2
                    elif isinstance(
                            val, Label):  # Label, fill in a zero-length slot
                        val.set_position(inst_len)
                        inst_list.append([False, [], val])

            # NOTE(review): _adjust_pass is defined elsewhere; presumably it
            # re-renders the flagged entries until instruction sizes and label
            # positions are consistent -- confirm against its implementation.
            inst_list = self._adjust_pass(inst_list)

            # Final loop, bring everything together into render_code
            # (label entries carry no bytes and are skipped).
            for rec in inst_list:
                if isinstance(rec[2], (Instruction, ExtendedInstruction)):
                    render_code.fromlist(rec[1])

        self.render_code = render_code
        self.make_executable()

        # Restore the active-code callback that was disabled above.
        if active_callback is not None:
            active_callback(self)

        self._cached = True
        return
Пример #45
0
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word

if __name__ == '__main__':
    a = extarray.extarray('i', range(0, 32))
    b = extarray.extarray('i', [0 for i in range(0, 32)])
    code = env.InstructionStream()
    proc = env.Processor()

    spu.set_active_code(code)

    abi = a.buffer_info()
    print "abi", abi, a.itemsize
    dma.mem_get(code, 0x1000, abi[0], abi[1] * a.itemsize, 2)
    dma.mem_complete(code, 2)

    bbi = b.buffer_info()
    print "bbi", bbi, b.itemsize
    dma.mem_put(code, 0x1000, bbi[0], bbi[1] * b.itemsize, 2)
    dma.mem_complete(code, 2)
Пример #46
0
def Test():
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    params = env.ExecParams()
    params.p1 = 3

    lbl1 = prgm.get_label("lbl1")
    lbl2 = prgm.get_label("lbl2")

    code.add(x86.xor(prgm.gp_return, prgm.gp_return))

    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(lbl1))

    code.add(x86.ud2())
    code.add(x86.ud2())

    code.add(lbl1)
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.je(lbl2))
    code.add(x86.add(prgm.gp_return, 12))
    code.add(lbl2)

    prgm.add(code)
    #prgm.print_code(pro = True, epi = True, hex = True)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 12)

    prgm.reset()
    code.reset()

    code.add(x86.xor(prgm.gp_return, prgm.gp_return))

    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(28))

    code.add(x86.ud2())
    code.add(x86.ud2())

    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.je(37))
    code.add(x86.add(prgm.gp_return, 12))

    prgm.add(code)
    prgm.print_code(hex=True, pro=True, epi=True)
    ret = proc.execute(prgm)
    print "ret", ret
    assert (ret == 12)

    prgm.reset()
    code.reset()

    call_lbl = prgm.get_label("call_fn")

    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.call(call_lbl))
    code.add(x86.jmp(prgm.lbl_epilogue))
    code.add(x86.mov(prgm.gp_return, 75))
    code.add(x86.mov(prgm.gp_return, 42))
    code.add(call_lbl)
    code.add(x86.mov(prgm.gp_return, 15))
    code.add(x86.ret())

    prgm.add(code)
    prgm.print_code()
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 15)

    prgm.reset()
    code.reset()

    fwd_lbl = prgm.get_label("FORWARD")
    bck_lbl = prgm.get_label("BACKWARD")

    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(bck_lbl)
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(fwd_lbl))
    r_foo = prgm.acquire_register()
    for i in xrange(0, 65):
        code.add(x86.pop(r_foo))
    prgm.release_register(r_foo)
    code.add(fwd_lbl)

    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 0)

    prgm.reset()
    code.reset()

    loop_lbl = prgm.get_label("LOOP")
    out_lbl = prgm.get_label("OUT")
    skip_lbl = prgm.get_label("SKIP")

    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(loop_lbl)
    r_foo = prgm.acquire_register()
    for i in range(0, 1):
        for i in xrange(0, 24):
            code.add(x86.add(r_foo, MemRef(rsp, 4)))

        code.add(x86.add(prgm.gp_return, 4))
        code.add(x86.cmp(prgm.gp_return, 20))
        code.add(x86.je(out_lbl))

        for i in xrange(0, 24):
            code.add(x86.add(r_foo, MemRef(rsp, 4)))

        code.add(x86.cmp(prgm.gp_return, 32))
        code.add(x86.jne(loop_lbl))

    code.add(out_lbl)

    code.add(x86.jmp(skip_lbl))
    for i in xrange(0, 2):
        code.add(x86.add(r_foo, MemRef(rsp, 4)))
    code.add(skip_lbl)

    prgm.release_register(r_foo)
    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 20)

    prgm.reset()
    code.reset()

    r_tmp = prgm.acquire_registers(2)

    loop_lbl = prgm.get_label("LOOP")
    else_lbl = prgm.get_label("ELSE")
    finish_lbl = prgm.get_label("finish")

    code.add(x86.mov(prgm.gp_return, 0))
    code.add(x86.mov(r_tmp[0], 0))

    code.add(loop_lbl)

    code.add(x86.add(prgm.gp_return, 1))
    code.add(x86.cmp(prgm.gp_return, 16))
    code.add(x86.jge(finish_lbl))

    code.add(x86.add(r_tmp[0], prgm.gp_return))
    code.add(x86.mov(r_tmp[1], r_tmp[0]))
    code.add(x86.and_(r_tmp[1], 0x1))
    code.add(x86.jnz(else_lbl))

    code.add(x86.add(r_tmp[0], 1))
    code.add(x86.jmp(loop_lbl))

    code.add(else_lbl)
    code.add(x86.add(r_tmp[0], r_tmp[1]))
    code.add(x86.jmp(loop_lbl))

    code.add(finish_lbl)
    code.add(x86.mov(prgm.gp_return, r_tmp[0]))

    prgm.release_registers(r_tmp)

    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 135)

    prgm.reset()
    code.reset()

    loop_lbl = prgm.get_label("LOOP")

    r_one = prgm.acquire_register()
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.xor(rcx, rcx))
    code.add(x86.mov(r_one, 1))

    code.add(loop_lbl)
    code.add(x86.inc(prgm.gp_return))
    code.add(x86.cmp(prgm.gp_return, 7))
    code.add(x86.cmove(rcx, r_one))
    code.add(x86.jrcxz(loop_lbl))

    prgm.release_register(r_one)

    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 7)

    prgm.reset()
    code.reset()

    r_tmp = prgm.acquire_register()
    code.add(x86.mov(prgm.gp_return, rdi))
    code.add(x86.xor(r_tmp, r_tmp))
    code.add(x86.mov(r_tmp, -1))
    code.add(x86.mov(cl, 1))
    code.add(x86.shld(prgm.gp_return, r_tmp, cl))

    prgm.release_register(r_tmp)
    prgm.add(code)
    ret = proc.execute(prgm, params=params, mode='int')
    print "ret", ret
    assert (ret == 7)

    prgm.reset()
    code.reset()

    code.add(x86.add(eax, 200))
    code.add(x86.xor(eax, eax))
    code.add(x86.add(al, 32))
    code.add(x86.add(bl, 32))
    code.add(x86.xor(bl, bl))
    code.add(x86.mov(dil, al))
    code.add(x86.add(rdi, 0))
    code.add(x86.mov(eax, edi))
    code.add(x86.mov(al, dil))

    code.add(x86.imul(ax, ax, 4))
    code.add(x86.imul(eax, ebx, 10))
    code.add(x86.mov(cx, 1232))
    code.add(x86.sub(ax, cx))
    code.add(x86.xor(eax, eax))
    code.add(x86.mov(eax, ebx))
    code.add(x86.clc())
    code.add(x86.rcl(eax, 1))
    code.add(x86.rcr(eax, 1))

    prgm.add(code)
    #ret = proc.execute(prgm, debug = True, params = params)
    id1 = proc.execute(prgm, params=params, mode='int', async=True)
    id2 = proc.execute(prgm, params=params, mode='int', async=True)
    ret = proc.execute(prgm, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 1280)
    ret = proc.join(id1)
    print "Return thread 1: %d" % (ret)
    assert (ret == 1280)
    ret = proc.join(id2)
    print "Return thread 2: %d" % (ret)
    assert (ret == 1280)

    prgm.reset()
    code.reset()

    code.add(x86.fldpi())
    code.add(x86.pxor(xmm0, xmm0))
    code.add(x86.fld1())
    code.add(x86.fadd(st0, st0))
    code.add(x86.fmulp())
    code.add(x86.fsin())
    code.add(x86.fcos())
    code.add(x86.fld1())
    code.add(x86.fyl2xp1())

    # x86_64 now uses xmm0 to return floats, not st0.  So here, just make room
    # on the stack, convert the FP result to an int and store it on the stack,
    # then pop it into rax, the int return register.
    code.add(x86.push(prgm.gp_return))
    code.add(x86.fistp(MemRef(rsp)))
    code.add(x86.pop(prgm.gp_return))

    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params, mode='int')
    assert (ret == 1)
    print "Return main thread: %d" % (ret)

    prgm.reset()
    code.reset()

    lbl_ok = prgm.get_label("OK")
    code.add(x86.emms())
    code.add(x86.movd(xmm0, edi))
    code.add(x86.mov(ebx, edi))

    code.add(x86.cmp(ebx, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.movd(eax, xmm0))
    code.add(x86.cmp(eax, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.ud2())

    code.add(lbl_ok)
    code.add(x86.xor(eax, eax))
    code.add(x86.movd(xmm1, ebx))
    code.add(x86.paddq(xmm0, xmm1))
    code.add(x86.pextrw(ecx, xmm0, 0))
    code.add(x86.pxor(mm1, mm1))
    code.add(x86.pinsrw(mm1, ecx, 0))
    code.add(x86.movq2dq(xmm0, mm1))
    code.add(x86.movdq2q(mm2, xmm0))
    code.add(x86.movd(edx, mm2))
    code.add(x86.movd(xmm5, edx))
    code.add(x86.movd(ecx, xmm5))
    code.add(x86.pxor(xmm6, xmm6))
    code.add(x86.pinsrw(xmm6, ecx, 0))
    code.add(x86.movd(eax, xmm6))

    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 6)

    prgm.reset()
    code.reset()

    code.add(x86.mov(edx, 0x1234))
    code.add(x86.mov(eax, 0xFFFF))
    code.add(x86.xchg(edx, eax))

    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params)
    print "ret:", ret
    assert (ret == 0x1234)

    prgm.reset()
    code.reset()

    code.add(x86.mov(prgm.gp_return, rsp))
    code.add(x86.pushfq())
    code.add(x86.sub(prgm.gp_return, rsp))
    code.add(x86.add(rsp, prgm.gp_return))

    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params)
    print "ret:", ret
    assert (ret == 8)

    prgm.reset()
    code.reset()

    data = extarray.extarray('H', xrange(0, 16))

    r_128 = prgm.acquire_register(reg_type=XMMRegister)
    regs = prgm.acquire_registers(4)

    code.add(x86.mov(regs[0], data.buffer_info()[0]))
    code.add(x86.movaps(r_128, MemRef(regs[0], data_size=128)))
    code.add(x86.pextrw(prgm.gp_return, r_128, 0))
    code.add(x86.pextrw(regs[1], r_128, 1))
    code.add(x86.pextrw(regs[2], r_128, 2))
    code.add(x86.pextrw(regs[3], r_128, 3))
    code.add(x86.shl(regs[1], 16))
    code.add(x86.shl(regs[2], 32))
    code.add(x86.shl(regs[3], 48))
    code.add(x86.or_(prgm.gp_return, regs[1]))
    code.add(x86.or_(prgm.gp_return, regs[2]))
    code.add(x86.or_(prgm.gp_return, regs[3]))

    prgm.release_register(r_128)
    prgm.release_registers(regs)

    prgm.add(code)
    prgm.print_code()
    ret = proc.execute(prgm, mode='int')
    print "ret %x" % ret
    assert (ret == 0x0003000200010000)

    prgm.reset()
    code.reset()

    util.load_float(code, xmm0, 3.14159)

    prgm.add(code)
    ret = proc.execute(prgm, mode='fp')
    print "ret", ret
    assert (ret - 3.14159 < 0.00001)

    return
Пример #47
0
 def __init__(self):
     """
     Allocate the context's working arrays: a 4-element 'I' state array,
     a 2-element 'I' count array and a 64-element 'B' buffer.
     """
     new_array = extarray.extarray
     self.state = new_array('I', 4)
     self.count = new_array('I', 2)
     self.buffer = new_array('B', 64)
Пример #48
0
def TestSimpleKernel():
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs
  import time

  SIZE = 128

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

#  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
#  r1 = regs.ptxVariable('reg', 'f32', 'f1')
#  r2 = regs.ptxVariable('reg', 'f32', 'f2')
#  r3 = regs.ptxVariable('reg', 'f32', 'f3')
#  r4 = regs.ptxVariable('reg', 'f32', 'f4')
#  code.add('  .reg .u64 rd1;')
#  code.add('  .reg .f32 f1;')
#  code.add('  .reg .f32 f2;')
#  code.add('  .reg .f32 f3;')
#  code.add('  .reg .f32 f4;')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

#  import pdb
#  pdb.set_trace()
  #code.add(isa.add(r3, r2, r1))
  #code.add('add.f32 r3, r2, r1;')
  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  #temp = prgm.acquire_register('u32')
  #code.add(isa.cvt(temp, regs.tid.x))
  #code.add(isa.cvt(r4, temp, rnd='rn'))
  temp1 = prgm.acquire_register('u32')
  temp2 = prgm.acquire_register('u32')
  temp3 = prgm.acquire_register('u32')
  code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
  
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()
#   prgm.render_string = (
#   '''
#   .version 1.4
#   .target sm_10, map_f64_to_f32
#   .entry _main (
#   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
#   {
#   .reg .u64 %rd<3>;
#   .reg .f32 %f<6>;
#   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
#   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
#   add.f32 %f3, %f1, %f2;
  
#   mov.f32         %f4, %f3;
#   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
#   st.global.f32   [%rd1+0], %f4;

#   exit;
#   } // _Z16addArrayOnDevicePfff
#   '''
#   )
#   prgm.render_code = ptx_exec.compile(prgm.render_string)

  ####
  #ptx_mem_addr = proc.alloc_device('f32', 1)
  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  a = 1.0
  b = 2.0
  
  print mem.buffer_info()[0]
  param_list = [ptx_mem_addr, a, b]
  print map(type, param_list)
  #   # image, dev num, (x, y, w, h)

  #import pdb

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  #kernel = prgm.render_string
  #module = ptx_exec.compile(kernel)
  t1 = time.time()
  #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  proc.execute(prgm, (1,1,1,1,1), param_list)
  t2 = time.time()
#  pdb.set_trace()
  print "run time", t2 - t1

  print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
  print int(ptx_mem_addr)
  print int(mem.buffer_info()[0])
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem
  ####

  return
Пример #49
0
    def generate(self,
                 results,
                 patterns,
                 r1_range,
                 r2_range,
                 max_init,
                 max_n,
                 size,
                 n_spus=6):
        # Connect to the framebuffer
        #fb = cell_fb.framebuffer()
        #cell_fb.fb_open(fb)
        buffer = extarray.extarray('B', size[0] * size[1] * 4)
        buffer.clear()

        # Setup the range parameter array
        r1_inc = (r1_range[1] - r1_range[0]) / size[0]
        r2_inc = (r2_range[1] - r2_range[0]) / size[1]

        ranges = [0 for i in range(n_spus)]
        #a_ranges = [0 for i in range(n_spus)]

        # Slice and dice for parallel execution
        spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)]
        spu_slices[-1][1] += size[1] % n_spus

        offset = 0.0
        for ispu in range(n_spus):
            ranges[ispu] = extarray.extarray('f', [0.0] * 16)

            for i in range(4):
                ranges[ispu][
                    i] = r1_range[0] + float(i) * r1_inc  # horizontal is simd
                ranges[ispu][4 + i] = r2_range[0] + offset
                ranges[ispu][8 + i] = r1_inc * 4.0
                ranges[ispu][12 + i] = r2_inc
            # print ranges

            # Copy the paramters to aligned buffers
            #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I')
            #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu]))

            offset += r2_inc * spu_slices[ispu][1]

        # Setup the pattern vector
        for pattern in patterns:
            if len(pattern) != len(patterns[0]):
                raise Exception('All patterns must be the same length')

        bits = [_pattern2vector(pattern) for pattern in patterns]
        #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I')
        pattern = extarray.extarray('I', len(bits[0]))

        # Create the instruction streams
        codes = []

        n = len(patterns) * 10
        offset = 0
        for ispu in range(n_spus):
            renderer = FBRenderer()
            renderer.set_lsa(0x100)
            #renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset)
            renderer.set_addr(buffer.buffer_info()[0] + offset)
            renderer.set_width(size[0])
            #renderer.set_stride(fb.stride)
            renderer.set_stride(size[0])

            ly_block = LyapunovBlock()

            ly_block.set_size(*spu_slices[i])
            #ly_block.set_range(a_ranges[ispu])
            ly_block.set_range(ranges[ispu])
            #ly_block.set_pattern(a_pattern)
            ly_block.set_pattern(pattern)
            ly_block.set_max_init(max_init)
            ly_block.set_max_n(max_n)
            ly_block.set_renderer(renderer)

            code = synspu.InstructionStream()
            # code.set_debug(True)
            codes.append(code)
            #offset += spu_slices[i][1] * fb.stride * 4
            offset += spu_slices[i][1] * size[0] * 4

            # for i in spuiter.syn_range(code, n):
            ly_block.synthesize(code)

        # code.print_code()
        proc = synspu.Processor()

        #cell_fb.fb_clear(fb, 0)
        buffer.clear()

        import time
        ids = [0 for i in range(n_spus)]
        start = time.time()

        ipattern = 0
        n_patterns = len(patterns)
        len_bits = len(bits[0])
        pattern_inc = 1

        for i in range(n):
            #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits)
            # TODO - better/faster
            for j in xrange(0, len_bits):
                pattern[j] = bits[ipattern][j]

            for ispu in range(n_spus):
                ids[ispu] = proc.execute(codes[ispu], async=True)

            for ispu in range(n_spus):
                proc.join(ids[ispu])

            #cell_fb.fb_wait_vsync(fb)
            #cell_fb.fb_flip(fb, 0)
            # TODO - write buffer to image file
            #im = Image.frombuffer("RGBA", size, buffer.tostring(), "raw", "RGBA", 0, 1)
            imgbuf = Image.new("RGBA", size)

            arr = [(buffer[i + 3], buffer[i + 2], buffer[i + 1], 0xFF)
                   for i in xrange(0, len(buffer), 4)]
            imgbuf.putdata(arr)
            imgbuf.save("lyapunov_%d.png" % ipattern)

            ipattern += pattern_inc
            if (ipattern == (n_patterns - 1)) or (ipattern == 0):
                pattern_inc *= -1

            print ipattern

        stop = time.time()

        print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start))
        #cell_fb.fb_close(fb)

        return
Пример #50
0
def load_float(code, reg, val):
    """
    Load the float val into reg by way of load_word.

    The value is placed in a one-element 'f' extarray, the array is retyped
    to 'I', and its first element is passed to load_word together with code
    and reg.  Returns whatever load_word returns.
    """
    scratch = extarray.extarray('f', (val, ))
    scratch.change_type('I')
    word = scratch[0]
    return load_word(code, reg, word)