예제 #1
0
  def store(self, addr, offset = 0, space='global'):
    if isinstance(offset, (int, long)):
      self.code.add(ptx.st(space, regs.ptxAddress(addr, offset), self))
    else:

      temp = self.code.prgm.acquire_register('u64')
      self.code.add(ptx.add(temp, addr, offset))
      self.code.add(ptx.st(space, regs.ptxAddress(temp), self))
      self.code.prgm.release_register(temp)
예제 #2
0
def TestParamsFull():
  import time
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()

  a = 1.0
  b = 2.0

  ptx_mem_addr = proc.alloc_device('f32', 1)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  param_list = [ptx_mem_addr.address, a, b]

  proc.copy(ptx_mem_addr, mem)
  prgm.cache_code()
  for i in range(20):
    t1 = time.time()
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "#####"
  print "X", mem.buffer_info()[0], ptx_mem_addr.address
  proc.copy(mem, ptx_mem_addr)

  print param_list
  print mem

  return
예제 #3
0
def TestParamsFull():
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()

    a = 1.0
    b = 2.0

    ptx_mem_addr = proc.alloc_device("f32", 1)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    param_list = [ptx_mem_addr.address, a, b]

    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    for i in range(20):
        t1 = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t2 = time.time()
        print "run time", t2 - t1
        print "#####"
    print "X", mem.buffer_info()[0], ptx_mem_addr.address
    proc.copy(mem, ptx_mem_addr)

    print param_list
    print mem

    return
예제 #4
0
def TestSynIterDec():
    import corepy.arch.ptx.isa as ptx
    import corepy.arch.ptx.types.registers as regs

    SIZE = 64

    proc = env.Processor(0)

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter('u64', name='_mem')

    memp = prgm.acquire_register('u64')
    counter = prgm.acquire_register('u32')
    code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
    code.add(ptx.mov(counter, 0))
    for i in syn_iter(code, 5, step=1, mode=DEC):
        code.add(ptx.add(counter, counter, 1))
    code.add(ptx.st('global', regs.ptxAddress(memp), counter))
    prgm.add(code)

    ptx_mem_addr = proc.alloc_device('u32', 1)
    mem = extarray.extarray('I', 1)
    mem[0] = 5

    param_list = [
        ptx_mem_addr.address,
    ]

    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    print prgm.render_string
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    proc.copy(mem, ptx_mem_addr)

    print mem

    #passed = True
    #for i in xrange(0, SIZE):
    #  if ext_output[i] != 5:
    #    passed = False
    #print "Passed == ", passed

    return
예제 #5
0
def TestSynIterDec():
  import corepy.arch.ptx.isa as ptx
  import corepy.arch.ptx.types.registers as regs

  SIZE = 64

  proc = env.Processor(0)

  # build and run the kernel
  prgm = env.Program()
  code = prgm.get_stream()

  _mem = prgm.add_parameter('u64', name='_mem')

  memp = prgm.acquire_register('u64')
  counter = prgm.acquire_register('u32')
  code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
  code.add(ptx.mov(counter, 0))
  for i in syn_iter(code, 5, step=1, mode=DEC):
    code.add(ptx.add(counter, counter, 1))
  code.add(ptx.st('global', regs.ptxAddress(memp), counter))
  prgm.add(code)

  ptx_mem_addr = proc.alloc_device('u32', 1)
  mem = extarray.extarray('I', 1)
  mem[0] = 5

  param_list = [ptx_mem_addr.address,]

  proc.copy(ptx_mem_addr, mem)
  prgm.cache_code()
  print prgm.render_string
  proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
  proc.copy(mem, ptx_mem_addr)

  print mem

  #passed = True
  #for i in xrange(0, SIZE):
  #  if ext_output[i] != 5:
  #    passed = False
  #print "Passed == ", passed

  return
예제 #6
0
def TestSimpleKernel():
  import corepy.arch.ptx.isa as isa
  import corepy.arch.ptx.types.registers as regs
  import time

  SIZE = 128

  proc = Processor(0)

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  _mem = prgm.add_parameter('u64', name='_mem')
  _a = prgm.add_parameter('f32', name='_a')
  _b = prgm.add_parameter('f32', name='_b')

#  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
#  r1 = regs.ptxVariable('reg', 'f32', 'f1')
#  r2 = regs.ptxVariable('reg', 'f32', 'f2')
#  r3 = regs.ptxVariable('reg', 'f32', 'f3')
#  r4 = regs.ptxVariable('reg', 'f32', 'f4')
#  code.add('  .reg .u64 rd1;')
#  code.add('  .reg .f32 f1;')
#  code.add('  .reg .f32 f2;')
#  code.add('  .reg .f32 f3;')
#  code.add('  .reg .f32 f4;')

  rd1 = prgm.acquire_register('u64')
  r1 = prgm.acquire_register('f32')
  r2 = prgm.acquire_register('f32')
  r3 = prgm.acquire_register('f32')
  r4 = prgm.acquire_register('f32')    
  v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable

#  import pdb
#  pdb.set_trace()
  #code.add(isa.add(r3, r2, r1))
  #code.add('add.f32 r3, r2, r1;')
  code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
  code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
  code.add(isa.add(r3, r2, r1))
  code.add(isa.add(r3, r3, 1.0))
  code.add(isa.mov(r4, r3))
  #temp = prgm.acquire_register('u32')
  #code.add(isa.cvt(temp, regs.tid.x))
  #code.add(isa.cvt(r4, temp, rnd='rn'))
  temp1 = prgm.acquire_register('u32')
  temp2 = prgm.acquire_register('u32')
  temp3 = prgm.acquire_register('u32')
  code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
  
  code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
  code.add(isa.st('global', regs.ptxAddress(rd1), r4))
  prgm.add(code)

  prgm.cache_code()
#   prgm.render_string = (
#   '''
#   .version 1.4
#   .target sm_10, map_f64_to_f32
#   .entry _main (
#   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
#   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
#   {
#   .reg .u64 %rd<3>;
#   .reg .f32 %f<6>;
#   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
#   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
#   add.f32 %f3, %f1, %f2;
  
#   mov.f32         %f4, %f3;
#   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
#   st.global.f32   [%rd1+0], %f4;

#   exit;
#   } // _Z16addArrayOnDevicePfff
#   '''
#   )
#   prgm.render_code = ptx_exec.compile(prgm.render_string)

  ####
  #ptx_mem_addr = proc.alloc_device('f32', 1)
  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0

  a = 1.0
  b = 2.0
  
  print mem.buffer_info()[0]
  param_list = [ptx_mem_addr, a, b]
  print map(type, param_list)
  #   # image, dev num, (x, y, w, h)

  #import pdb

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  #kernel = prgm.render_string
  #module = ptx_exec.compile(kernel)
  t1 = time.time()
  #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  proc.execute(prgm, (1,1,1,1,1), param_list)
  t2 = time.time()
#  pdb.set_trace()
  print "run time", t2 - t1

  print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
  print int(ptx_mem_addr)
  print int(mem.buffer_info()[0])
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem
  ####

  return
예제 #7
0
def TestSimpleKernel():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    #  rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    #  r1 = regs.ptxVariable('reg', 'f32', 'f1')
    #  r2 = regs.ptxVariable('reg', 'f32', 'f2')
    #  r3 = regs.ptxVariable('reg', 'f32', 'f3')
    #  r4 = regs.ptxVariable('reg', 'f32', 'f4')
    #  code.add('  .reg .u64 rd1;')
    #  code.add('  .reg .f32 f1;')
    #  code.add('  .reg .f32 f2;')
    #  code.add('  .reg .f32 f3;')
    #  code.add('  .reg .f32 f4;')

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    #  import pdb
    #  pdb.set_trace()
    # code.add(isa.add(r3, r2, r1))
    # code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    # temp = prgm.acquire_register('u32')
    # code.add(isa.cvt(temp, regs.tid.x))
    # code.add(isa.cvt(r4, temp, rnd='rn'))
    temp1 = prgm.acquire_register("u32")
    temp2 = prgm.acquire_register("u32")
    temp3 = prgm.acquire_register("u32")
    code.add(isa.mul(temp2, temp1, temp3, hlw="lo"))

    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))
    prgm.add(code)

    prgm.cache_code()
    #   prgm.render_string = (
    #   '''
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #   .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #   .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #   .reg .u64 %rd<3>;
    #   .reg .f32 %f<6>;
    #   ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #   ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #   add.f32 %f3, %f1, %f2;

    #   mov.f32         %f4, %f3;
    #   ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #   st.global.f32   [%rd1+0], %f4;

    #   exit;
    #   } // _Z16addArrayOnDevicePfff
    #   '''
    #   )
    #   prgm.render_code = ptx_exec.compile(prgm.render_string)

    ####
    # ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)
    #   # image, dev num, (x, y, w, h)

    # import pdb

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    # kernel = prgm.render_string
    # module = ptx_exec.compile(kernel)
    t1 = time.time()
    # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()
    #  pdb.set_trace()
    print "run time", t2 - t1

    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem
    ####

    return