Пример #1
0
def TestParams():
    import time

    # SIZE = 1024
    kernel = """
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  """

    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    a = 1.0
    b = 2.0

    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray("f", 1)
    # mem.set_memory(ptx_mem_addr, 4)
    mem[0] = 5.0

    print ptx_mem_addr, type(ptx_mem_addr)
    print mem.buffer_info()[0], type(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    # image, dev num, (x, y, w, h)

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    t2 = time.time()
    print "run time", t2 - t1
    print "X", mem.buffer_info()[0], ptx_mem_addr
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem

    # ptx_exec.free(input)
    # ptx_exec.free(output)
    ##ptx_exec.free(glob)
    # ptx_exec.unload_module(image)
    return
Пример #2
0
def TestCompileExec():
    import time

    # SIZE = 1024
    kernel = (
        "\t.version 1.4\n"
        + "\t.target sm_10, map_f64_to_f32\n"
        + "\t.entry _main () {\n"
        + "\t\tret;\n"
        + "\t\texit;\n"
        + "\t}\n"
        + "\n"
    )
    print kernel

    # ctx = ptx_exec.alloc_ctx(0)
    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    # input = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
    # output = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
    ##glob = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, 4096, 4096, ptx_exec.GLOBAL_BUFFER)
    # print "input", input
    # print "output", output

    # remote = {"o0": output, "i0": input}
    # local = {"o1": (SIZE, SIZE, ptx_exec.FMT_FLOAT32_4),
    #         "g[]": (4096, 4096, ptx_exec.FMT_FLOAT32_4)}
    # domain = (0, 0, SIZE, SIZE)
    # print "remote bindings", remote
    # print "local bindings", local

    print "Executing..."
    # image, dev num, (x, y, w, h)
    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    t2 = time.time()
    print "run time", t2 - t1

    # ptx_exec.free_ctx(ctx)

    return
Пример #3
0
def TestCompileExec():
  import time
  #SIZE = 1024
  kernel = ("\t.version 1.4\n" +
            "\t.target sm_10, map_f64_to_f32\n" +
            "\t.entry _main () {\n" +
            "\t\tret;\n" + 
            "\t\texit;\n" + 
            "\t}\n" +
            "\n")
  print kernel

  #ctx = ptx_exec.alloc_ctx(0)
  t1 = time.time()
  module = ptx_exec.compile(kernel)
  t2 = time.time()
  print "compile time", t2 - t1

  #input = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
  #output = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
  ##glob = ptx_exec.alloc_remote(ptx_exec.FMT_FLOAT32_4, 4096, 4096, ptx_exec.GLOBAL_BUFFER)
  #print "input", input
  #print "output", output

  #remote = {"o0": output, "i0": input}
  #local = {"o1": (SIZE, SIZE, ptx_exec.FMT_FLOAT32_4),
  #         "g[]": (4096, 4096, ptx_exec.FMT_FLOAT32_4)}
  #domain = (0, 0, SIZE, SIZE)
  #print "remote bindings", remote
  #print "local bindings", local

  print "Executing..."
  # image, dev num, (x, y, w, h)
  t1 = time.time()
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
  t2 = time.time()
  print "run time", t2 - t1

  #ptx_exec.free_ctx(ctx)
  
  return
Пример #4
0
def TestParams():
  import time
  #SIZE = 1024
  kernel = (
  '''
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  '''
  )

  t1 = time.time()
  module = ptx_exec.compile(kernel)
  t2 = time.time()
  print "compile time", t2 - t1

  a = 1.0
  b = 2.0

  ptx_mem_addr = ptx_exec.alloc_device(4)
  mem = extarray.extarray('f', 1)
  #mem.set_memory(ptx_mem_addr, 4)
  mem[0] = 5.0

  print ptx_mem_addr, type(ptx_mem_addr)
  print mem.buffer_info()[0], type(mem.buffer_info()[0])
  param_list = [ptx_mem_addr, a, b]
  # image, dev num, (x, y, w, h)

  ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
  t1 = time.time()
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
  t2 = time.time()
  print "run time", t2 - t1
  print "X", mem.buffer_info()[0], ptx_mem_addr
  ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

  print param_list
  print mem

  #ptx_exec.free(input)
  #ptx_exec.free(output)
  ##ptx_exec.free(glob)
  #ptx_exec.unload_module(image)
  return
Пример #5
0
        #for i, param in enumerate(params):  
        #  pt = prgrm.param_types[i]
        #  #  if !isinstance(param, prgm.params[i]):
        #  #    raise Exception("Invalid parameter type at parameter " + str(i))
        pass

      param_list = list(params)

      # Replace DeviceMemory parameters with their actual address
      # TODO - any other swaps than need to be done?
      for i in xrange(0, len(param_list)):
        if isinstance(param_list[i], DeviceMemory):
            param_list[i] = param_list[i].address

      type_num_tuple = tuple(map(ptx_exec.__dict__.__getitem__, prgm.param_types))
      ptx_exec.run_stream(prgm.render_code, threads, type_num_tuple, param_list)
      # ptx_exec.run_stream(prgm.render_code, self.ctx, threads, tuple(prgm.param_types), param_list)

      return
    
  def join(self, hdl):
    """Join a previously started execution identified by hdl.

    hdl: the handle to join. A two-element handle is unpacked as
      (stream handle, program): the stream is joined via
      ptx_exec.join_stream, then every extarray in the program's
      _remote_bindings_data has set_memory called with the second element
      of its binding and its size in bytes (data_len * itemsize).

    Handles of any other length are ignored by the code shown here.
    """
    # TODO - do something better to differentiate
    if len(hdl) == 2:
      # Join a kernel execution
      (th, prgm) = hdl
      ptx_exec.join_stream(th)

      # Iterate key/array pairs: the key is required to look up the matching
      # binding. Iterating values() alone left `key` undefined, and the
      # set_memory call read a misspelled `bindings`, so this path raised
      # NameError.
      for key, arr in prgm._remote_bindings_data.items():
        binding = prgm._bindings[key]
        if isinstance(arr, extarray.extarray):
          # NOTE(review): binding[1] is presumably the memory address for
          # this binding -- confirm against where _bindings is populated.
          arr.set_memory(binding[1], arr.data_len * arr.itemsize)
Пример #6
0
                # for i, param in enumerate(params):
                #  pt = prgrm.param_types[i]
                #  #  if !isinstance(param, prgm.params[i]):
                #  #    raise Exception("Invalid parameter type at parameter " + str(i))
                pass

            param_list = list(params)

            # Replace DeviceMemory parameters with their actual address
            # TODO - any other swaps than need to be done?
            for i in xrange(0, len(param_list)):
                if isinstance(param_list[i], DeviceMemory):
                    param_list[i] = param_list[i].address

            type_num_tuple = tuple(map(ptx_exec.__dict__.__getitem__, prgm.param_types))
            ptx_exec.run_stream(prgm.render_code, threads, type_num_tuple, param_list)
            # ptx_exec.run_stream(prgm.render_code, self.ctx, threads, tuple(prgm.param_types), param_list)

            return

    def join(self, hdl):
        """Join a previously started execution identified by hdl.

        hdl: the handle to join. A two-element handle is unpacked as
          (stream handle, program): the stream is joined via
          ptx_exec.join_stream, then every extarray in the program's
          _remote_bindings_data has set_memory called with the second
          element of its binding and its size in bytes
          (data_len * itemsize).

        Handles of any other length are ignored by the code shown here.
        """
        # TODO - do something better to differentiate
        if len(hdl) == 2:
            # Join a kernel execution
            (th, prgm) = hdl
            ptx_exec.join_stream(th)

            # Iterate key/array pairs: the key is required to look up the
            # matching binding. Iterating values() alone left `key`
            # undefined, and the set_memory call read a misspelled
            # `bindings`, so this path raised NameError.
            for key, arr in prgm._remote_bindings_data.items():
                binding = prgm._bindings[key]
                if isinstance(arr, extarray.extarray):
                    # NOTE(review): binding[1] is presumably the memory
                    # address for this binding -- confirm against where
                    # _bindings is populated.
                    arr.set_memory(binding[1], arr.data_len * arr.itemsize)