Exemplo n.º 1
0
def TestParams():
    """Compile a three-parameter PTX kernel, launch it once, and print results.

    The kernel text loads two ``.f32`` parameters, adds them, and stores the
    sum through the address passed as the ``.u64`` parameter.  This function
    requests a 4-byte buffer from ``ptx_exec.alloc_device``, prepares a
    one-element host ``extarray`` holding 5.0, pushes it with
    ``ptx_exec.copy_htod``, launches the kernel with ``a = 1.0`` and
    ``b = 2.0``, pulls the 4 bytes back with ``ptx_exec.copy_dtoh`` and prints
    the host array together with compile and run timings.

    Every ``print`` takes a single pre-formatted argument so the emitted text
    is the same under the Python 2 print statement and the Python 3 function.
    """
    import time

    kernel = """
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  """

    started = time.time()
    module = ptx_exec.compile(kernel)
    elapsed = time.time() - started
    print("compile time %s" % (elapsed,))

    a = 1.0
    b = 2.0
    nbytes = 4  # size of the single 32-bit float the kernel writes

    ptx_mem_addr = ptx_exec.alloc_device(nbytes)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0
    # The array is never resized, so its buffer address is read once and reused.
    host_addr = mem.buffer_info()[0]

    print("%s %s" % (ptx_mem_addr, type(ptx_mem_addr)))
    print("%s %s" % (host_addr, type(host_addr)))

    # Parameter order matches the kernel's .param declarations: c, a, b.
    param_types = (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32)
    param_list = [ptx_mem_addr, a, b]

    ptx_exec.copy_htod(ptx_mem_addr, host_addr, nbytes)
    started = time.time()
    # NOTE(review): the 5-tuple is presumably the launch configuration;
    # confirm its field meanings against ptx_exec.run_stream.
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), param_types, param_list)
    elapsed = time.time() - started
    print("run time %s" % (elapsed,))
    print("X %s %s" % (host_addr, ptx_mem_addr))
    ptx_exec.copy_dtoh(host_addr, ptx_mem_addr, nbytes)

    print(param_list)
    print(mem)

    # NOTE(review): the device buffer and compiled module are not released
    # here; add the matching ptx_exec cleanup calls once their names are
    # confirmed.
Exemplo n.º 2
0
  def cache_code(self):
    """Build the full code string for this object and compile it, once.

    Returns immediately when ``self._cached`` is already set.  Otherwise it
    calls ``_synthesize_prologue`` and ``_synthesize_epilogue``, threads an
    initially empty string through ``_cache_code_S`` once per entry of
    ``self.objects`` (passing that entry's ``objects``), stores
    prologue + body + epilogue in ``self.render_string``, hands that string
    to ``ptx_exec.compile`` and keeps the result in ``self.render_code``.

    ``self._cached`` is set only after the compile call returns, so a
    compile that raises leaves the object uncached and the next call retries.
    """
    # Truthiness test rather than ``== True`` (PEP 8).
    if self._cached:
      return

    self._synthesize_prologue()
    self._synthesize_epilogue()

    # Each call receives the text accumulated so far and returns the
    # extended text.
    body = ''
    for stream in self.objects:
      body = self._cache_code_S(body, stream.objects)

    self.render_string = self._prologue + body + self._epilogue
    self.render_code = ptx_exec.compile(self.render_string)
    self._cached = True
Exemplo n.º 3
0
    def cache_code(self):
        """Build the full code string for this object and compile it, once.

        Returns immediately when ``self._cached`` is already set.  Otherwise
        it calls ``_synthesize_prologue`` and ``_synthesize_epilogue``,
        threads an initially empty string through ``_cache_code_S`` once per
        entry of ``self.objects`` (passing that entry's ``objects``), stores
        prologue + body + epilogue in ``self.render_string``, hands that
        string to ``ptx_exec.compile`` and keeps the result in
        ``self.render_code``.

        ``self._cached`` is set only after the compile call returns, so a
        compile that raises leaves the object uncached and the next call
        retries.
        """
        # Truthiness test rather than ``== True`` (PEP 8).
        if self._cached:
            return

        self._synthesize_prologue()
        self._synthesize_epilogue()

        # Each call receives the text accumulated so far and returns the
        # extended text.
        body = ""
        for stream in self.objects:
            body = self._cache_code_S(body, stream.objects)

        self.render_string = self._prologue + body + self._epilogue
        self.render_code = ptx_exec.compile(self.render_string)
        self._cached = True
Exemplo n.º 4
0
def TestCompileExec():
    """Compile a minimal, parameterless PTX kernel and launch it once.

    The kernel's ``_main`` entry contains only ``ret`` and ``exit``.  The
    function prints the kernel source, the wall-clock time spent in
    ``ptx_exec.compile``, an "Executing..." marker, and the wall-clock time
    spent in ``ptx_exec.run_stream``.

    Every ``print`` takes a single pre-formatted argument so the emitted text
    is the same under the Python 2 print statement and the Python 3 function.
    """
    import time

    # Adjacent literals are joined at compile time; the resulting string is
    # identical to the former "+" chain.
    kernel = (
        "\t.version 1.4\n"
        "\t.target sm_10, map_f64_to_f32\n"
        "\t.entry _main () {\n"
        "\t\tret;\n"
        "\t\texit;\n"
        "\t}\n"
        "\n"
    )
    print(kernel)

    started = time.time()
    module = ptx_exec.compile(kernel)
    elapsed = time.time() - started
    print("compile time %s" % (elapsed,))

    print("Executing...")
    started = time.time()
    # No kernel parameters: empty type tuple and empty argument list.
    # NOTE(review): the 5-tuple is presumably the launch configuration;
    # confirm its field meanings against ptx_exec.run_stream.
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
    elapsed = time.time() - started
    print("run time %s" % (elapsed,))
Exemplo n.º 5
0
def TestCompileExec():
  """Compile a minimal, parameterless PTX kernel and launch it once.

  The kernel's ``_main`` entry contains only ``ret`` and ``exit``.  The
  function prints the kernel source, the wall-clock time spent in
  ``ptx_exec.compile``, an "Executing..." marker, and the wall-clock time
  spent in ``ptx_exec.run_stream``.

  Every ``print`` takes a single pre-formatted argument so the emitted text
  is the same under the Python 2 print statement and the Python 3 function.
  """
  import time

  # Adjacent literals are joined at compile time; the resulting string is
  # identical to the former "+" chain.
  kernel = ("\t.version 1.4\n"
            "\t.target sm_10, map_f64_to_f32\n"
            "\t.entry _main () {\n"
            "\t\tret;\n"
            "\t\texit;\n"
            "\t}\n"
            "\n")
  print(kernel)

  started = time.time()
  module = ptx_exec.compile(kernel)
  elapsed = time.time() - started
  print("compile time %s" % (elapsed,))

  print("Executing...")
  started = time.time()
  # No kernel parameters: empty type tuple and empty argument list.
  # NOTE(review): the 5-tuple is presumably the launch configuration;
  # confirm its field meanings against ptx_exec.run_stream.
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (), [])
  elapsed = time.time() - started
  print("run time %s" % (elapsed,))
Exemplo n.º 6
0
def TestParams():
  """Compile a three-parameter PTX kernel, launch it once, and print results.

  The kernel text loads two ``.f32`` parameters, adds them, and stores the
  sum through the address passed as the ``.u64`` parameter.  This function
  requests a 4-byte buffer from ``ptx_exec.alloc_device``, prepares a
  one-element host ``extarray`` holding 5.0, pushes it with
  ``ptx_exec.copy_htod``, launches the kernel with ``a = 1.0`` and
  ``b = 2.0``, pulls the 4 bytes back with ``ptx_exec.copy_dtoh`` and prints
  the host array together with compile and run timings.

  Every ``print`` takes a single pre-formatted argument so the emitted text
  is the same under the Python 2 print statement and the Python 3 function.
  """
  import time

  kernel = '''
  .version 1.4
  .target sm_10, map_f64_to_f32
  .entry _main (
  .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
  .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
  {
  .reg .u64 %rd<3>;
  .reg .f32 %f<6>;
  ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
  ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
  add.f32 %f3, %f1, %f2;
  
  mov.f32         %f4, %f3;
  ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
  st.global.f32   [%rd1+0], %f4;

  exit;
  } // _Z16addArrayOnDevicePfff
  '''

  started = time.time()
  module = ptx_exec.compile(kernel)
  elapsed = time.time() - started
  print("compile time %s" % (elapsed,))

  a = 1.0
  b = 2.0
  nbytes = 4  # size of the single 32-bit float the kernel writes

  ptx_mem_addr = ptx_exec.alloc_device(nbytes)
  mem = extarray.extarray('f', 1)
  mem[0] = 5.0
  # The array is never resized, so its buffer address is read once and reused.
  host_addr = mem.buffer_info()[0]

  print("%s %s" % (ptx_mem_addr, type(ptx_mem_addr)))
  print("%s %s" % (host_addr, type(host_addr)))

  # Parameter order matches the kernel's .param declarations: c, a, b.
  param_types = (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32)
  param_list = [ptx_mem_addr, a, b]

  ptx_exec.copy_htod(ptx_mem_addr, host_addr, nbytes)
  started = time.time()
  # NOTE(review): the 5-tuple is presumably the launch configuration;
  # confirm its field meanings against ptx_exec.run_stream.
  ptx_exec.run_stream(module, (1, 1, 1, 1, 1), param_types, param_list)
  elapsed = time.time() - started
  print("run time %s" % (elapsed,))
  print("X %s %s" % (host_addr, ptx_mem_addr))
  ptx_exec.copy_dtoh(host_addr, ptx_mem_addr, nbytes)

  print(param_list)
  print(mem)

  # NOTE(review): the device buffer and compiled module are not released
  # here; add the matching ptx_exec cleanup calls once their names are
  # confirmed.