Пример #1
0
def test_4comp():
    proc = env.Processor(0)
    prgm = env.Program()
    code = prgm.get_stream()

    inp = proc.alloc_remote('i', 1, 4, 1)
    out = proc.alloc_remote('i', 4, 1, 1)

    for i in xrange(0, 4):
        inp[i] = i + 1
        out[i] = 0

    print "inp", inp[0:4]
    print "out", out[0:4]

    cal.set_active_code(code)

    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float,
                     UNNORM=True)  # positions

    r_cnt = prgm.acquire_register()
    r = prgm.acquire_registers(4)

    cal.mov(r_cnt, r_cnt('0000'))

    for i in xrange(0, 4):
        cal.sample(0, 0, r[i].x000, r_cnt.x)
        cal.add(r_cnt, r_cnt, r_cnt('1111'))

    cal.iadd(r[0], r[0], r[1]('0x00'))
    cal.iadd(r[0], r[0], r[2]('00x0'))
    cal.iadd(r[0], r[0], r[3]('000x'))
    cal.iadd(r[0], r[0], r[0])
    cal.mov(reg.o0, r[0])

    prgm.set_binding(reg.i0, inp)
    prgm.set_binding(reg.o0, out)

    prgm.add(code)
    prgm.print_code()

    proc.execute(prgm, (0, 0, 1, 1))

    print "inp", inp[0:4]
    print "out", out[0:4]
    for i in xrange(0, 4):
        assert (out[i] == (i + 1) * 2)
    return
Пример #2
0
def test_4comp():
  proc = env.Processor(0)
  prgm = env.Program()
  code = prgm.get_stream()

  inp = proc.alloc_remote('i', 1, 4, 1)
  out = proc.alloc_remote('i', 4, 1, 1)

  for i in xrange(0, 4):
    inp[i] = i + 1
    out[i] = 0

  print "inp", inp[0:4]
  print "out", out[0:4]
  
  cal.set_active_code(code)

  cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
  cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True) # positions

  r_cnt = prgm.acquire_register()
  r = prgm.acquire_registers(4)

  cal.mov(r_cnt, r_cnt('0000'))

  for i in xrange(0, 4):
    cal.sample(0, 0, r[i].x000, r_cnt.x)
    cal.add(r_cnt, r_cnt, r_cnt('1111'))

  cal.iadd(r[0], r[0], r[1]('0x00'))
  cal.iadd(r[0], r[0], r[2]('00x0'))
  cal.iadd(r[0], r[0], r[3]('000x'))
  cal.iadd(r[0], r[0], r[0])
  cal.mov(reg.o0, r[0])

  prgm.set_binding(reg.i0, inp)
  prgm.set_binding(reg.o0, out)

  prgm.add(code)
  prgm.print_code()

  proc.execute(prgm, (0, 0, 1, 1))

  print "inp", inp[0:4]
  print "out", out[0:4]
  for i in xrange(0, 4):
    assert(out[i] == (i + 1) * 2)
  return
Пример #3
0
def TestSimpleKernelNPy():
  import corepy.arch.cal.isa as isa

  SIZE = 128

  proc = Processor(0)

  arr_input = proc.alloc_remote_npy('f', 4, SIZE, SIZE)
  arr_output = proc.alloc_remote_npy('f', 4, SIZE, SIZE)

  #for i in xrange(0, SIZE * SIZE * 4):
  #  arr_input[i] = float(i + 1)
  #  arr_output[i] = 0.0
  #print arr_input.shape
  #print arr_output.shape
  #print type(arr_input.data)

  val = 0.0
  for i in xrange(0, SIZE):
    for j in xrange(0, SIZE):
      for k in xrange(0, 4):
        arr_input[i][j][k] = val
        arr_output[i][j][k] = 0.0
        val += 1.0

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  #code.add(isa.dcl_input('v0', USAGE=isa.usage.pos, INTERP='linear_noperspective'))
  code.add("dcl_input_position_interp(constant) v0.xy__")
  code.add(isa.dcl_output('o0', USAGE=isa.usage.generic))
  code.add(isa.dcl_resource(0, '2d', isa.fmt.float, UNNORM=True))
  code.add(isa.sample(0, 0, 'o0', 'v0.xy'))
  #code.add(isa.load(0, 'o0', 'v0.g'))

  domain = (0, 0, SIZE, SIZE)
  prgm.set_binding("o0", arr_output)
  prgm.set_binding("i0", arr_input)

  prgm.add(code)
  prgm.cache_code()
  prgm.print_code()

  proc.execute(prgm, domain)

  # Check the output
  val = 0.0
  for i in xrange(0, SIZE):
    for j in xrange(0, SIZE):
      for k in xrange(0, 4):
        if arr_output[i][j][k] != val:
          print "ERROR index %d is %f, should be %f" % (i, arr_output[i], val)
        val += 1.0

  return
Пример #4
0
def TestSimpleKernel():
  import corepy.arch.cal.isa as isa

  SIZE = 128

  proc = Processor(0)

  ext_input = proc.alloc_remote('f', 4, SIZE, SIZE)
  ext_output = proc.alloc_remote('f', 4, SIZE, SIZE)

  for i in xrange(0, SIZE * SIZE * 4):
    ext_input[i] = float(i + 1)
    ext_output[i] = 0.0

  # build and run the kernel
  prgm = Program()
  code = prgm.get_stream()  

  #code.add(isa.dcl_input('v0', USAGE=isa.usage.pos, INTERP='linear_noperspective'))
  code.add("dcl_input_position_interp(constant) v0.xy__")
  code.add(isa.dcl_output('o0', USAGE=isa.usage.generic))
  code.add(isa.dcl_resource(0, '2d', isa.fmt.float, UNNORM=True))
  code.add(isa.sample(0, 0, 'o0', 'v0.xy'))
  #code.add(isa.load(0, 'o0', 'v0.g'))

  domain = (0, 0, SIZE, SIZE)
  prgm.set_binding("o0", ext_output)
  prgm.set_binding("i0", ext_input)

  prgm.add(code)
  prgm.cache_code()
  prgm.print_code()

  proc.execute(prgm, domain)

  # Check the output
  for i in xrange(0, SIZE * SIZE * 4):
    if ext_output[i] != float(i + 1):
      print "ERROR index %d is %f, should be %f" % (i, ext_output[i], float(i + 1))

  proc.free(ext_input)
  proc.free(ext_output)
  return
Пример #5
0
# Minimal CAL example: build a kernel that samples resource 0 at v0.x into
# o0, execute it, and print the input and output buffers.
proc = env.Processor(0)

prgm = env.Program()
code = prgm.get_stream()

# Input and output buffers are allocated with identical arguments.
inp = proc.alloc_remote('f', 4, 64)
out = proc.alloc_remote('f', 4, 64)

out.clear()
# NOTE(review): only indices 0..63 of inp are written although the buffer is
# allocated as ('f', 4, 64) - confirm whether all elements were meant to be set.
for i in xrange(0, 64):
  inp[i] = float(i + 1)

# Instructions created through `cal` below are emitted into `code`.
cal.set_active_code(code)

cal.dcl_input(reg.v0.x, USAGE=cal.usage.pos)
cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM = True)
cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

# The whole kernel body: fetch from resource 0 at v0.x and write it to o0.
cal.sample(0, 0, reg.o0, reg.v0.x)

prgm.set_binding(reg.i0, inp)
prgm.set_binding(reg.o0, out)

prgm.add(code)
prgm.print_code()

# No explicit domain is passed to execute() here.
proc.execute(prgm)

print "inp", inp
print "out", out
Пример #6
0
def cal_nb_generate(n_bodies, dt):
  """Emit the 1-D n-body step kernel and return its instruction stream.

  The kernel samples the local body from resource 0 at ``v0.x``, loops a
  counter from 0 up to ``n_bodies`` sampling every body in turn, and
  accumulates ``G * (w_local * w_remote) / |d|^2`` along ``d / |d|`` where
  ``d`` is local minus remote xyz.  The accumulated value is divided by the
  local ``w`` component, integrated into the velocity sampled from
  resource 1 (written to ``o1``), and the new position is written to ``o0``.

  Args:
    n_bodies: loop bound; converted to float and loaded into a register.
    dt: time step loaded into a register.

  Returns:
    The ``env.InstructionStream`` holding the emitted kernel.
  """
  stream = env.InstructionStream()
  cal.set_active_code(stream)
  body_count = float(n_bodies)

  # Ten scratch registers.
  (idx, my_pos, other_pos, accum, delta, dist_sq, dist,
   magnitude, pull, vel) = [stream.acquire_register() for _ in xrange(10)]

  cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos,
                INTERP=cal.interp.linear_noperspective)

  # Constant registers, each initialised from a 4-tuple of one repeated value.
  limit = stream.acquire_register((body_count,) * 4)
  grav = stream.acquire_register((G,) * 4)
  step = stream.acquire_register((dt,) * 4)

  for target in (reg.o0, reg.o1):
    cal.dcl_output(target, USAGE=cal.usage.generic)
  # Resource 0 holds positions, resource 1 holds velocities.
  for resource_id in (0, 1):
    cal.dcl_resource(resource_id, cal.pixtex_type.oned, cal.fmt.float,
                     UNNORM=True)

  zero_on_div = cal.zeroop.zero

  cal.mov(idx, idx('0000'))              # loop counter
  cal.sample(0, 0, my_pos, reg.v0.x)     # this body's position
  cal.mov(accum, accum('0000'))          # running force total

  cal.whileloop()
  # Leave the loop once the counter reaches the body count.
  cal.breakc(cal.relop.ge, idx, limit)

  cal.sample(0, 0, other_pos, idx.x)     # the other body's position

  # Offset from the other body to this one.
  cal.sub(delta, my_pos.xyz0, other_pos.xyz0)

  # Squared length of the offset, then its square root.
  cal.mul(dist_sq, delta.xxxx, delta.xxxx)
  cal.mad(dist_sq, delta.yyyy, delta.yyyy, dist_sq)
  cal.mad(dist_sq, delta.zzzz, delta.zzzz, dist_sq)
  # TODO - skip rest of force computation if distance is 0
  cal.sqrt_vec(dist, dist_sq)

  # Magnitude: G * (w_local * w_remote) / squared distance.
  cal.mul(magnitude, my_pos.wwww, other_pos.wwww)
  cal.div(zero_on_div, magnitude, magnitude, dist_sq)
  cal.mul(magnitude, magnitude, grav)

  # Direction (offset / distance) scaled by the magnitude.
  cal.div(zero_on_div, pull, delta.xyz0, dist.xyz1)
  cal.mul(pull, pull.xyz0, magnitude.xyz0)

  cal.sub(accum, accum.xyz0, pull.xyz0)

  cal.add(idx, idx, idx('1111'))
  cal.endloop()

  # Divide the total by this body's w component.
  cal.div(zero_on_div, accum, accum.xyz0, my_pos.wwww)

  # Integrate velocity and write it out.
  cal.sample(1, 1, vel, reg.v0.x)
  cal.mad(vel, accum, step, vel)
  cal.mov(reg.o1, vel)

  # Integrate position straight into the output.
  cal.mad(reg.o0, vel.xyz0, step.xyz0, my_pos.xyzw)

  return stream
Пример #7
0
def cal_nb_generate_local(n_bodies, dt, steps):
  code = env.InstructionStream()
  cal.set_active_code(code)
  fn_bodies = float(n_bodies)
  steps = float(steps)

  r_count = code.acquire_register()
  r_step = code.acquire_register()
  r_lpos = code.acquire_register()
  r_rpos = code.acquire_register()
  r_force = code.acquire_register()
  r_diff = code.acquire_register()
  r_dist_vec = code.acquire_register()
  r_dist = code.acquire_register()
  r_force_tmp = code.acquire_register()
  r_force_vec = code.acquire_register()
  r_vel = code.acquire_register()

  print "fn_bodies", fn_bodies

  code.add("dcl_input_position_interp(linear_noperspective) v0.xy__")
  #cal.dcl_input(reg.v0.x___, USAGE=cal.usage.pos, INTERP=cal.interp.linear_noperspective)
  r_numsteps = code.acquire_register((steps,) * 4)
  r_bodies = code.acquire_register((fn_bodies,) * 4)
  #r_bodiesquare = code.acquire_register((float(fn_bodies**2),) * 4)
  r_G = code.acquire_register((G,) * 4)
  r_dt = code.acquire_register((dt,) * 4)
  cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
  cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
  cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
  cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # positions
  cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.float, UNNORM=True) # velocities

  r_foo = code.acquire_register()
  cal.mov(r_foo, r_foo('0000'))

  r_gpos = code.acquire_register()
  cal.mad(r_gpos, reg.v0.y, r_bodies.x, reg.v0.x)

  r_gvel = code.acquire_register()
  cal.mad(r_gvel, r_bodies.x, r_bodies.x, r_gpos)

  cal.ftoi(r_gpos, r_gpos)
  cal.ftoi(r_gvel, r_gvel)

  cal.sample(0, 0, r_lpos, reg.v0.xy)                # Local position
  cal.sample(1, 1, r_vel, reg.v0.xy)    # Load velocity

  cal.mov(reg.g[r_gpos.x], r_lpos)
  cal.mov(reg.g[r_gvel.x], r_vel)

  cal.mov(r_step, r_step('0000'))

  cal.whileloop()
  cal.breakc(cal.relop.ge, r_step.x, r_numsteps)

  cal.mov(r_count, r_count('0000'))                  # loop counter

  cal.whileloop()
  cal.breakc(cal.relop.ge, r_count.x, r_bodies)

  cal.add(r_foo, r_foo, r_foo('1111'))

  # calculate force
  r_tmp = code.acquire_register()
  cal.ftoi(r_tmp, r_count)

  cal.mov(r_rpos, reg.g[r_tmp.x])

  # d_xyz
  cal.sub(r_diff, r_lpos.xyz0, r_rpos.xyz0)   # local pos - remote pos

  # dist_tmp
  cal.mul(r_dist_vec, r_diff.xxxx, r_diff.xxxx)
  cal.mad(r_dist_vec, r_diff.yyyy, r_diff.yyyy, r_dist_vec)
  cal.mad(r_dist_vec, r_diff.zzzz, r_diff.zzzz, r_dist_vec)
  
  # distance
  # TODO - skip rest of force computation if distance is 0
  cal.sqrt_vec(r_dist, r_dist_vec)

  # force G * ((m[i] * m[j]) / dist_tmp)
  cal.mul(r_force_tmp, r_lpos.wwww, r_rpos.wwww)
  cal.div(r_force_tmp, r_force_tmp, r_dist_vec, ZEROOP = cal.zeroop.zero)
  cal.mul(r_force_tmp, r_force_tmp, r_G)

  # f_xyz
  # TODO - whats going on, is this right?
  cal.div(r_force_vec, r_diff.xyz0, r_dist.xyz1, ZEROOP = cal.zeroop.zero)
  cal.mul(r_force_vec, r_force_vec.xyz0, r_force_tmp.xyz0)

  cal.sub(r_force, r_force.xyz0, r_force_vec.xyz0)

  cal.add(r_count, r_count, r_count('1111'))
  cal.endloop()

  # Acceleration
  cal.div(r_force, r_force.xyz0, r_lpos.wwww, ZEROOP = cal.zeroop.zero)

  # Velocity
  cal.mad(r_vel, r_force, r_dt, r_vel)

  # Position
  cal.mad(reg.o0, r_vel.xyz0, r_dt.xyz0, r_lpos.xyzw)

  # store updated pos and vel
  cal.mov(reg.g[r_gpos.x], r_lpos)
  cal.mov(reg.g[r_gvel.x], r_vel)

  cal.add(r_step, r_step, r_step('1111'))
  cal.endloop()

  cal.mov(reg.o0, r_lpos)
  cal.mov(reg.o1, r_vel)
  cal.mov(reg.o2, r_foo)
  return code
Пример #8
0
def cal_nb_generate_2d(prgm, n_bodies, dt):
  """Emit the 2-D n-body step kernel into ``prgm``'s stream and return it.

  The local body is sampled from resource 0 at ``v0.xy``.  Two nested loops
  walk ``counter.x`` and ``counter.y`` from 0 up to ``n_bodies`` each,
  sampling the remote body at ``counter.xy`` and accumulating
  ``G * (w_local * w_remote) / |d|^2`` along ``d / |d|``.  The total is
  divided by the local ``w`` component, integrated into the velocity sampled
  from resource 1 (written to ``o1``), and the new position goes to ``o0``.

  Args:
    prgm: program object providing ``get_stream`` and ``acquire_register``.
    n_bodies: bound for both loop counters; converted to float.
    dt: time step loaded into a register.

  Returns:
    The stream obtained from ``prgm.get_stream()``.
  """
  stream = prgm.get_stream()
  cal.set_active_code(stream)
  body_count = float(n_bodies)

  # Ten scratch registers.
  (counter, my_pos, other_pos, accum, delta, dist_sq, dist,
   magnitude, pull, vel) = [prgm.acquire_register() for _ in xrange(10)]

  cal.dcl_input(reg.v0.xy__, USAGE=cal.usage.pos,
                INTERP=cal.interp.linear_noperspective)

  # Constant registers, each initialised from a 4-tuple of one repeated value.
  limit = prgm.acquire_register((body_count,) * 4)
  grav = prgm.acquire_register((G,) * 4)
  step = prgm.acquire_register((dt,) * 4)

  for target in (reg.o0, reg.o1):
    cal.dcl_output(target, USAGE=cal.usage.generic)
  # Resource 0 holds positions, resource 1 holds velocities.
  for resource_id in (0, 1):
    cal.dcl_resource(resource_id, cal.pixtex_type.twod, cal.fmt.float,
                     UNNORM=True)

  zero_on_div = cal.zeroop.zero

  cal.mov(counter, counter('0000'))        # both loop counters
  cal.sample(0, 0, my_pos, reg.v0.xy)      # this body's position
  cal.mov(accum, accum('0000'))            # running force total

  # Outer loop over counter.x.
  cal.whileloop()
  cal.breakc(cal.relop.ge, counter.x, limit)

  # Restart counter.y for every value of counter.x.
  cal.mov(counter, counter.x0zw)

  # Inner loop over counter.y.
  cal.whileloop()
  cal.breakc(cal.relop.ge, counter.y, limit)

  cal.sample(0, 0, other_pos, counter.xy)  # the other body's position

  # Offset from the other body to this one.
  cal.sub(delta, my_pos.xyz0, other_pos.xyz0)

  # Squared length via a 3-component dot product, then its square root.
  cal.dp3(dist_sq, delta, delta, IEEE=False)
  # TODO - skip rest of force computation if distance is 0
  cal.sqrt_vec(dist, dist_sq)

  # Magnitude: G * (w_local * w_remote) / squared distance.
  cal.mul(magnitude, my_pos.wwww, other_pos.wwww, IEEE=False)
  cal.div(magnitude, magnitude, dist_sq, ZEROOP=zero_on_div)
  cal.mul(magnitude, magnitude, grav, IEEE=False)

  # Direction (offset / distance) scaled by the magnitude.
  # TODO - whats going on, is this right?
  cal.div(pull, delta.xyz0, dist.xyz1, ZEROOP=zero_on_div)
  cal.mul(pull, pull.xyz0, magnitude.xyz0, IEEE=False)

  cal.sub(accum, accum.xyz0, pull.xyz0)

  # Advance counter.y and close the inner loop.
  cal.add(counter, counter, counter('0100'))
  cal.endloop()

  # Advance counter.x and close the outer loop.
  cal.add(counter, counter, counter('1000'))
  cal.endloop()

  # Divide the total by this body's w component.
  cal.div(accum, accum.xyz0, my_pos.wwww, ZEROOP=zero_on_div)

  # Integrate velocity and write it out.
  cal.sample(1, 1, vel, reg.v0.xy)
  cal.mad(vel, accum, step, vel, IEEE=False)
  cal.mov(reg.o1, vel)

  # Integrate position straight into the output.
  cal.mad(reg.o0, vel.xyz0, step.xyz0, my_pos.xyzw, IEEE=False)

  return stream
Пример #9
0
def ParMD5Transform(parcontext, parblock, blocki):
    """Run the MD5 block transform for many contexts at once on the GPU.

    Calls ``ParDecode`` to fill a temporary word array, copies the four
    state arrays and the sixteen message words into remote buffers (one
    buffer per message word), builds the kernel made of the 64 FF/GG/HH/II
    steps (only once; it is cached in the module-level ``xcode``), executes
    it over an N x N domain and adds the four outputs back into
    ``parcontext.statea`` .. ``parcontext.stated``.

    Args:
        parcontext: object providing ``number`` and the indexable
            ``statea``/``stateb``/``statec``/``stated`` sequences, which are
            updated in place.
        parblock: passed through to ``ParDecode``.
        blocki: passed through to ``ParDecode`` (presumably the block
            index - confirm against ``ParDecode``).

    Side effects:
        Initialises the global ``xcode`` on first use and adds the
        wall-clock time spent in ``proc.execute`` to the global ``TIME``.
    """
    num = parcontext.number

    temp_block = extarray.extarray('I', 16 * num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)

    # Side length of the square domain: four contexts per element.
    N = int(math.sqrt(num / 4))

    #print "N = ", N
    # NOTE(review): address_4_1d is never called in this function and simply
    # returns its argument; x and y are computed but unused.
    def address_4_1d(i, pitch=64):
        x = i % N
        y = i // 64 * 4
        #return x*4 + y*pitch*4*4
        return i

    # Flat index of component 0 of element (x, y) for a row pitch of `pitch`
    # 4-component elements.
    # NOTE(review): every caller relies on the default pitch of 64 - confirm
    # it matches the real buffer pitch for all N.
    def address_4_2d(x, y, pitch=64):
        return x * 4 + y * pitch * 4

    input_statea = proc.alloc_remote('I', 4, N, N)
    input_stateb = proc.alloc_remote('I', 4, N, N)
    input_statec = proc.alloc_remote('I', 4, N, N)
    input_stated = proc.alloc_remote('I', 4, N, N)
    # One buffer per message word x[0] .. x[15].
    input_block = [proc.alloc_remote('I', 4, N, N) for i in range(16)]
    outputa = proc.alloc_remote('I', 4, N, N)
    outputb = proc.alloc_remote('I', 4, N, N)
    outputc = proc.alloc_remote('I', 4, N, N)
    outputd = proc.alloc_remote('I', 4, N, N)

    # Copy the packed (unpitched) state arrays into the pitched buffers.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                input_statea[address_4_2d(i, j) +
                             k] = parcontext.statea[k + (i + j * N) * 4]
                input_stateb[address_4_2d(i, j) +
                             k] = parcontext.stateb[k + (i + j * N) * 4]
                input_statec[address_4_2d(i, j) +
                             k] = parcontext.statec[k + (i + j * N) * 4]
                input_stated[address_4_2d(i, j) +
                             k] = parcontext.stated[k + (i + j * N) * 4]
    # Scatter temp_block: word i of the l-th context of element (j, k) goes
    # to component l of buffer i.
    for k in range(N):
        for j in range(N):
            for l in range(4):
                for i in range(16):
                    input_block[i][address_4_2d(j, k) +
                                   l] = temp_block[i + (j + k * N) * 4 * 16 +
                                                   l * 16]

    global xcode
    # Build the instruction stream only on the first call.
    # NOTE(review): `is None` is the idiomatic comparison here.
    if xcode == None:
        xcode = env.InstructionStream()
        cal.set_active_code(xcode)

        # Per-step constants passed to FF/GG/HH/II, replicated in all four
        # components (the MD5 rotate amounts, judging by their values).
        S11 = xcode.acquire_register((7, 7, 7, 7))
        S12 = xcode.acquire_register((12, 12, 12, 12))
        S13 = xcode.acquire_register((17, 17, 17, 17))
        S14 = xcode.acquire_register((22, 22, 22, 22))
        S21 = xcode.acquire_register((5, 5, 5, 5))
        S22 = xcode.acquire_register((9, 9, 9, 9))
        S23 = xcode.acquire_register((14, 14, 14, 14))
        S24 = xcode.acquire_register((20, 20, 20, 20))
        S31 = xcode.acquire_register((4, 4, 4, 4))
        S32 = xcode.acquire_register((11, 11, 11, 11))
        S33 = xcode.acquire_register((16, 16, 16, 16))
        S34 = xcode.acquire_register((23, 23, 23, 23))
        S41 = xcode.acquire_register((6, 6, 6, 6))
        S42 = xcode.acquire_register((10, 10, 10, 10))
        S43 = xcode.acquire_register((15, 15, 15, 15))
        S44 = xcode.acquire_register((21, 21, 21, 21))

        a = xcode.acquire_register()
        b = xcode.acquire_register()
        c = xcode.acquire_register()
        d = xcode.acquire_register()
        x = [xcode.acquire_register() for i in range(16)]
        # NOTE(review): r is acquired but never used or released below.
        r = xcode.acquire_register()

        cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint,
                         UNNORM=True)  # statea
        cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint,
                         UNNORM=True)  # stateb
        cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint,
                         UNNORM=True)  # statec
        cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint,
                         UNNORM=True)  # stated
        # Resources 4 .. 19 carry message words x[0] .. x[15].
        for i in range(16):
            cal.dcl_resource(i + 4,
                             cal.pixtex_type.twod,
                             cal.fmt.uint,
                             UNNORM=True)
        cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

        # Load the state and the message words for this element.
        cal.sample(0, 0, a, reg.v0.xy)
        cal.sample(1, 0, b, reg.v0.xy)
        cal.sample(2, 0, c, reg.v0.xy)
        cal.sample(3, 0, d, reg.v0.xy)

        for i in range(16):
            cal.sample(i + 4, 0, x[i], reg.v0.xy)

        # Round 1 (trailing comments give the step number)
        FF(a, b, c, d, x[0], S11, 0xd76aa478)  # 1
        FF(d, a, b, c, x[1], S12, 0xe8c7b756)  # 2
        FF(c, d, a, b, x[2], S13, 0x242070db)  # 3
        FF(b, c, d, a, x[3], S14, 0xc1bdceee)  # 4
        FF(a, b, c, d, x[4], S11, 0xf57c0faf)  # 5
        FF(d, a, b, c, x[5], S12, 0x4787c62a)  # 6
        FF(c, d, a, b, x[6], S13, 0xa8304613)  # 7
        FF(b, c, d, a, x[7], S14, 0xfd469501)  # 8
        FF(a, b, c, d, x[8], S11, 0x698098d8)  # 9
        FF(d, a, b, c, x[9], S12, 0x8b44f7af)  # 10
        FF(c, d, a, b, x[10], S13, 0xffff5bb1)  # 11
        FF(b, c, d, a, x[11], S14, 0x895cd7be)  # 12
        FF(a, b, c, d, x[12], S11, 0x6b901122)  # 13
        FF(d, a, b, c, x[13], S12, 0xfd987193)  # 14
        FF(c, d, a, b, x[14], S13, 0xa679438e)  # 15
        FF(b, c, d, a, x[15], S14, 0x49b40821)  # 16

        # Round 2
        GG(a, b, c, d, x[1], S21, 0xf61e2562)  # 17
        GG(d, a, b, c, x[6], S22, 0xc040b340)  # 18
        GG(c, d, a, b, x[11], S23, 0x265e5a51)  # 19
        GG(b, c, d, a, x[0], S24, 0xe9b6c7aa)  # 20
        GG(a, b, c, d, x[5], S21, 0xd62f105d)  # 21
        GG(d, a, b, c, x[10], S22, 0x2441453)  # 22
        GG(c, d, a, b, x[15], S23, 0xd8a1e681)  # 23
        GG(b, c, d, a, x[4], S24, 0xe7d3fbc8)  # 24
        GG(a, b, c, d, x[9], S21, 0x21e1cde6)  # 25
        GG(d, a, b, c, x[14], S22, 0xc33707d6)  # 26
        GG(c, d, a, b, x[3], S23, 0xf4d50d87)  # 27
        GG(b, c, d, a, x[8], S24, 0x455a14ed)  # 28
        GG(a, b, c, d, x[13], S21, 0xa9e3e905)  # 29
        GG(d, a, b, c, x[2], S22, 0xfcefa3f8)  # 30
        GG(c, d, a, b, x[7], S23, 0x676f02d9)  # 31
        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a)  # 32

        # Round 3
        HH(a, b, c, d, x[5], S31, 0xfffa3942)  # 33
        HH(d, a, b, c, x[8], S32, 0x8771f681)  # 34
        HH(c, d, a, b, x[11], S33, 0x6d9d6122)  # 35
        HH(b, c, d, a, x[14], S34, 0xfde5380c)  # 36
        HH(a, b, c, d, x[1], S31, 0xa4beea44)  # 37
        HH(d, a, b, c, x[4], S32, 0x4bdecfa9)  # 38
        HH(c, d, a, b, x[7], S33, 0xf6bb4b60)  # 39
        HH(b, c, d, a, x[10], S34, 0xbebfbc70)  # 40
        HH(a, b, c, d, x[13], S31, 0x289b7ec6)  # 41
        HH(d, a, b, c, x[0], S32, 0xeaa127fa)  # 42
        HH(c, d, a, b, x[3], S33, 0xd4ef3085)  # 43
        HH(b, c, d, a, x[6], S34, 0x4881d05)  # 44
        HH(a, b, c, d, x[9], S31, 0xd9d4d039)  # 45
        HH(d, a, b, c, x[12], S32, 0xe6db99e5)  # 46
        HH(c, d, a, b, x[15], S33, 0x1fa27cf8)  # 47
        HH(b, c, d, a, x[2], S34, 0xc4ac5665)  # 48

        # Round 4
        II(a, b, c, d, x[0], S41, 0xf4292244)  # 49
        II(d, a, b, c, x[7], S42, 0x432aff97)  # 50
        II(c, d, a, b, x[14], S43, 0xab9423a7)  # 51
        II(b, c, d, a, x[5], S44, 0xfc93a039)  # 52
        II(a, b, c, d, x[12], S41, 0x655b59c3)  # 53
        II(d, a, b, c, x[3], S42, 0x8f0ccc92)  # 54
        II(c, d, a, b, x[10], S43, 0xffeff47d)  # 55
        II(b, c, d, a, x[1], S44, 0x85845dd1)  # 56
        II(a, b, c, d, x[8], S41, 0x6fa87e4f)  # 57
        II(d, a, b, c, x[15], S42, 0xfe2ce6e0)  # 58
        II(c, d, a, b, x[6], S43, 0xa3014314)  # 59
        II(b, c, d, a, x[13], S44, 0x4e0811a1)  # 60
        II(a, b, c, d, x[4], S41, 0xf7537e82)  # 61
        II(d, a, b, c, x[11], S42, 0xbd3af235)  # 62
        II(c, d, a, b, x[2], S43, 0x2ad7d2bb)  # 63
        II(b, c, d, a, x[9], S44, 0xeb86d391)  # 64

        # Write the transformed state to the four outputs.
        cal.mov('o0', a)
        cal.mov('o1', b)
        cal.mov('o2', c)
        cal.mov('o3', d)

        xcode.release_register(a)
        xcode.release_register(b)
        xcode.release_register(c)
        xcode.release_register(d)
        for xi in x:
            xcode.release_register(xi)

    # Bindings are refreshed on every call because the buffers are
    # allocated anew each time.
    xcode.set_remote_binding('i0', input_statea)
    xcode.set_remote_binding('i1', input_stateb)
    xcode.set_remote_binding('i2', input_statec)
    xcode.set_remote_binding('i3', input_stated)
    for i in range(16):  #range(len(input_block)):
        xcode.set_remote_binding('i' + str(i + 4), input_block[i])
    xcode.set_remote_binding('o0', outputa)
    xcode.set_remote_binding('o1', outputb)
    xcode.set_remote_binding('o2', outputc)
    xcode.set_remote_binding('o3', outputd)

    domain = (0, 0, N, N)
    global TIME
    # Only the execute() call itself is timed.
    start_time = time.time()
    proc.execute(xcode, domain)
    end_time = time.time()
    TIME += (end_time - start_time)
    # Add the kernel outputs into the running state, in place.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                parcontext.statea[k + (i + j * N) *
                                  4] += outputa[address_4_2d(i, j) + k]
                parcontext.stateb[k + (i + j * N) *
                                  4] += outputb[address_4_2d(i, j) + k]
                parcontext.statec[k + (i + j * N) *
                                  4] += outputc[address_4_2d(i, j) + k]
                parcontext.stated[k + (i + j * N) *
                                  4] += outputd[address_4_2d(i, j) + k]

    # NOTE(review): the buffers are not released if anything above raises;
    # consider try/finally.
    proc.free_remote(input_statea)
    proc.free_remote(input_stateb)
    proc.free_remote(input_statec)
    proc.free_remote(input_stated)
    for block in input_block:
        proc.free_remote(block)
    proc.free_remote(outputa)
    proc.free_remote(outputb)
    proc.free_remote(outputc)
    proc.free_remote(outputd)
Пример #10
0
# Minimal CAL example: build a kernel that samples resource 0 at v0.x into
# o0, execute it, and print the input and output buffers.
proc = env.Processor(0)

prgm = env.Program()
code = prgm.get_stream()

# Input and output buffers are allocated with identical arguments.
inp = proc.alloc_remote('f', 4, 64)
out = proc.alloc_remote('f', 4, 64)

out.clear()
# NOTE(review): only indices 0..63 of inp are written although the buffer is
# allocated as ('f', 4, 64) - confirm whether all elements were meant to be set.
for i in xrange(0, 64):
    inp[i] = float(i + 1)

# Instructions created through `cal` below are emitted into `code`.
cal.set_active_code(code)

cal.dcl_input(reg.v0.x, USAGE=cal.usage.pos)
cal.dcl_resource(0, cal.pixtex_type.oned, cal.fmt.float, UNNORM=True)
cal.dcl_output(reg.o0, USAGE=cal.usage.generic)

# The whole kernel body: fetch from resource 0 at v0.x and write it to o0.
cal.sample(0, 0, reg.o0, reg.v0.x)

prgm.set_binding(reg.i0, inp)
prgm.set_binding(reg.o0, out)

prgm.add(code)
prgm.print_code()

# No explicit domain is passed to execute() here.
proc.execute(prgm)

print "inp", inp
print "out", out
Пример #11
0
def ParMD5Transform(parcontext, parblock, blocki):
  """Run the MD5 block transform for many contexts at once on the GPU.

  Variant that uploads all sixteen message words in a single remote buffer
  (resource 4) and lets the kernel fetch and transpose them, instead of
  binding one resource per word.  Calls ``ParDecode`` to fill a temporary
  word array, copies the state and the block data into remote buffers,
  builds the kernel of 64 FF/GG/HH/II steps (only once; cached in the
  module-level ``xcode``), executes it over an N x N domain and adds the
  four outputs back into ``parcontext.statea`` .. ``parcontext.stated``.

  Args:
    parcontext: object providing ``number`` and the indexable
      ``statea``/``stateb``/``statec``/``stated`` sequences, which are
      updated in place.
    parblock: passed through to ``ParDecode``.
    blocki: passed through to ``ParDecode`` (presumably the block index -
      confirm against ``ParDecode``).

  Side effects:
    Initialises the global ``xcode`` on first use and adds the wall-clock
    time spent in ``proc.execute`` to the global ``TIME``.
  """
  num = parcontext.number

  temp_block = extarray.extarray('I', 16*num)
  ParDecode(num, temp_block, parblock, blocki, 64)

  proc = env.Processor(0)

  # Side length of the square domain: four contexts per element.
  N = int(math.sqrt(num/4))
  #print "N = ", N
  # NOTE(review): address_4_1d is never called in this function and simply
  # returns its argument; x and y are computed but unused.
  def address_4_1d(i, pitch=64):
    x = i % N
    y = i // 64*4
    #return x*4 + y*pitch*4*4
    return i
  # Flat index of component 0 of element (x, y) for a row pitch of `pitch`
  # 4-component elements.
  # NOTE(review): every caller relies on the default pitch of 64, including
  # the writes into input_block, which is N*4*4 elements wide - confirm the
  # real pitch of that buffer.
  def address_4_2d(x, y, pitch=64):
    return x*4 + y*pitch*4

  input_statea = proc.alloc_remote('I', 4, N, N)
  input_stateb = proc.alloc_remote('I', 4, N, N)
  input_statec = proc.alloc_remote('I', 4, N, N)
  input_stated = proc.alloc_remote('I', 4, N, N)
  # Single buffer for all message words, N*4*4 elements wide and N high.
  input_block = proc.alloc_remote('I', 4, N*4*4, N)
  outputa = proc.alloc_remote('I', 4, N, N)
  outputb = proc.alloc_remote('I', 4, N, N)
  outputc = proc.alloc_remote('I', 4, N, N)
  outputd = proc.alloc_remote('I', 4, N, N)

  # Copy the packed (unpitched) state arrays into the pitched buffers.
  for j in range(N):
    for i in range(N):
      for k in range(4):
        input_statea[address_4_2d(i, j) + k] = parcontext.statea[k + (i + j*N)*4]
        input_stateb[address_4_2d(i, j) + k] = parcontext.stateb[k + (i + j*N)*4]
        input_statec[address_4_2d(i, j) + k] = parcontext.statec[k + (i + j*N)*4]
        input_stated[address_4_2d(i, j) + k] = parcontext.stated[k + (i + j*N)*4]
  # Copy each run of 16 words from temp_block into four consecutive
  # 4-component elements of row k of input_block.
  for k in range(N):
    for j in range(0, N*4):
        for i in range(16):
          input_block[address_4_2d(j*4, k) + i] = temp_block[i + (j + k*N)*16]
          #print address_4_2d(j*4, k) + i, i + (j + k*N)*16

  #print "N = ", N
  #for i in range(num):
  #  print i, map(hex, [input_block[i*16+j] for j in range(16)])

  global xcode
  # Build the instruction stream only on the first call.
  # NOTE(review): `is None` is the idiomatic comparison here.
  if xcode == None:
    xcode = env.InstructionStream()
    cal.set_active_code(xcode)

    # Per-step constants passed to FF/GG/HH/II, replicated in all four
    # components (the MD5 rotate amounts, judging by their values).
    S11 = xcode.acquire_register((7, 7, 7, 7))
    S12 = xcode.acquire_register((12, 12, 12, 12))
    S13 = xcode.acquire_register((17, 17, 17, 17))
    S14 = xcode.acquire_register((22, 22, 22, 22))
    S21 = xcode.acquire_register((5, 5, 5, 5))
    S22 = xcode.acquire_register((9, 9, 9, 9))
    S23 = xcode.acquire_register((14, 14, 14, 14))
    S24 = xcode.acquire_register((20, 20, 20, 20))
    S31 = xcode.acquire_register((4, 4, 4, 4))
    S32 = xcode.acquire_register((11, 11, 11, 11))
    S33 = xcode.acquire_register((16, 16, 16, 16))
    S34 = xcode.acquire_register((23, 23, 23, 23))
    S41 = xcode.acquire_register((6, 6, 6, 6))
    S42 = xcode.acquire_register((10, 10, 10, 10))
    S43 = xcode.acquire_register((15, 15, 15, 15))
    S44 = xcode.acquire_register((21, 21, 21, 21))

    a = xcode.acquire_register()
    b = xcode.acquire_register()
    c = xcode.acquire_register()
    d = xcode.acquire_register()
    # TODO: Ensure these are all contiguous - necessary for the transposes
    x = [xcode.acquire_register() for i in range(16)]
    # NOTE(review): r is acquired but never used or released below.
    r = xcode.acquire_register()

    cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statea
    cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stateb
    cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # statec
    cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # stated
    cal.dcl_resource(4, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True) # message block
    cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
    cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

    # Load the four state registers for this element.
    cal.sample(0, 0, a, reg.v0.xy)
    cal.sample(1, 0, b, reg.v0.xy)
    cal.sample(2, 0, c, reg.v0.xy)
    cal.sample(3, 0, d, reg.v0.xy)

    cal.dclpi(('0', '0', '-', '-'), reg.vWinCoord0, CENTERED=True)
    buffer_index = xcode.acquire_register()
    temp = xcode.acquire_register()

    # NOTE(review): `two` and `three` are defined but never used; temp,
    # buffer_index, consts and sixteen are never released.
    consts = xcode.acquire_register((1.0, 2.0, 3.0, 4.0))
    one = consts.x
    two = consts.y
    three = consts.z
    four = consts.w
    sixteen = xcode.acquire_register((16.0,)*4)
    # Fetch position inside resource 4: x = window x * 16, y = window y.
    cal.mov(buffer_index, reg.vWinCoord0.xy)
    cal.mul(buffer_index.x, buffer_index, sixteen)

    # Four groups of four fetches.  Within a group x advances by 4 between
    # fetches; afterwards x is restored from temp and advanced by 1 for the
    # next group.
    for i in range(4):
      cal.mov(temp.xy, buffer_index.xy)

      cal.sample(4, 0, x[i*4], buffer_index.xy)

      cal.add(buffer_index.x, buffer_index, four)
      cal.sample(4, 0, x[i*4+1], buffer_index.xy)

      cal.add(buffer_index.x, buffer_index, four)
      cal.sample(4, 0, x[i*4+2], buffer_index.xy)

      cal.add(buffer_index.x, buffer_index, four)
      cal.sample(4, 0, x[i*4+3], buffer_index.xy)

      cal.mov(buffer_index.xy, temp.xy)
      cal.add(buffer_index.x, buffer_index, one)

    # Transposes starting at x[0], x[4], x[8] and x[12]; per the TODO above
    # they presumably operate on four consecutive registers each.
    cal.transpose(x[0], x[0])
    cal.transpose(x[4], x[4])
    cal.transpose(x[8], x[8])
    cal.transpose(x[12], x[12])

    # Round 1 (trailing comments give the step number)
    FF (a, b, c, d, x[ 0], S11, 0xd76aa478); # 1
    FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); # 2
    FF (c, d, a, b, x[ 2], S13, 0x242070db); # 3
    FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); # 4
    FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); # 5
    FF (d, a, b, c, x[ 5], S12, 0x4787c62a); # 6
    FF (c, d, a, b, x[ 6], S13, 0xa8304613); # 7
    FF (b, c, d, a, x[ 7], S14, 0xfd469501); # 8
    FF (a, b, c, d, x[ 8], S11, 0x698098d8); # 9
    FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); # 10
    FF (c, d, a, b, x[10], S13, 0xffff5bb1); # 11
    FF (b, c, d, a, x[11], S14, 0x895cd7be); # 12
    FF (a, b, c, d, x[12], S11, 0x6b901122); # 13
    FF (d, a, b, c, x[13], S12, 0xfd987193); # 14
    FF (c, d, a, b, x[14], S13, 0xa679438e); # 15
    FF (b, c, d, a, x[15], S14, 0x49b40821); # 16

    # Round 2
    GG (a, b, c, d, x[ 1], S21, 0xf61e2562); # 17
    GG (d, a, b, c, x[ 6], S22, 0xc040b340); # 18
    GG (c, d, a, b, x[11], S23, 0x265e5a51); # 19
    GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); # 20
    GG (a, b, c, d, x[ 5], S21, 0xd62f105d); # 21
    GG (d, a, b, c, x[10], S22,  0x2441453); # 22
    GG (c, d, a, b, x[15], S23, 0xd8a1e681); # 23
    GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); # 24
    GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); # 25
    GG (d, a, b, c, x[14], S22, 0xc33707d6); # 26
    GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); # 27
    GG (b, c, d, a, x[ 8], S24, 0x455a14ed); # 28
    GG (a, b, c, d, x[13], S21, 0xa9e3e905); # 29
    GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); # 30
    GG (c, d, a, b, x[ 7], S23, 0x676f02d9); # 31
    GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); # 32

    # Round 3
    HH (a, b, c, d, x[ 5], S31, 0xfffa3942); # 33
    HH (d, a, b, c, x[ 8], S32, 0x8771f681); # 34
    HH (c, d, a, b, x[11], S33, 0x6d9d6122); # 35
    HH (b, c, d, a, x[14], S34, 0xfde5380c); # 36
    HH (a, b, c, d, x[ 1], S31, 0xa4beea44); # 37
    HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); # 38
    HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); # 39
    HH (b, c, d, a, x[10], S34, 0xbebfbc70); # 40
    HH (a, b, c, d, x[13], S31, 0x289b7ec6); # 41
    HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); # 42
    HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); # 43
    HH (b, c, d, a, x[ 6], S34,  0x4881d05); # 44
    HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); # 45
    HH (d, a, b, c, x[12], S32, 0xe6db99e5); # 46
    HH (c, d, a, b, x[15], S33, 0x1fa27cf8); # 47
    HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); # 48

    # Round 4
    II (a, b, c, d, x[ 0], S41, 0xf4292244); # 49
    II (d, a, b, c, x[ 7], S42, 0x432aff97); # 50
    II (c, d, a, b, x[14], S43, 0xab9423a7); # 51
    II (b, c, d, a, x[ 5], S44, 0xfc93a039); # 52
    II (a, b, c, d, x[12], S41, 0x655b59c3); # 53
    II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); # 54
    II (c, d, a, b, x[10], S43, 0xffeff47d); # 55
    II (b, c, d, a, x[ 1], S44, 0x85845dd1); # 56
    II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); # 57
    II (d, a, b, c, x[15], S42, 0xfe2ce6e0); # 58
    II (c, d, a, b, x[ 6], S43, 0xa3014314); # 59
    II (b, c, d, a, x[13], S44, 0x4e0811a1); # 60
    II (a, b, c, d, x[ 4], S41, 0xf7537e82); # 61
    II (d, a, b, c, x[11], S42, 0xbd3af235); # 62
    II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); # 63
    II (b, c, d, a, x[ 9], S44, 0xeb86d391); # 64

    #cal.mov(buffer_index, reg.vWinCoord0.xy)
    #cal.mul(buffer_index.x, buffer_index, sixteen)
    #cal.add(buffer_index, buffer_index, reg.v0)
    #cal.mov('o0', buffer_index)
    # Write the transformed state to the four outputs.
    cal.mov('o0', a)
    cal.mov('o1', b)
    cal.mov('o2', c)
    cal.mov('o3', d)

    xcode.release_register(a)
    xcode.release_register(b)
    xcode.release_register(c)
    xcode.release_register(d)
    for xi in x:
      xcode.release_register(xi)

    #xcode.cache_code()
    #print xcode.render_string

  # Bindings are refreshed on every call because the buffers are allocated
  # anew each time.
  xcode.set_remote_binding('i0', input_statea)
  xcode.set_remote_binding('i1', input_stateb)
  xcode.set_remote_binding('i2', input_statec)
  xcode.set_remote_binding('i3', input_stated)
  xcode.set_remote_binding('i4', input_block)
  xcode.set_remote_binding('o0', outputa)
  xcode.set_remote_binding('o1', outputb)
  xcode.set_remote_binding('o2', outputc)
  xcode.set_remote_binding('o3', outputd)

  domain = (0, 0, N, N)
  global TIME
  # Only the execute() call itself is timed.
  start_time = time.time()
  proc.execute(xcode, domain)
  end_time = time.time()
  TIME += (end_time - start_time)
  #print map(hex, [outputa[i] for i in range(4)])
  #print map(hex, outputa)
  #print outputa
  # Add the kernel outputs into the running state, in place.
  for j in range(N):
    for i in range(N):
      for k in range(4):
        parcontext.statea[k + (i + j*N)*4] += outputa[address_4_2d(i, j) + k]
        parcontext.stateb[k + (i + j*N)*4] += outputb[address_4_2d(i, j) + k]
        parcontext.statec[k + (i + j*N)*4] += outputc[address_4_2d(i, j) + k]
        parcontext.stated[k + (i + j*N)*4] += outputd[address_4_2d(i, j) + k]

  # NOTE(review): the buffers are not released if anything above raises;
  # consider try/finally.
  proc.free_remote(input_statea)
  proc.free_remote(input_stateb)
  proc.free_remote(input_statec)
  proc.free_remote(input_stated)
  proc.free_remote(input_block)
  proc.free_remote(outputa)
  proc.free_remote(outputb)
  proc.free_remote(outputc)
  proc.free_remote(outputd)