Пример #1
0
    def k_loop_simple(self, code):
        """Emit the plain (non-prefetching) inner loop over k.

        Each step of the synthetic loop loads every ``a`` register from
        ``p_tA`` and every ``b`` register from ``p_tB``, updates each
        ``c[i][j]`` with ``ppcvar.fmadd(a[i], b[j], c[i][j])`` and then
        advances both pointers by one stride.

        code -- stream passed to ``syn_iter``; the loop is created with
                ``block.kc`` as its count and ``mode=CTR``.
        """
        blk = self.block
        steps, rows, cols = blk.kc, blk.mr, blk.nr
        a_regs, b_regs, acc = self.a, self.b, self.c
        ptr_a, ptr_b = self.p_tA, self.p_tB

        # Row-major (i, j) visiting order for the accumulator update
        cells = [(i, j) for i in range(rows) for j in range(cols)]

        for _ in syn_iter(code, steps, mode=CTR):
            # Generating loops: one load is issued per register index
            for i in range(rows):
                a_regs[i].load(ptr_a, i * self.A_strides.row)

            for j in range(cols):
                b_regs[j].load(ptr_b, j * self.B_strides.col)

            # Accumulate into c -- presumably c += a * b; confirm fmadd
            # operand order against ppcvar
            for i, j in cells:
                target = acc[i][j]
                target.v = ppcvar.fmadd(a_regs[i], b_regs[j], target)

            # Step both pointers to the next k position
            ptr_a.v = ptr_a + self.A_strides.col
            ptr_b.v = ptr_b + self.B_strides.row
Пример #2
0
  def _pack_b(self, code):
    """Emit a double loop that copies a kc x nc panel from vB into vtB.

    Element (i, j) is loaded from offset ``(i * vN + j) * 8`` relative to
    ``vB`` and stored at offset ``(j * kc + i) * 8`` relative to ``vtB``, so
    the row and column indices are swapped in the destination.

    code -- stream passed to both ``syn_iter`` loops.

    NOTE(review): the factor 8 is presumably the byte size of one element
    (a double) -- confirm against the caller.
    """
    depth = self.block.kc
    width = self.block.nc

    elem = self.vb
    src_base, dst_base = self.vB, self.vtB
    row_start, src_off, dst_off = self.vBi, self.bij, self.tbji
    row_len = self.vN

    for i in syn_iter(code, depth):
      # Start of source row i, in elements
      row_start.v = i * row_len

      for j in syn_iter(code, width):
        src_off.v = (row_start + j) * 8
        dst_off.v = (j * depth + i) * 8

        # Move one element through the scratch variable
        elem.load(src_base, src_off)
        elem.store(dst_base, dst_off)
Пример #3
0
Файл: gemm.py Проект: tmaone/efi
  def _pack_b(self, code):
    """Emit a double loop that copies a kc x nc panel from vB into vtB.

    Element (i, j) is loaded from offset ``(i * vN + j) * 8`` relative to
    ``vB`` and stored at offset ``(j * kc + i) * 8`` relative to ``vtB``, so
    the row and column indices are swapped in the destination.

    code -- stream passed to both ``syn_iter`` loops.

    NOTE(review): the factor 8 is presumably the byte size of one element
    (a double) -- confirm against the caller.
    """
    depth = self.block.kc
    width = self.block.nc

    elem = self.vb
    src_base, dst_base = self.vB, self.vtB
    row_start, src_off, dst_off = self.vBi, self.bij, self.tbji
    row_len = self.vN

    for i in syn_iter(code, depth):
      # Start of source row i, in elements
      row_start.v = i * row_len

      for j in syn_iter(code, width):
        src_off.v = (row_start + j) * 8
        dst_off.v = (j * depth + i) * 8

        # Move one element through the scratch variable
        elem.load(src_base, src_off)
        elem.store(dst_base, dst_off)
Пример #4
0
  def k_loop_prefetch_simple(self, code):
    """Emit the inner loop over k, unrolled by two with double buffering.

    Two register sets are used alternately: while ``c`` is updated from one
    set ([a, b] or [a, b]_pre), the other set is loaded with the values for
    the following k step.  The loop body handles two k steps, so the
    synthetic loop is created with a count of ``kc // 2``.

    code -- stream passed to ``syn_iter`` (created with ``mode = CTR``).

    NOTE(review): with this schedule the last pass still loads [a, b] for a
    step that is never consumed, and an odd kc drops its final step --
    confirm the packed buffers and callers allow for both.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_and_advance(a_regs, b_regs):
      # Load one k step from tA/tB into the given registers, then step
      # both pointers to the next k position.
      for ai in range(mr):
        a_regs[ai].load(p_tA, ai * self.A_strides.row)

      for bj in range(nr):
        b_regs[bj].load(p_tB, bj * self.B_strides.col)

      p_tA.v = p_tA + self.A_strides.col
      p_tB.v = p_tB + self.B_strides.row

    def accumulate(a_regs, b_regs):
      # Update every c[i][j] with an fmadd of the given register sets.
      for ci in range(mr):
        for cj in range(nr):
          c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prologue -- fill [a,b] with the first k step
    load_and_advance(a, b)

    # Inner loop over k.  Floor division keeps the trip count an integer
    # under Python 3 as well (it is identical to `/` on ints in Python 2).
    for k in syn_iter(code, kc // 2, mode = CTR):

      # Iteration 1 -- load [a,b]_pre, compute [a,b]
      load_and_advance(a_pre, b_pre)
      accumulate(a, b)

      # Iteration 2 -- load [a,b], compute [a,b]_pre
      # The loads must target [a,b] here: reloading [a,b]_pre would
      # overwrite the values about to be consumed and leave [a,b] stale
      # for the next pass.
      load_and_advance(a, b)
      accumulate(a_pre, b_pre)

    # /end for k

    return
Пример #5
0
Файл: gemm.py Проект: tmaone/efi
  def k_loop_prefetch_simple(self, code):
    """Emit the inner loop over k, unrolled by two with double buffering.

    Two register sets are used alternately: while ``c`` is updated from one
    set ([a, b] or [a, b]_pre), the other set is loaded with the values for
    the following k step.  The loop body handles two k steps, so the
    synthetic loop is created with a count of ``kc // 2``.

    code -- stream passed to ``syn_iter`` (created with ``mode = CTR``).

    NOTE(review): with this schedule the last pass still loads [a, b] for a
    step that is never consumed, and an odd kc drops its final step --
    confirm the packed buffers and callers allow for both.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_and_advance(a_regs, b_regs):
      # Load one k step from tA/tB into the given registers, then step
      # both pointers to the next k position.
      for ai in range(mr):
        a_regs[ai].load(p_tA, ai * self.A_strides.row)

      for bj in range(nr):
        b_regs[bj].load(p_tB, bj * self.B_strides.col)

      p_tA.v = p_tA + self.A_strides.col
      p_tB.v = p_tB + self.B_strides.row

    def accumulate(a_regs, b_regs):
      # Update every c[i][j] with an fmadd of the given register sets.
      for ci in range(mr):
        for cj in range(nr):
          c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prologue -- fill [a,b] with the first k step
    load_and_advance(a, b)

    # Inner loop over k.  Floor division keeps the trip count an integer
    # under Python 3 as well (it is identical to `/` on ints in Python 2).
    for k in syn_iter(code, kc // 2, mode = CTR):

      # Iteration 1 -- load [a,b]_pre, compute [a,b]
      load_and_advance(a_pre, b_pre)
      accumulate(a, b)

      # Iteration 2 -- load [a,b], compute [a,b]_pre
      # The loads must target [a,b] here: reloading [a,b]_pre would
      # overwrite the values about to be consumed and leave [a,b] stale
      # for the next pass.
      load_and_advance(a, b)
      accumulate(a_pre, b_pre)

    # /end for k

    return
Пример #6
0
  def synthesize(self, prgm, tB, M, K, N, kc, nc, mr = 1, nr = 1):
    """Emit the pack-B / GEPB loop over column panels into prgm's stream.

    The stream obtained from ``prgm.get_stream()`` is made the active ppc
    code while synthesizing; the previously active code is restored on
    exit, including when synthesis raises.

    prgm       -- program object; supplies the stream and is passed to the
                  packer's ``_init_constants``.
    tB         -- passed to the packer's ``_init_constants`` (presumably the
                  packed-B scratch buffer -- confirm).
    M, K, N    -- forwarded to ``SynGEPB._init_constants``; N is also the
                  total for the outer loop and the value written to
                  ``packb.vN`` (presumably the matrix dimensions -- confirm).
    kc, nc     -- blocking sizes forwarded to GEPB; nc is the outer loop
                  step and the per-pass pointer advance (``nc * 8``).
    mr, nr     -- register-block sizes forwarded to GEPB.
    """
    code = prgm.get_stream()

    old_code = ppc.get_active_code()
    ppc.set_active_code(code)

    # The active code is global state: restore it even on failure so a
    # broken synthesis does not leave later code emitting into this stream.
    try:
      gepb  = SynGEPB(self.gepb_mode)
      packb = SynPackB()

      gepb._init_constants(M, K, N, kc, nc, mr, nr, True)
      packb._init_constants(prgm, tB, N)

      gepb._init_vars()

      # Reuse the C/C_aux registers for B.  They are set in init pointers.
      packb._init_vars2(gepb.p_C, gepb.c[0][0], gepb.r_tB_addr)

      gepb._load_params()
      packb._load_params(pvB = 7)

      # One pass per nc-wide panel
      for j in syn_iter(code, N, nc):
        # Pack the current panel of B into tB.  vN is (re)set inside the
        # emitted loop body, as in the original schedule.
        packb.vN.v = N
        packb._pack_b(code)

        # Run GEPB on the packed panel
        gepb._init_pointers()
        gepb._gepb(code)

        # Advance the C and B addresses by nc * 8 for the next panel
        gepb.r_C_addr.v = gepb.r_C_addr + nc * 8

        packb.vB.v = packb.vB + nc * 8

      # /end for j
    finally:
      ppc.set_active_code(old_code)

    return
Пример #7
0
    def synthesize(self, prgm, tB, M, K, N, kc, nc, mr=1, nr=1):
        """Emit the pack-B / GEPB loop over column panels into prgm's stream.

        The stream obtained from ``prgm.get_stream()`` is made the active
        ppc code while synthesizing; the previously active code is restored
        on exit, including when synthesis raises.

        prgm       -- program object; supplies the stream and is passed to
                      the packer's ``_init_constants``.
        tB         -- passed to the packer's ``_init_constants`` (presumably
                      the packed-B scratch buffer -- confirm).
        M, K, N    -- forwarded to ``SynGEPB._init_constants``; N is also
                      the total for the outer loop and the value written to
                      ``packb.vN`` (presumably the matrix dimensions --
                      confirm).
        kc, nc     -- blocking sizes forwarded to GEPB; nc is the outer loop
                      step and the per-pass pointer advance (``nc * 8``).
        mr, nr     -- register-block sizes forwarded to GEPB.
        """
        code = prgm.get_stream()

        old_code = ppc.get_active_code()
        ppc.set_active_code(code)

        # The active code is global state: restore it even on failure so a
        # broken synthesis does not leave later code emitting into this
        # stream.
        try:
            gepb = SynGEPB(self.gepb_mode)
            packb = SynPackB()

            gepb._init_constants(M, K, N, kc, nc, mr, nr, True)
            packb._init_constants(prgm, tB, N)

            gepb._init_vars()

            # Reuse the C/C_aux registers for B.  They are set in init
            # pointers.
            packb._init_vars2(gepb.p_C, gepb.c[0][0], gepb.r_tB_addr)

            gepb._load_params()
            packb._load_params(pvB=7)

            # One pass per nc-wide panel
            for j in syn_iter(code, N, nc):
                # Pack the current panel of B into tB.  vN is (re)set inside
                # the emitted loop body, as in the original schedule.
                packb.vN.v = N
                packb._pack_b(code)

                # Run GEPB on the packed panel
                gepb._init_pointers()
                gepb._gepb(code)

                # Advance the C and B addresses by nc * 8 for the next panel
                gepb.r_C_addr.v = gepb.r_C_addr + nc * 8

                packb.vB.v = packb.vB + nc * 8

            # /end for j
        finally:
            ppc.set_active_code(old_code)

        return
Пример #8
0
  def k_loop_simple(self, code):
    """Emit the plain (non-prefetching) inner loop over k.

    Each step of the synthetic loop loads every ``a`` register from
    ``p_tA`` and every ``b`` register from ``p_tB``, updates each
    ``c[i][j]`` with ``ppcvar.fmadd(a[i], b[j], c[i][j])`` and then
    advances both pointers by one stride.

    code -- stream passed to ``syn_iter``; the loop is created with
            ``block.kc`` as its count and ``mode = CTR``.
    """
    blk = self.block
    steps, rows, cols = blk.kc, blk.mr, blk.nr
    a_regs, b_regs, acc = self.a, self.b, self.c
    ptr_a, ptr_b = self.p_tA, self.p_tB

    # Row-major (i, j) visiting order for the accumulator update
    cells = [(i, j) for i in range(rows) for j in range(cols)]

    for _ in syn_iter(code, steps, mode = CTR):
      # Generating loops: one load is issued per register index
      for i in range(rows):
        a_regs[i].load(ptr_a, i * self.A_strides.row)

      for j in range(cols):
        b_regs[j].load(ptr_b, j * self.B_strides.col)

      # Accumulate into c -- presumably c += a * b; confirm fmadd
      # operand order against ppcvar
      for i, j in cells:
        target = acc[i][j]
        target.v = ppcvar.fmadd(a_regs[i], b_regs[j], target)

      # Step both pointers to the next k position
      ptr_a.v = ptr_a + self.A_strides.col
      ptr_b.v = ptr_b + self.B_strides.row
Пример #9
0
  def k_loop_prefetch(self, code):
    """Emit the hand-scheduled, double-buffered inner loop for a 4x4 block.

    Register sets [a, b] and [a, b]_pre are used alternately: while ``c`` is
    updated from one set, the other is loaded with the next k step.  The
    loop body covers two k steps, so the synthetic loop is created with a
    count of ``kc // 2``.

    The order in which loads and fmadds are issued is fixed by the tables
    below and is kept exactly as originally written.

    code -- stream passed to ``syn_iter`` (created with ``mode = CTR``).

    NOTE(review): indices 0..3 are hard-coded, so this presumably requires
    block.mr == block.nr == 4 -- confirm callers guarantee that.
    NOTE(review): the last pass still loads [a, b] for a step that is never
    consumed, and an odd kc drops its final step -- confirm both are
    acceptable.
    """
    A_row_stride = self.A_strides.row
    A_col_stride = self.A_strides.col

    B_row_stride = self.B_strides.row
    B_col_stride = self.B_strides.col

    kc = self.block.kc
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    # Issue orders as (operand, index); 'a' loads from p_tA, 'b' from p_tB.
    prologue_order = (('a', 0), ('b', 0), ('b', 1), ('a', 1),
                      ('b', 2), ('b', 3), ('a', 2), ('a', 3))
    prefetch_order = (('a', 0), ('b', 1), ('b', 0), ('a', 1),
                      ('b', 2), ('a', 2), ('b', 3), ('a', 3))
    reload_order   = (('a', 0), ('b', 1), ('b', 0), ('a', 1),
                      ('b', 2), ('b', 3), ('a', 2), ('a', 3))

    # Issue order of the 16 accumulator updates as (row, col) of c.
    fmadd_order = ((0, 0), (0, 1), (1, 0), (1, 1),
                   (1, 2), (0, 2), (2, 0), (2, 1),
                   (2, 2), (2, 3), (0, 3), (1, 3),
                   (3, 0), (3, 1), (3, 2), (3, 3))

    def load_and_advance(order, a_regs, b_regs):
      # Issue the loads in the given order, then step both pointers to the
      # next k position.
      for operand, idx in order:
        if operand == 'a':
          a_regs[idx].load(p_tA, idx * A_row_stride)
        else:
          b_regs[idx].load(p_tB, idx * B_col_stride)

      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride

    def accumulate(a_regs, b_regs):
      # Update c[i][j] with an fmadd of a_regs[i] and b_regs[j], in
      # fmadd_order.
      for ci, cj in fmadd_order:
        c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prologue -- load the first k step into [a,b] and advance the pointers
    load_and_advance(prologue_order, a, b)

    # Floor division keeps the trip count an integer under Python 3 as well
    # (it is identical to `/` on ints in Python 2).
    for k in syn_iter(code, kc // 2, mode = CTR):
      # Step 1 -- load [a,b]_pre, compute with [a,b]
      load_and_advance(prefetch_order, a_pre, b_pre)
      accumulate(a, b)

      # Step 2 -- load [a,b], compute with [a,b]_pre
      load_and_advance(reload_order, a, b)
      accumulate(a_pre, b_pre)

      # /end for k
    return
Пример #10
0
Файл: gemm.py Проект: tmaone/efi
  def k_loop_prefetch(self, code):
    """Emit the hand-scheduled, double-buffered inner loop for a 4x4 block.

    Register sets [a, b] and [a, b]_pre are used alternately: while ``c`` is
    updated from one set, the other is loaded with the next k step.  The
    loop body covers two k steps, so the synthetic loop is created with a
    count of ``kc // 2``.

    The order in which loads and fmadds are issued is fixed by the tables
    below and is kept exactly as originally written.

    code -- stream passed to ``syn_iter`` (created with ``mode = CTR``).

    NOTE(review): indices 0..3 are hard-coded, so this presumably requires
    block.mr == block.nr == 4 -- confirm callers guarantee that.
    NOTE(review): the last pass still loads [a, b] for a step that is never
    consumed, and an odd kc drops its final step -- confirm both are
    acceptable.
    """
    A_row_stride = self.A_strides.row
    A_col_stride = self.A_strides.col

    B_row_stride = self.B_strides.row
    B_col_stride = self.B_strides.col

    kc = self.block.kc
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    # Issue orders as (operand, index); 'a' loads from p_tA, 'b' from p_tB.
    prologue_order = (('a', 0), ('b', 0), ('b', 1), ('a', 1),
                      ('b', 2), ('b', 3), ('a', 2), ('a', 3))
    prefetch_order = (('a', 0), ('b', 1), ('b', 0), ('a', 1),
                      ('b', 2), ('a', 2), ('b', 3), ('a', 3))
    reload_order   = (('a', 0), ('b', 1), ('b', 0), ('a', 1),
                      ('b', 2), ('b', 3), ('a', 2), ('a', 3))

    # Issue order of the 16 accumulator updates as (row, col) of c.
    fmadd_order = ((0, 0), (0, 1), (1, 0), (1, 1),
                   (1, 2), (0, 2), (2, 0), (2, 1),
                   (2, 2), (2, 3), (0, 3), (1, 3),
                   (3, 0), (3, 1), (3, 2), (3, 3))

    def load_and_advance(order, a_regs, b_regs):
      # Issue the loads in the given order, then step both pointers to the
      # next k position.
      for operand, idx in order:
        if operand == 'a':
          a_regs[idx].load(p_tA, idx * A_row_stride)
        else:
          b_regs[idx].load(p_tB, idx * B_col_stride)

      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride

    def accumulate(a_regs, b_regs):
      # Update c[i][j] with an fmadd of a_regs[i] and b_regs[j], in
      # fmadd_order.
      for ci, cj in fmadd_order:
        c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prologue -- load the first k step into [a,b] and advance the pointers
    load_and_advance(prologue_order, a, b)

    # Floor division keeps the trip count an integer under Python 3 as well
    # (it is identical to `/` on ints in Python 2).
    for k in syn_iter(code, kc // 2, mode = CTR):
      # Step 1 -- load [a,b]_pre, compute with [a,b]
      load_and_advance(prefetch_order, a_pre, b_pre)
      accumulate(a, b)

      # Step 2 -- load [a,b], compute with [a,b]_pre
      load_and_advance(reload_order, a, b)
      accumulate(a_pre, b_pre)

      # /end for k
    return