예제 #1
0
 def shuffle(self, queue, dst, src):
     """Launch ``self.kernel`` through ``call_cl_kernel`` with ``dst`` and ``src``.

     The launch uses ``self.grid`` and ``self.block`` as the launch
     dimensions and forwards ``dst``, ``src``, ``self.BC``, ``self.C``,
     ``self.AB`` and ``self.A`` as kernel arguments, in that order.

     Args:
         queue: queue object handed straight to ``call_cl_kernel``.
         dst: first kernel argument (presumably the output buffer --
             confirm against the kernel source).
         src: second kernel argument (presumably the input buffer --
             confirm against the kernel source).
     """
     launch = [self.kernel, queue, self.grid, self.block]
     launch += [dst, src, self.BC, self.C, self.AB, self.A]
     call_cl_kernel(*launch)
예제 #2
0
def calcV(I_shape, I_cl, V_cl):
    """Launch the ``k_calcV`` kernel on ``I_cl``/``V_cl`` and wait for it.

    Relies on module-level names: ``q`` (the queue), ``k_calcV``,
    ``ceil_div``, ``get_div_mul_shift_32``, ``call_cl_kernel`` and
    ``timecheck``. Prints the derived grid parameters as a side effect and
    calls ``q.finish()`` before returning.

    Args:
        I_shape: indexable with at least four entries, read as
            ``(Ci, iH, iW, N)``.
        I_cl: buffer passed to the kernel as its second argument
            (presumably the input image -- confirm against the kernel).
        V_cl: buffer passed to the kernel as its first argument
            (presumably the transformed output -- confirm against the kernel).
    """
    Ci, iH, iW, N = I_shape[0], I_shape[1], I_shape[2], I_shape[3]

    # The output extent is taken to be the input extent, with padding of 1.
    oH, oW = iH, iW
    padH = padW = 1

    # Adapted from winograd_conv.py. The source this was adapted from chose
    # shlN from N (0 for N == 1, len(bin(N - 1)) - 2 for N < 32, else 5);
    # here it is pinned to 5, so only the last table row is ever selected.
    shlN = 5
    tile_params = {
        0: (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300),  # 4x8  yyxxx
        1: (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201),  # 4x4  yyxxn
        2: (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202),  # 2x4  yxxnn
        3: (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203),  # 1x4  xxnnn
        4: (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104),  # 1x2  xnnnn
        5: (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000),  # 1x1  nnnnn
    }
    # The last two entries (supY, supX) are not consumed by this launch.
    shlY, shlX, maskY, shrY, maskX, shrX, maskN, _supY, _supX = tile_params[shlN]

    GYS = ceil_div(oH, 1 << shlY)
    GXS = ceil_div(oW, 1 << shlX)
    GN = ceil_div(N, 1 << shlN)
    GYS2 = GYS // 2
    GXS2 = GXS * 2

    # Each result is indexed [0] and [1] below; presumably a (multiplier,
    # shift) pair for division by the second argument -- confirm against
    # get_div_mul_shift_32.
    div_GXS2 = get_div_mul_shift_32(GXS * GYS, GXS2)
    div_GXS = get_div_mul_shift_32(GXS * GYS, GXS)

    print('div_GXS', div_GXS)

    print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GXS * GYS)
    grid = (GN, GYS * GXS, Ci)
    block = (32, 1, 1)

    call_cl_kernel(
        k_calcV,
        q, grid, block,
        V_cl, I_cl,

        iH, iW, N, padH, padW,
        GXS, GYS2, GXS2, div_GXS2[0], div_GXS2[1], div_GXS[0], div_GXS[1],
        shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN,
        iH * iW * N, iW * N, GYS * GXS * Ci * 1152, GXS * Ci * 1152, Ci * 1152,
        GXS, GXS * GYS, GN, Ci)
    q.finish()
    timecheck('calced V_cl')
    def execute(self, q, repeat=1, unbind=True):
        """Zero-fill a buffer and launch ``self.kernel``, ``repeat`` times.

        Each iteration fills ``self.zero_args[0]`` with ``float32`` zeros
        over ``self.zero_args[2] * 4`` bytes (the factor 4 is presumably the
        byte size of one float32 element -- confirm), then launches the
        kernel with ``self.launch_args``.

        Args:
            q: queue used for both the fill and the kernel launch.
            repeat: number of fill-and-launch iterations.
            unbind: when true, ``zero_args`` and ``convert_args`` are set to
                ``None`` and ``launch_args[2:7]`` is overwritten with five
                ``None`` values afterwards (presumably to drop references
                to the bound buffers -- confirm).
        """
        for _ in range(repeat):
            target = self.zero_args[0]
            nbytes = self.zero_args[2] * 4
            cl.enqueue_fill_buffer(q, target, np.float32(0), 0, nbytes)
            call_cl_kernel(self.kernel, q, *self.launch_args)

        if not unbind:
            return

        self.zero_args = None
        self.convert_args = None
        self.launch_args[2:7] = [None] * 5
예제 #4
0
    def execute(self, q, repeat=1, unbind=True):
        """Run ``repeat`` rounds of zero-fill followed by a kernel launch.

        Every round fills ``self.zero_args[0]`` with a ``float32`` zero
        pattern over ``self.zero_args[2] * 4`` bytes starting at offset 0
        (the factor 4 is presumably bytes per float32 -- confirm), then
        launches ``self.kernel`` with ``self.launch_args``.

        Args:
            q: queue passed to the fill and to ``call_cl_kernel``.
            repeat: how many rounds to run.
            unbind: if true, clear ``zero_args`` and ``convert_args`` and
                blank out ``launch_args[2:7]`` with ``None`` once the rounds
                are done.
        """
        for _ in range(repeat):
            cl.enqueue_fill_buffer(
                q,
                self.zero_args[0],
                np.float32(0),
                0,
                self.zero_args[2] * 4,
            )
            call_cl_kernel(self.kernel, q, *self.launch_args)

        if unbind:
            self.zero_args = None
            self.convert_args = None
            self.launch_args[2:7] = [None, None, None, None, None]
예제 #5
0
 def execute(self, q, repeat=1, unbind=True):
     """Launch ``self.kernel`` ``repeat`` times after checking the C dimension.

     The value at ``self.shuffle_args[12]`` must be at least 4; otherwise an
     ``AssertionError`` is raised before anything is launched.

     Args:
         q: queue passed to ``call_cl_kernel``.
         repeat: number of launches.
         unbind: when true, ``launch_args[2:7]`` is overwritten with five
             ``None`` values afterwards. ``shuffle_args`` is left untouched.
     """
     c_dim = self.shuffle_args[12]
     assert c_dim >= 4, "C dim must be 4 or greater for CUDA C backprop kernel"

     # The separate shuffle-kernel launch is intentionally not performed here.
     for _ in range(repeat):
         call_cl_kernel(self.kernel, q, *self.launch_args)

     if not unbind:
         return
     self.launch_args[2:7] = [None] * 5
 def execute(self, q, repeat=1, unbind=True):
     """Check the C dimension, then launch ``self.kernel`` ``repeat`` times.

     Raises ``AssertionError`` when ``self.shuffle_args[12]`` is below 4.

     Args:
         q: queue passed to ``call_cl_kernel``.
         repeat: number of launches.
         unbind: if true, blank out ``launch_args[2:7]`` with ``None`` after
             the launches. ``shuffle_args`` is not modified.
     """
     min_c = 4
     c_value = self.shuffle_args[12]
     assert c_value >= min_c, "C dim must be 4 or greater for CUDA C backprop kernel"

     # Only the main kernel is launched; the shuffle kernel is not run here.
     for _ in range(repeat):
         call_cl_kernel(self.kernel, q, *self.launch_args)

     if unbind:
         self.launch_args[2:7] = [None, None, None, None, None]
예제 #7
0
def calcO(O_cl, M_shape, M_cl):
    """Launch the ``k_calcO`` kernel on ``O_cl``/``M_cl`` and wait for it.

    Uses the module-level ``q``, ``k_calcO``, ``ceil_div``,
    ``call_cl_kernel`` and ``timecheck``. Calls ``q.finish()`` before
    returning.

    Args:
        O_cl: buffer passed as the first kernel argument.
        M_shape: indexable with at least five entries; index 0 is read as
            ``GN``, index 2 as ``GK`` and index 4 as ``tiles``.
        M_cl: buffer passed as the second kernel argument.
    """
    GK = M_shape[2]
    GN = M_shape[0]
    tiles = M_shape[4]

    # Total element count handed to the kernel; each group covers 32 of them.
    num_xinu_tiles = GK * 32 * GN * 32 * tiles * tiles
    group_count = ceil_div(num_xinu_tiles, 32)

    call_cl_kernel(
        k_calcO,
        q, (group_count, 1, 1), (32, 1, 1),
        O_cl, M_cl,
        num_xinu_tiles,
    )
    q.finish()
    timecheck('calced O_cl')
예제 #8
0
def calcM(N, Co, M_cl, U_shape, U_cl, V_shape, V_cl):
    """Launch the ``k_calcM`` kernel on ``M_cl``/``U_cl``/``V_cl`` and wait.

    Uses the module-level ``q``, ``k_calcM``, ``ceil_div``, ``cl``,
    ``call_cl_kernel`` and ``timecheck``. Prints the derived sizes and calls
    ``q.finish()`` before returning.

    Args:
        N: only echoed in the diagnostic print.
        Co: accepted for interface compatibility; the value is not used
            because the channel count is recomputed from ``U_shape``.
        M_cl: buffer passed as the first kernel argument.
        U_shape: indexable with at least five entries; indices 2, 3 and 4
            are read.
        U_cl: buffer passed as the second kernel argument.
        V_shape: indexable with at least five entries; index 2 is read as
            ``GN`` and index 4 as ``tiles``.
        V_cl: buffer passed as the third kernel argument.
    """
    # Recomputed from U_shape; the Co argument is deliberately not consulted.
    derived_co = (U_shape[2] - 1) * 32 + U_shape[4]
    Ci = U_shape[3]
    GK = ceil_div(derived_co, 32)
    tiles = V_shape[4]
    GN = V_shape[2]
    print('GK', GK, 'GN', GN, 'tiles', tiles, 'Co', derived_co, 'Ci', Ci, 'N', N)

    grid = (tiles * tiles, 1, 1)
    block = (32, 16, 1)  # 16 for intel...

    # Two separate local-memory arguments of 32 * 32 * 4 bytes each.
    local_bytes = 32 * 32 * 4
    call_cl_kernel(
        k_calcM,
        q, grid, block,
        M_cl, U_cl, V_cl,
        Ci, 1, tiles, GN, GK,
        cl.LocalMemory(local_bytes), cl.LocalMemory(local_bytes),
    )
    q.finish()
    timecheck('calced M_cl')
예제 #9
0
def calcU(q, W_shape, W_cl, U_cl):
    """Launch the ``k_calcU`` kernel on ``U_cl``/``W_cl`` and wait for it.

    Uses the module-level ``k_calcU``, ``ceil_div``, ``call_cl_kernel`` and
    ``timecheck``. Calls ``q.finish()`` before returning.

    Args:
        q: queue passed to ``call_cl_kernel`` and then finished.
        W_shape: indexable with at least four entries, read as
            ``(Ci, kH, kW, Co)``.
        W_cl: buffer passed as the second kernel argument.
        U_cl: buffer passed as the first kernel argument.
    """
    Ci, kH, kW, Co = W_shape[0], W_shape[1], W_shape[2], W_shape[3]

    # this is adapted from neon's winograd_conv.py:
    GK = ceil_div(Co, 32)

    grid = (GK, Ci, 1)
    block = (32, 1, 1)

    call_cl_kernel(
        k_calcU,
        q, grid, block,
        U_cl, W_cl,
        kH * kW * Co, kW * Co, kW * Co * 2, Co, Ci * 1152,
        Ci, GK)
    q.finish()
    timecheck('calced U_cl')
예제 #10
0
 def shuffle(self, q, Wt, W):
     """Bind ``Wt`` and ``W`` into ``self.shuffle_args`` and launch the shuffle kernel.

     ``Wt`` and ``W`` are written to slots 2 and 3 of ``self.shuffle_args``
     (they stay bound afterwards), and ``self.shuffleKernel`` is launched
     with the full argument list.

     Args:
         q: queue passed to ``call_cl_kernel``.
         Wt: value stored at ``shuffle_args[2]``.
         W: value stored at ``shuffle_args[3]``.
     """
     self.shuffle_args[2:4] = [Wt, W]
     kernel = self.shuffleKernel
     call_cl_kernel(kernel, q, *self.shuffle_args)
예제 #11
0
 def execute(self, q, repeat=1, unbind=True):
     """Launch ``self.kernel`` with ``self.launch_args`` ``repeat`` times.

     Args:
         q: queue passed to ``call_cl_kernel``.
         repeat: number of launches.
         unbind: when true, ``launch_args[2:7]`` is overwritten with five
             ``None`` values afterwards (presumably to drop references to
             the bound buffers -- confirm).
     """
     for _ in range(repeat):
         call_cl_kernel(self.kernel, q, *self.launch_args)

     if not unbind:
         return
     self.launch_args[2:7] = [None] * 5
 def shuffle(self, q, Wt, W):
     """Store ``Wt`` and ``W`` in ``self.shuffle_args`` and run ``self.shuffleKernel``.

     Slots 2 and 3 of ``self.shuffle_args`` are replaced by ``Wt`` and ``W``
     and remain set after the call; the kernel is then launched with the
     whole argument list.

     Args:
         q: queue passed to ``call_cl_kernel``.
         Wt: value stored at ``shuffle_args[2]``.
         W: value stored at ``shuffle_args[3]``.
     """
     bound_pair = [Wt, W]
     self.shuffle_args[2:4] = bound_pair
     call_cl_kernel(self.shuffleKernel, q, *self.shuffle_args)
 def execute(self, q, repeat=1, unbind=True):
     """Run ``self.kernel`` ``repeat`` times, optionally unbinding arguments.

     Args:
         q: queue passed to ``call_cl_kernel``.
         repeat: number of launches.
         unbind: if true, blank out ``launch_args[2:7]`` with ``None`` once
             all launches have been issued.
     """
     for _ in range(repeat):
         call_cl_kernel(self.kernel, q, *self.launch_args)

     if unbind:
         self.launch_args[2:7] = [None, None, None, None, None]