def shuffle(self, queue, dst, src):
    """Launch ``self.kernel`` on ``queue`` with ``dst`` and ``src``.

    ``self.grid`` and ``self.block`` are forwarded to ``call_cl_kernel``
    right after the queue.  The kernel then receives ``dst`` and ``src``
    followed by ``self.BC``, ``self.C``, ``self.AB`` and ``self.A``, in that
    order.
    """
    kernel_args = (dst, src, self.BC, self.C, self.AB, self.A)
    call_cl_kernel(self.kernel, queue, self.grid, self.block, *kernel_args)
def calcV(I_shape, I_cl, V_cl):
    """Enqueue the ``k_calcV`` kernel with ``V_cl`` and ``I_cl`` as buffers.

    ``I_shape`` is indexed as ``(Ci, iH, iW, N)``.  The shift/mask values and
    the grid sizes come from a tiling table adapted from winograd_conv.py.
    The function relies on the module-level ``q`` and ``k_calcV``, prints a
    few derived sizes, calls ``q.finish()`` after the launch and finally
    reports through ``timecheck``.
    """
    Ci, iH, iW, N = I_shape[0], I_shape[1], I_shape[2], I_shape[3]
    tiles = iW // 4  # NOTE: computed but not used below
    oH, oW = iH, iW
    padH = padW = 1

    # winograd_conv.py derives shlN from N (0 when N == 1,
    # len(bin(N - 1)) - 2 when N < 32, otherwise 5); here it is pinned to 5.
    shlN = 5
    tiling_table = {
        0: (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300),  # 4x8  yyxxx
        1: (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201),  # 4x4  yyxxn
        2: (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202),  # 2x4  yxxnn
        3: (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203),  # 1x4  xxnnn
        4: (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104),  # 1x2  xnnnn
        5: (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000),  # 1x1  nnnnn
    }
    # supY / supX are unpacked for completeness but are not used below.
    (shlY, shlX, maskY, shrY, maskX, shrX,
     maskN, supY, supX) = tiling_table.get(shlN)

    GYS = ceil_div(oH, 1 << shlY)
    GXS = ceil_div(oW, 1 << shlX)
    GN = ceil_div(N, 1 << shlN)
    GYS2 = GYS // 2
    GXS2 = GXS * 2
    GY_GX = GXS * GYS

    div_GXS2 = get_div_mul_shift_32(GY_GX, GXS2)
    div_GXS = get_div_mul_shift_32(GY_GX, GXS)
    image_size = 1152 * Ci * GXS * GYS * GN  # NOTE: computed but not used

    print('div_GXS', div_GXS)
    print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GY_GX)

    grid = (GN, GY_GX, Ci)
    block = (32, 1, 1)

    # Kernel arguments, grouped only for readability; order is significant.
    kernel_args = [V_cl, I_cl]
    kernel_args += [iH, iW, N, padH, padW]
    kernel_args += [
        GXS, GYS2, GXS2,
        div_GXS2[0], div_GXS2[1],
        div_GXS[0], div_GXS[1],
    ]
    kernel_args += [shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN]
    kernel_args += [
        iH * iW * N,
        iW * N,
        GY_GX * Ci * 1152,
        GXS * Ci * 1152,
        Ci * 1152,
    ]
    kernel_args += [GXS, GY_GX, GN, Ci]

    call_cl_kernel(k_calcV, q, grid, block, *kernel_args)
    q.finish()
    timecheck('calced V_cl')
def execute(self, q, repeat=1, unbind=True):
    """Zero-fill a buffer and launch ``self.kernel``, ``repeat`` times over.

    Before each launch, ``cl.enqueue_fill_buffer`` writes ``np.float32(0)``
    into ``self.zero_args[0]`` from offset 0 for ``self.zero_args[2] * 4``
    bytes (presumably an element count times 4 bytes per float32; confirm).

    When ``unbind`` is true, ``self.zero_args`` and ``self.convert_args`` are
    reset to ``None`` afterwards and entries 2..6 of ``self.launch_args`` are
    replaced with ``None``.
    """
    for _ in range(repeat):
        target = self.zero_args[0]
        fill_size = self.zero_args[2] * 4
        cl.enqueue_fill_buffer(q, target, np.float32(0), 0, fill_size)
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return

    # Clear the stored arguments so this object no longer refers to them.
    self.zero_args = None
    self.convert_args = None
    self.launch_args[2:7] = (None,) * 5
def execute(self, q, repeat=1, unbind=True):
    """Run ``self.kernel`` ``repeat`` times, zero-filling a buffer each time.

    Each iteration first calls ``cl.enqueue_fill_buffer`` to write
    ``np.float32(0)`` into ``self.zero_args[0]``, starting at offset 0 and
    covering ``self.zero_args[2] * 4`` bytes (presumably element count times
    the float32 size; confirm).  It then launches the kernel with
    ``self.launch_args``.

    When ``unbind`` is true, ``self.zero_args`` and ``self.convert_args`` are
    set to ``None`` and ``self.launch_args[2:7]`` is overwritten with five
    ``None`` values once all launches have been enqueued.
    """
    for _ in range(repeat):
        buffer_to_clear = self.zero_args[0]
        byte_count = self.zero_args[2] * 4
        cl.enqueue_fill_buffer(
            q, buffer_to_clear, np.float32(0), 0, byte_count)
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return

    self.zero_args = None
    self.convert_args = None
    self.launch_args[2:7] = (None,) * 5
def execute(self, q, repeat=1, unbind=True):
    """Launch ``self.kernel`` ``repeat`` times, then optionally unbind.

    ``self.shuffle_args[12]`` (the "C dim" named in the assertion message)
    must be at least 4; otherwise the assertion fails before any launch.

    When ``unbind`` is true, entries 2..6 of ``self.launch_args`` are
    replaced with ``None`` after the launches.
    """
    c_dim = self.shuffle_args[12]
    assert c_dim >= 4, "C dim must be 4 or greater for CUDA C backprop kernel"

    # The shuffle-kernel launch is intentionally not issued here, and
    # shuffle_args[2:4] is likewise left untouched on unbind (both were
    # commented out in the original code).
    for _ in range(repeat):
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return
    self.launch_args[2:7] = (None,) * 5
def execute(self, q, repeat=1, unbind=True):
    """Run ``self.kernel`` on ``q`` ``repeat`` times.

    An assertion first checks that ``self.shuffle_args[12]`` (referred to as
    the "C dim" in its message) is 4 or greater.

    When ``unbind`` is true, ``self.launch_args[2:7]`` is overwritten with
    five ``None`` values once the launches have been issued.
    """
    c_dim = self.shuffle_args[12]
    assert c_dim >= 4, "C dim must be 4 or greater for CUDA C backprop kernel"

    # Only the main kernel is launched here; the shuffle kernel call and the
    # reset of shuffle_args[2:4] were disabled in the original code.
    for _ in range(repeat):
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return
    self.launch_args[2:7] = (None,) * 5
def calcO(O_cl, M_shape, M_cl):
    """Enqueue the ``k_calcO`` kernel with ``O_cl`` and ``M_cl`` as buffers.

    ``GN``, ``GK`` and ``tiles`` are read from ``M_shape[0]``, ``M_shape[2]``
    and ``M_shape[4]``.  Their product ``GK * 32 * GN * 32 * tiles * tiles``
    sizes the first grid dimension (divided by 32, rounded up via
    ``ceil_div``) and is also passed to the kernel as its last argument.

    Uses the module-level ``q`` and ``k_calcO``, calls ``q.finish()`` after
    the launch and then reports through ``timecheck``.
    """
    GN, GK, tiles = M_shape[0], M_shape[2], M_shape[4]
    num_xinu_tiles = GK * 32 * GN * 32 * tiles * tiles

    block = (32, 1, 1)
    grid = (ceil_div(num_xinu_tiles, 32), 1, 1)

    call_cl_kernel(k_calcO, q, grid, block, O_cl, M_cl, num_xinu_tiles)
    q.finish()
    timecheck('calced O_cl')
def calcM(N, Co, M_cl, U_shape, U_cl, V_shape, V_cl):
    """Enqueue the ``k_calcM`` kernel with ``M_cl``, ``U_cl`` and ``V_cl``.

    NOTE: the ``Co`` argument is not used as passed.  It is immediately
    recomputed as ``(U_shape[2] - 1) * 32 + U_shape[4]``.  ``N`` is only
    printed.  ``Ci`` comes from ``U_shape[3]``; ``GN`` and ``tiles`` come
    from ``V_shape[2]`` and ``V_shape[4]``.

    Two separate ``cl.LocalMemory`` objects of ``32 * 32 * 4`` bytes each are
    passed as the final kernel arguments.  Uses the module-level ``q`` and
    ``k_calcM``, calls ``q.finish()`` after the launch and then reports
    through ``timecheck``.
    """
    Ci = U_shape[3]
    Co = (U_shape[2] - 1) * 32 + U_shape[4]
    GK = ceil_div(Co, 32)
    GN, tiles = V_shape[2], V_shape[4]

    print('GK', GK, 'GN', GN, 'tiles', tiles, 'Co', Co, 'Ci', Ci, 'N', N)

    local_bytes = 32 * 32 * 4
    grid = (tiles * tiles, 1, 1)
    block = (32, 16, 1)  # original note: "16 for intel..."

    call_cl_kernel(
        k_calcM, q, grid, block,
        M_cl, U_cl, V_cl,
        Ci, 1, tiles, GN, GK,
        cl.LocalMemory(local_bytes),
        cl.LocalMemory(local_bytes),
    )
    q.finish()
    timecheck('calced M_cl')
def calcU(q, W_shape, W_cl, U_cl):
    """Enqueue the ``k_calcU`` kernel on ``q`` with ``U_cl`` and ``W_cl``.

    ``W_shape`` is indexed as ``(Ci, kH, kW, Co)``.  The grid is
    ``(GK, Ci, 1)`` with ``GK = ceil_div(Co, 32)``, a sizing adapted from
    neon's winograd_conv.py.  After the launch the function calls
    ``q.finish()`` and reports through ``timecheck``.
    """
    Ci, kH, kW, Co = W_shape[0], W_shape[1], W_shape[2], W_shape[3]

    GK = ceil_div(Co, 32)
    filter_size = 1152 * Ci * GK  # NOTE: computed but not used below

    grid = (GK, Ci, 1)
    block = (32, 1, 1)

    row = kW * Co
    kernel_args = (
        U_cl, W_cl,
        kH * row, row, row * 2,
        Co, Ci * 1152,
        Ci, GK,
    )
    call_cl_kernel(k_calcU, q, grid, block, *kernel_args)
    q.finish()
    timecheck('calced U_cl')
def shuffle(self, q, Wt, W):
    """Bind ``Wt`` and ``W`` into ``self.shuffle_args`` and launch the kernel.

    ``Wt`` and ``W`` are stored at positions 2 and 3 of
    ``self.shuffle_args``.  The whole argument list is then unpacked into
    ``call_cl_kernel`` after ``self.shuffleKernel`` and ``q``.
    """
    args = self.shuffle_args
    args[2:4] = (Wt, W)
    call_cl_kernel(self.shuffleKernel, q, *args)
def execute(self, q, repeat=1, unbind=True):
    """Launch ``self.kernel`` on ``q`` ``repeat`` times.

    Every launch unpacks the current ``self.launch_args``.  When ``unbind``
    is true, entries 2..6 of ``self.launch_args`` are replaced with ``None``
    after the launches.
    """
    for _ in range(repeat):
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return
    self.launch_args[2:7] = (None,) * 5
def shuffle(self, q, Wt, W):
    """Launch ``self.shuffleKernel`` on ``q`` with ``Wt`` and ``W`` bound.

    Positions 2 and 3 of ``self.shuffle_args`` are overwritten with ``Wt``
    and ``W`` before the full argument list is unpacked into
    ``call_cl_kernel``.
    """
    args = self.shuffle_args
    args[2:4] = (Wt, W)
    call_cl_kernel(self.shuffleKernel, q, *args)
def execute(self, q, repeat=1, unbind=True):
    """Run ``self.kernel`` ``repeat`` times with ``self.launch_args``.

    When ``unbind`` is true, ``self.launch_args[2:7]`` is overwritten with
    five ``None`` values once all launches have been issued.
    """
    for _ in range(repeat):
        call_cl_kernel(self.kernel, q, *self.launch_args)

    if not unbind:
        return
    self.launch_args[2:7] = (None,) * 5