def compile(self): if self.kernel is None: params = set(self.sources) | set(self.sinks) seen_decls = set() seen_params = set() decls = [] filtered = set() for param in params: if param.level == 'register': if param.name not in seen_decls: seen_decls.add(param.name) decls.append('float {}'.format(param.name)) else: if param.name not in seen_params: seen_params.add(param.name) filtered.add(param) self.params = list(filtered) self.params.sort(key=lambda x: x.name) params = [] sinks = set(sink.name for sink in self.sinks) for param in self.params: if param.name in sinks: str = "global float* {}".format(param.name) else: str = "global const float* {}".format(param.name) params.append(str) params_str = ", ".join(params) decls = ";\n\t\t\t".join(decls) + ";\n" kernel_name = get_unique_kernel_name() kernel = Template(""" __kernel void $name($params) { int index = get_global_id(0); if (index < $num_work_items) { $decls $body } } """).substitute(name=kernel_name, params=params_str, body=self.body, decls=decls, num_work_items=self.launch_parameters[0]) # print([p.name for p in self.params]) print(kernel) self.kernel_str = kernel kernel = cl.clCreateProgramWithSource( context, kernel).build()[kernel_name] kernel.argtypes = tuple(cl.cl_mem for _ in self.params) self.kernel = kernel
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:] is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \ stride_w == 1 and pad_h == 0 and pad_w == 0 if not is_1x1: col_datas = [hmarray((channels_col, height_col * width_col)) for _ in range(len(queues))] bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" __kernel void im2col(global const float* data_im, global float* data_col, int bot_offset) { if (get_global_id(0) < $global_size) { int index = get_global_id(0); int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; global float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; global const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; #pragma unroll for (int i = 0; i < $kernel_h; ++i) { #pragma unroll for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) im2col = cl.clCreateProgramWithSource( context, im2col ).build()['im2col'] im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) if im2col_global_size % 64: padded = (im2col_global_size + 63) & (~63) else: padded = im2col_global_size class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] top_offset = np.prod(top.shape[1:]) m = weights.shape[0] n = np.prod(top.shape[2:]) k = np.prod(weights.shape[1:]) # cl.clFinish(queues[0]) evts = [] if is_1x1: for i in range(bottom.shape[0]): evt = sgemm(False, False, 1.0, weights, 0, k, bottom, i * bot_offset, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=wait_for) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) else: for i in range(bottom.shape[0]): evt = im2col(bottom.ocl_buf, col_datas[i % len(queues)].ocl_buf, i * bot_offset ).on(queues[i % len(queues)], (padded, ), wait_for=wait_for) evt = sgemm(False, False, 1.0, weights, 0, k, col_datas[i % len(queues)], 0, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=evt) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) return evts # for q in queues: # cl.clFinish(q) return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[ sinks[0].name].shape[1:] is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \ stride_w == 1 and pad_h == 0 and pad_w == 0 if not is_1x1: col_datas = [ hmarray((channels_col, height_col * width_col)) for _ in range(len(queues)) ] bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" __kernel void im2col(global const float* data_im, global float* data_col, int bot_offset) { if (get_global_id(0) < $global_size) { int index = get_global_id(0); int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; global float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; global const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; #pragma unroll for (int i = 0; i < $kernel_h; ++i) { #pragma unroll for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) im2col = cl.clCreateProgramWithSource(context, im2col).build()['im2col'] im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int) if im2col_global_size % 64: padded = (im2col_global_size + 63) & (~63) else: padded = im2col_global_size class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] top_offset = np.prod(top.shape[1:]) m = weights.shape[0] n = np.prod(top.shape[2:]) k = np.prod(weights.shape[1:]) # cl.clFinish(queues[0]) evts = [] if is_1x1: for i in range(bottom.shape[0]): evt = sgemm(False, False, 1.0, weights, 0, k, bottom, i * bot_offset, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=wait_for) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) else: for i in range(bottom.shape[0]): evt = im2col(bottom.ocl_buf, col_datas[i % len(queues)].ocl_buf, i * bot_offset).on( queues[i % len(queues)], (padded, ), wait_for=wait_for) evt = sgemm(False, False, 1.0, weights, 0, k, col_datas[i % len(queues)], 0, n, 0.0, top, i * top_offset, n, m, n, k, queues[i % len(queues)], wait_for=evt) evt = sgemm(False, False, 1.0, bias, 0, 1, bias_multiplier, 0, n, 1.0, top, i * top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt) evts.append(evt) return evts # for q in queues: # cl.clFinish(q) return ConvLauncher(sources, sinks)