def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:] col_data = hmarray((channels_col, height_col * width_col)) bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" void im2col(float* data_im, float* data_col, int bot_offset) { #pragma omp parallel for for (int index = 0; index < $global_size; index++) { int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; for (int i = 0; i < $kernel_h; ++i) { for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) lib = hm_compile_and_load(im2col) im2col = lib.im2col class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] im2col.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [bottom, col_data]) + (ct.c_int, ) if len(weights.shape) > 2: weights = weights.reshape(weights.shape[0], np.prod(weights.shape[1:])) for i in range(bottom.shape[0]): im2col(bottom, col_data, i * bot_offset) top[i] = weights.dot(col_data).reshape(weights.shape[0], height_col, width_col) top[i] += bias[:, np.newaxis, np.newaxis] return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): bottom = symbol_table[sources[0].name] num = bottom.shape[0] channels = bottom.shape[1] scale_shape = list(bottom.shape) scale_shape[1] = 1 scale = hmarray(tuple(scale_shape)) spatial_dim = int(np.prod(bottom.shape[2:])) count = np.prod(bottom.shape) kernels = Template(""" #include <float.h> #include <math.h> void kernel_copy(float* data, float* out) { #pragma omp parallel for for (int index = 0; index < $count; index++) { out[index] = data[index]; } } void kernel_channel_max(float* data, float* out) { #pragma omp parallel for for (int index = 0; index < $num_times_spatial; index++) { int n = index / $spatial_dim; int s = index % $spatial_dim; float maxval = -FLT_MAX; for (int c = 0; c < $channels; ++c) { maxval = fmax(data[(n * $channels + c) * $spatial_dim + s], maxval); } out[index] = maxval; } } void kernel_channel_subtract(float* channel_max, float* data) { #pragma omp parallel for for (int index = 0; index < $count; index++) { int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] -= channel_max[n * $spatial_dim + s]; } } void kernel_exp(float* data, float* out) { #pragma omp parallel for for (int index = 0; index < $count; index++) { out[index] = exp(data[index]); } } void kernel_channel_sum(float* data, float* channel_sum) { #pragma omp parallel for for (int index = 0; index < $num_times_spatial; index++) { int n = index / $spatial_dim; int s = index % $spatial_dim; float sum = 0; for (int c = 0; c < $channels; ++c) { sum += data[(n * $channels + c) * $spatial_dim + s]; } channel_sum[index] = sum; } } void kernel_channel_div(float* channel_sum, float* data) { #pragma omp parallel for for (int index = 0; index < $count; index++) { int n = index / $channels / $spatial_dim; int s = index % $spatial_dim; data[index] /= channel_sum[n * $spatial_dim + s]; } } """).substitute(count=count, num_times_spatial=num * spatial_dim, channels=channels, spatial_dim=spatial_dim, dim=np.prod(bottom.shape[1:])) lib = hm_compile_and_load(kernels) copy_kern = lib.kernel_copy max_kern = lib.kernel_channel_max sub_kern = lib.kernel_channel_subtract exp_kern = lib.kernel_exp sum_kern = lib.kernel_channel_sum div_kern = lib.kernel_channel_div class SoftmaxLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] top = symbol_table[sinks[0].name] copy_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [bottom, top]) copy_kern(bottom, top) max_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [top, scale]) max_kern(top, scale) sub_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [scale, top]) sub_kern(scale, top) exp_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [top, top]) exp_kern(top, top) sum_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [top, scale]) sum_kern(top, scale) div_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [scale, top]) div_kern(scale, top) return SoftmaxLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): u = symbol_table[sources[0].name] func = Template(""" #include <math.h> #define max(a,b) \ ({ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a > _b ? _a : _b; }) #define min(a,b) \ ({ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; }) void fn(float* Ix, float* v, float* It, float* Iy, float* denom, float* u, float* u_new, float* v_new) { for (int index = 0; index < $size; index++) { float _hm_generated_6; float _hm_generated_7; float _hm_generated_5; float _hm_generated_4; float _hm_generated_8; float _hm_generated_3; float ubar, vbar, t; { int x = index % $width; int y = index / $width; float accum = 0.0; accum += 0.0833333333333f * u[max(y + -1, 0) * $width + max(x + -1, 0)]; accum += 0.166666666667f * u[max(y + -1, 0) * $width + x]; accum += 0.0833333333333f * u[max(y + -1, 0) * $width + min(x + 1, $width - 1)]; accum += 0.166666666667f * u[y * $width + max(x + -1, 0)]; accum += 0.166666666667f * u[y * $width + min(x + 1, $width - 1)]; accum += 0.0833333333333f * u[min(y + 1, $height - 1) * $width + max(x + -1, 0)]; accum += 0.166666666667f * u[min(y + 1, $height - 1) * $width + x]; accum += 0.0833333333333f * u[min(y + 1, $height - 1) * $width + min(x + 1, $width - 1)]; ubar = accum; } { int x = index % $width; int y = index / $width; float accum = 0.0; accum += 0.0833333333333f * v[max(y + -1, 0) * $width + max(x + -1, 0)]; accum += 0.166666666667f * v[max(y + -1, 0) * $width + x]; accum += 0.0833333333333f * v[max(y + -1, 0) * $width + min(x + 1, $width - 1)]; accum += 0.166666666667f * v[y * $width + max(x + -1, 0)]; accum += 0.166666666667f * v[y * $width + min(x + 1, $width - 1)]; accum += 0.0833333333333f * v[min(y + 1, $height - 1) * $width + max(x + -1, 0)]; accum += 0.166666666667f * v[min(y + 1, $height - 1) * $width + x]; accum += 0.0833333333333f * v[min(y + 1, $height - 1) * $width + min(x + 1, $width - 1)]; vbar = accum; } _hm_generated_5 = Iy[index] * vbar; _hm_generated_6 = Ix[index] * ubar; _hm_generated_4 = _hm_generated_6 + _hm_generated_5; _hm_generated_3 = _hm_generated_4 + It[index]; t = _hm_generated_3 / denom[index]; _hm_generated_7 = Ix[index] * t; u_new[index] = ubar - _hm_generated_7; _hm_generated_8 = Iy[index] * t; v_new[index] = vbar - _hm_generated_8; } }""").substitute(size=np.prod(u.shape), width=u.shape[1], height=u.shape[0]) lib = hm_compile_and_load(func) fn = lib.fn class UpdateUVLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): u = symbol_table[sources[0].name] v = symbol_table[sources[1].name] Ix = symbol_table[sources[2].name] Iy = symbol_table[sources[3].name] It = symbol_table[sources[4].name] denom = symbol_table[sources[5].name] new_u = symbol_table[sinks[0].name] new_v = symbol_table[sinks[1].name] fn.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [u, v, Ix, Iy, It, denom, new_u, new_v]) fn(Ix, v, It, Iy, denom, u, new_u, new_v) return UpdateUVLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): im0 = symbol_table[sources[0].name] size = np.prod(im0.shape) func = Template(""" #include <math.h> #define max(a,b) \ ({ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a > _b ? _a : _b; }) #define min(a,b) \ ({ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; }) void fn(float* im0, float* Ix, float* Iy, float* It, float* im1, float* denom) { for (int index = 0; index < $size; index++) { float _hm_generated_2; float _hm_generated_0; float _hm_generated_1; It[index] = im1[index] - im0[index]; { int x = index % $width; int y = index / $width; float accum = 0.0; accum += -0.0833333333333f * im1[max(y + -2, 0) * $width + x]; accum += -0.666666666667f * im1[max(y + -1, 0) * $width + x]; accum += 0.666666666667f * im1[min(y + 1, $height - 1) * $width + x]; accum += 0.0833333333333f * im1[min(y + 2, $height - 1) * $width + x]; Iy[index] = accum; } { int x = index % $width; int y = index / $width; float accum = 0.0; accum += -0.0833333333333f * im1[y * $width + max(x + -2, 0)]; accum += -0.666666666667f * im1[y * $width + max(x + -1, 0)]; accum += 0.666666666667f * im1[y * $width + min(x + 1, $width - 1)]; accum += 0.0833333333333f * im1[y * $width + min(x + 2, $width - 1)]; Ix[index] = accum; } _hm_generated_1 = pow(Iy[index], 2); _hm_generated_2 = pow(Ix[index], 2); _hm_generated_0 = _hm_generated_2 + _hm_generated_1; denom[index] = _hm_generated_0 + $alpha; } }""").substitute(size=size, alpha=keywords['alpha']**2, width=im0.shape[1], height=im0.shape[0]) lib = hm_compile_and_load(func) fn = lib.fn class GradientAndDenomLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): im0 = symbol_table[sources[0].name] im1 = symbol_table[sources[1].name] It = symbol_table[sinks[0].name] Iy = symbol_table[sinks[1].name] Ix = symbol_table[sinks[2].name] denom = symbol_table[sinks[3].name] fn.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [im0, im1, It, Iy, Ix, denom]) fn(im0, Ix, Iy, It, im1, denom) return GradientAndDenomLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): kernel_h, kernel_w = keywords['kernel_size'] pad_h, pad_w = keywords['padding'] stride_h, stride_w = keywords['stride'] num, channels, height, width = symbol_table[sources[0].name].shape channels_col = channels * kernel_h * kernel_w height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1 width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1 out_channels, height_col, width_col = symbol_table[ sinks[0].name].shape[1:] col_data = hmarray((channels_col, height_col * width_col)) bias_multiplier = hmarray( (1, np.prod(symbol_table[sinks[0].name].shape[2:]))) bias_multiplier.fill(1.0) bias_multiplier.sync_ocl() im2col_global_size = channels * height_col * width_col im2col = Template(""" void im2col(float* data_im, float* data_col, int bot_offset) { #pragma omp parallel for for (int index = 0; index < $global_size; index++) { int h_index = index / $width_col; int w_out = index - h_index * $width_col; int channel_in = h_index / $height_col; int h_out = h_index - channel_in * $height_col; int channel_out = channel_in * $kernel_h * $kernel_w; int h_in = h_out * $stride_h - $pad_h; int w_in = w_out * $stride_w - $pad_w; float* data_col_ptr = data_col; data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out; const float* data_im_ptr = data_im + bot_offset; data_im_ptr += (channel_in * $height + h_in) * $width + w_in; for (int i = 0; i < $kernel_h; ++i) { for (int j = 0; j < $kernel_w; ++j) { int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ? data_im_ptr[i * $width + j] : 0; data_col_ptr += $height_col * $width_col; } } } } """).substitute(global_size=im2col_global_size, stride_h=stride_h, stride_w=stride_w, pad_h=pad_h, pad_w=pad_w, kernel_h=kernel_h, kernel_w=kernel_w, width=width, height=height, height_col=height_col, width_col=width_col) lib = hm_compile_and_load(im2col) im2col = lib.im2col class ConvLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] bot_offset = np.prod(bottom.shape[1:]) weights = symbol_table[sources[1].name] bias = symbol_table[sources[2].name] top = symbol_table[sinks[0].name] im2col.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [bottom, col_data]) + (ct.c_int, ) if len(weights.shape) > 2: weights = weights.reshape(weights.shape[0], np.prod(weights.shape[1:])) for i in range(bottom.shape[0]): im2col(bottom, col_data, i * bot_offset) top[i] = weights.dot(col_data).reshape( weights.shape[0], height_col, width_col) top[i] += bias[:, np.newaxis, np.newaxis] return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table): num, channels, height, width = symbol_table[sources[0].name].shape local_size = keywords['local_size'] alpha = keywords['alpha'] k = keywords['k'] beta = keywords['beta'] compute_global = (num * channels * height * width, ) fill_global = (num * height * width, ) kernel = Template(""" #include <math.h> void LRNFillScale(float* in_global, float* scale_global) { for (int index = 0; index < $fill_global; index++) { int w = index % $width; int h = (index / $width) % $height; int n = index / $width / $height; int offset = (n * $channels * $height + h) * $width + w; int step = $height * $width; float* in = in_global + offset; float* scale = scale_global + offset; int head = 0; int pre_pad = ($local_size - 1) / 2; int post_pad = $local_size - pre_pad - 1; float accum_scale = 0; // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < $channels) { accum_scale += in[head * step] * in[head * step]; ++head; } // both add and subtract while (head < $channels) { accum_scale += in[head * step] * in[head * step]; if (head - $local_size >= 0) { accum_scale -= in[(head - $local_size) * step] * \ in[(head - $local_size) * step]; } scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size; ++head; } // subtract only while (head < $channels + post_pad) { if (head - $local_size >= 0) { accum_scale -= in[(head - $local_size) * step] * \ in[(head - $local_size) * step]; } scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size; ++head; } } } void LRNComputeOutput(float* in, float* scale, float* out) { for (int index = 0; index < $compute_global; index++) { out[index] = in[index] * pow(scale[index], (float)$negative_beta); } } """).substitute(width=width, height=height, channels=channels, local_size=local_size, alpha_over_size=float(alpha) / local_size, k=k, negative_beta=-beta, fill_global=fill_global[0], compute_global=compute_global[0]) lib = hm_compile_and_load(kernel) fill_kern = lib.LRNFillScale compute_kern = lib.LRNComputeOutput class LrnLauncher(object): def __init__(self, sources, sinks): self.sources = sources self.sinks = sinks def compile(self): pass def launch(self, symbol_table, wait_for): bottom = symbol_table[sources[0].name] top = symbol_table[sinks[0].name] scale = symbol_table[sinks[1].name] fill_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [bottom, scale]) compute_kern.argtypes = tuple( np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in [bottom, scale, top]) fill_kern(bottom, scale) compute_kern(bottom, scale, top) return LrnLauncher(sources, sinks)