def test_forward(self):
    @compose
    def fn(bottom, label, top):
        top = SoftmaxForward(bottom, label)
        return top

    bottom = hm.random((6, 13, 1), _range=(-5, 5))
    label = hmarray((6, ))
    for i in range(6):
        # Assign a label per example; the original rebound the whole array.
        label[i] = i % 6
    top = hmarray((6, 13, 1))
    fn(bottom, label, top)
    top.sync_host()
    for i in range(6):
        # Each example's softmax output should sum to 1.
        sum = 0
        for c in range(13):
            sum += top[i, c, 0]
        self.assertTrue(sum > .999)
        self.assertTrue(sum < 1.001)
        # Compare each entry against the direct exp/normalize computation.
        scale = 0
        for c in range(13):
            scale += exp(bottom[i, c, 0])
        for c in range(13):
            self.assertGreaterEqual(top[i, c, 0] + 1e-4,
                                    exp(bottom[i, c, 0]) / scale)
            self.assertLessEqual(top[i, c, 0] - 1e-4,
                                 exp(bottom[i, c, 0]) / scale)
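
# The assertions above amount to comparing against a direct NumPy softmax over
# the channel axis. A minimal reference sketch (the helper name
# `reference_softmax` is ours, not part of the test suite):
def reference_softmax(bottom):
    # bottom has shape (num, channels, 1); normalize over the channel axis.
    shifted = bottom - bottom.max(axis=1, keepdims=True)  # numerical stability
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)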
def set_up(self):
    datum = pb.Datum()
    datum.ParseFromString(next(self.cursor)[1])
    self.mean = self.mean.reshape(datum.channels, datum.height, datum.width)
    height, width = datum.height, datum.width
    if self.crop_size:
        height, width = self.crop_size, self.crop_size
    self.data = hmarray((self.batch_size, datum.channels, height, width))
    self.label = hmarray((self.batch_size, ))
    return [(self.data, None), (self.label, None)]
def test_simple(self):
    a = hm.random((3, 16, 27, 27))
    scale = hmarray((3, 16, 27, 27))
    actual = hmarray((3, 16, 27, 27))

    @compose
    def fn(bottom, scale, top):
        top, scale = LrnForward(bottom, alpha=alpha, beta=beta,
                                local_size=local_size, k=1)
        return top, scale

    fn(a, scale, actual)
    actual.sync_host()
    expected = reference_lrn(a)
    self._check(actual, expected)
def test_relu(self):
    top = hm.zeros((4, 12, 15, 15))
    bottom = hm.random((4, 12, 15, 15), _range=(-1, 1))

    @compose
    def fn(bottom, top):
        top = ReluForward(bottom)
        return top

    fn(bottom, top)
    top.sync_host()
    expected = np.copy(bottom)
    expected[expected < 0] = 0
    self._check(top, expected)

    top_diff = hm.random(top.shape)
    bottom_diff = hmarray(top.shape)

    @compose
    def fn(top_diff, bottom, bottom_diff):
        bottom_diff = ReluBackward(bottom, top_diff)
        return bottom_diff

    fn(top_diff, bottom, bottom_diff)
    bottom_diff.sync_host()
    expected = np.copy(top_diff)
    expected[bottom < 0] = 0
    self._check(bottom_diff, expected)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    bottom = symbol_table[sources[0].name]
    top = symbol_table[sinks[0].name]
    bias_multiplier = hmarray((1, bottom.shape[0]))
    bias_multiplier.fill(1)
    bias_multiplier.sync_ocl()
    N = top.shape[1]
    K = np.prod(bottom.shape[1:])
    M = bottom.shape[0]

    class InnerProductLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            weights = symbol_table[sources[1].name]
            bias = symbol_table[sources[2].name]
            top = symbol_table[sinks[0].name]
            # top = bottom * weights^T
            evt = sgemm(False, True, 1.0, bottom, 0, K, weights, 0, K,
                        0.0, top, 0, N, M, N, K, wait_for=wait_for)
            # top += bias_multiplier * bias (broadcast the bias over the batch)
            evt = sgemm(False, False, 1.0, bias_multiplier, 0, 1, bias, 0, N,
                        1.0, top, 0, N, M, N, 1, wait_for=evt)
            return [evt]

    return InnerProductLauncher(sources, sinks)
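
# The two sgemm calls above amount to a plain affine map per batch row. A
# minimal NumPy sketch under the same (M, K) bottom / (N, K) weights layout
# (the helper name is ours):
def inner_product_reference(bottom, weights, bias):
    flat = bottom.reshape(bottom.shape[0], -1)   # (M, K)
    return flat.dot(weights.T) + bias            # (M, N); bias broadcast over rows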
def test_avg(self):
    shape = (3, 16, 24, 24)
    a = hm.random(shape, _range=(0, 255))
    actual_mask = hmarray((3, 16, 12, 12))
    actual = hmarray((3, 16, 12, 12))
    expected = hmarray((3, 16, 12, 12))
    expected.fill(float('-inf'))

    @compose
    def fn(bottom, mask, top):
        top = AvePoolForward(bottom, kernel_size=(2, 2),
                             padding=(0, 0), stride=(2, 2))
        return top

    fn(a, actual_mask, actual)
    actual.sync_host()
    reference_ave_pool(a, expected, (2, 2), (2, 2), (0, 0))
    self._check(actual, expected)
def test_pool(self):
    shape = (3, 16, 24, 24)
    a = hm.random(shape, _range=(0, 255))
    actual_mask = hmarray((3, 16, 12, 12))
    actual = hmarray((3, 16, 12, 12))
    expected_mask = hmarray((3, 16, 12, 12))
    expected = hmarray((3, 16, 12, 12))
    expected.fill(float('-inf'))

    @compose
    def fn(bottom, mask, top):
        top, mask = PoolForward(bottom, kernel_size=(2, 2),
                                padding=(0, 0), stride=(2, 2))
        return top, mask

    fn(a, actual_mask, actual)
    actual.sync_host()
    actual_mask.sync_host()
    reference_pool(a, expected, expected_mask, (2, 2), (2, 2), (0, 0))
    self._check(actual, expected)
    self._check(actual_mask, expected_mask)

    bottom_diff = hm.zeros(shape)
    expected_bottom_diff = hm.zeros(shape)
    mask = actual_mask
    top_diff = hm.random((3, 16, 12, 12))

    @compose
    def fn(top_diff, mask, bottom_diff):
        bottom_diff = PoolBackward(top_diff, mask, kernel_size=(2, 2),
                                   padding=(0, 0), stride=(2, 2))
        return bottom_diff

    fn(top_diff, mask, bottom_diff)
    bottom_diff.sync_host()
    reference_pool_backward(top_diff, mask, expected_bottom_diff,
                            (2, 2), (2, 2), (0, 0))
    self._check(bottom_diff, expected_bottom_diff)
def reference_lrn(blob):
    output = hmarray(blob.shape)
    shape = blob.shape
    for n in range(shape[0]):
        for c in range(shape[1]):
            for h in range(shape[2]):
                for w in range(shape[3]):
                    c_start = c - (local_size - 1) // 2
                    c_end = min(c_start + local_size, blob.shape[1])
                    c_start = max(c_start, 0)
                    scale = 1.0
                    for i in range(c_start, c_end):
                        value = blob[n, i, h, w]
                        scale += value * value * alpha * local_size
                    output[n, c, h, w] = \
                        blob[n, c, h, w] / pow(scale, beta)
    return output
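
# The loop nest above can be written more compactly with NumPy; this sketch
# mirrors it exactly (same `alpha`, `beta`, `local_size` module globals and
# the same scale definition); the helper name is ours:
def reference_lrn_vectorized(blob):
    out = np.empty_like(blob)
    channels = blob.shape[1]
    for c in range(channels):
        c_start = c - (local_size - 1) // 2
        c_end = min(c_start + local_size, channels)
        c_start = max(c_start, 0)
        window = blob[:, c_start:c_end]
        scale = 1.0 + alpha * local_size * np.sum(window * window, axis=1)
        out[:, c] = blob[:, c] / scale ** beta
    return out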
def set_up(self, bottom, bottom_diff):
    self.bottom, self.bottom_diff = bottom, bottom_diff
    N = self.num_output
    K = np.prod(bottom.shape[1:])
    scale = 1.0 / np.sqrt(self.num_output)
    if self.weights is None:
        self.weights = hmarray.random((N, K), _range=(-scale, scale))
    self.weights_diff = hmarray.zeros((N, K))
    self.weights_history = hmarray.zeros((N, K))
    self.bias_diff = hmarray.zeros((self.num_output, ))
    self.bias_history = hmarray.zeros((self.num_output, ))
    self.bias_multiplier = hmarray((1, self.bottom.shape[0]))
    self.bias_multiplier.fill(1)
    self.bias_multiplier.sync_ocl()
    top_shape = (bottom.shape[0], N)
    self.top = hmarray.zeros(top_shape)
    self.top_diff = hmarray.zeros(top_shape)
    return [(self.top, self.top_diff)]
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    kernel_h, kernel_w = keywords['kernel_size']
    pad_h, pad_w = keywords['padding']
    stride_h, stride_w = keywords['stride']
    num, channels, height, width = symbol_table[sources[0]].shape
    channels_col = channels * kernel_h * kernel_w
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    col_data = hmarray((channels_col, height_col * width_col))
    bias_multiplier = hmarray(
        (1, np.prod(symbol_table[sinks[0]].shape[2:])))
    bias_multiplier.fill(1)
    bias_multiplier.sync_ocl()
    im2col_global_size = channels * height_col * width_col
    col2im_global_size = channels * height * width
    kernels = Template("""
__kernel void col2im(global float* data_col, global float* data_im,
                     int im_offset) {
  if (get_global_id(0) < $col2im_global_size) {
    int index = get_global_id(0);
    float val = 0;
    int w = index % $width + $pad_w;
    int h = (index / $width) % $height + $pad_h;
    int c = index / ($width * $height);
    // compute the start and end of the output
    int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1;
    int w_col_end = min(w / $stride_w + 1, $width_col);
    int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1;
    int h_col_end = min(h / $stride_h + 1, $height_col);
    // equivalent implementation
    int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \
        $height_col * $width_col;
    int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \
        $width_col;
    int coeff_w_col = (1 - $stride_w * $height_col * $width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[im_offset + index] = val;
  }
}
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int w_out = index % $width_col;
    int h_index = index / $width_col;
    int h_out = h_index % $height_col;
    int channel_in = h_index / $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col, width_col=width_col,
                col2im_global_size=col2im_global_size)
    program = cl.clCreateProgramWithSource(context, kernels).build()
    im2col = program['im2col']
    im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
    col2im = program['col2im']
    col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

    class ConvLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = [ast.Name(s, ast.Load()) for s in sources]
            self.sinks = [ast.Name(s, ast.Load()) for s in sinks]

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            queue = queues[0]
            bottom = symbol_table[sources[0]]
            bot_offset = np.prod(bottom.shape[1:])
            top_diff = symbol_table[sources[1]]
            top_offset = np.prod(top_diff.shape[1:])
            weights = symbol_table[sources[2]]
            bottom_diff = symbol_table[sinks[0]]
            bottom_diff.fill(0)
            bottom_diff.sync_ocl()
            weights_diff = symbol_table[sinks[1]]
            weights_diff.fill(0)
            weights_diff.sync_ocl()
            bias_diff = symbol_table[sinks[2]]
            bias_diff.fill(0)
            bias_diff.sync_ocl()
            for i in range(bottom.shape[0]):
                n = np.prod(top_diff.shape[2:])
                # Bias gradient: accumulate top_diff over the spatial dims.
                sgemv(False, top_diff.shape[1], n, 1.0, top_diff,
                      i * top_offset, n, bias_multiplier, 0, 1, 1.0,
                      bias_diff, 0, 1)
                im2col(bottom.ocl_buf, col_data.ocl_buf,
                       i * bot_offset).on(queue, im2col_global_size)
                m = top_diff.shape[1]
                n = col_data.shape[0]
                k = col_data.shape[1]
                # Weight gradient: top_diff * col_data^T, accumulated.
                sgemm(False, True, 1.0, top_diff, i * top_offset, k,
                      col_data, 0, k, 1.0, weights_diff, 0, n, m, n, k)
                m = weights.shape[1]
                n = col_data.shape[1]
                k = weights.shape[0]
                # Bottom gradient in column form: weights^T * top_diff.
                sgemm(True, False, 1.0, weights, 0, m, top_diff,
                      i * top_offset, n, 0.0, col_data, 0, n, m, n, k)
                col2im(col_data.ocl_buf, bottom_diff.ocl_buf,
                       i * bot_offset).on(queue, col2im_global_size)

    return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    bottom = symbol_table[sources[0]]
    num = bottom.shape[0]
    channels = bottom.shape[1]
    scale_shape = list(bottom.shape)
    scale_shape[1] = 1
    scale = hmarray(tuple(scale_shape))
    loss = hmarray(scale_shape)
    spatial_dim = int(np.prod(bottom.shape[2:]))
    count = np.prod(bottom.shape)
    kernels = Template("""
// @begin=cl@
__kernel void kernel_copy(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_channel_max(global const float* data, global float* out) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
__kernel void kernel_channel_subtract(global const float* channel_max,
                                      global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
__kernel void kernel_exp(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = exp(data[index]);
  }
}
__kernel void kernel_channel_sum(global const float* data,
                                 global float* channel_sum) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
__kernel void kernel_channel_div(global const float* channel_sum,
                                 global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
__kernel void SoftmaxLossForward(global const float* prob_data,
                                 global const float* label,
                                 global float* loss) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    const int n = index / $spatial_dim;
    const int s = index % $spatial_dim;
    const int label_value = (int) label[n * $spatial_dim + s];
    loss[index] = -log(
        max(prob_data[n * $dim + label_value * $spatial_dim + s], FLT_MIN));
  }
}
// @end=cl@
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))
    program = cl.clCreateProgramWithSource(context, kernels).build()
    copy_kern = program['kernel_copy']
    copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    max_kern = program['kernel_channel_max']
    max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sub_kern = program['kernel_channel_subtract']
    sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    exp_kern = program['kernel_exp']
    exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sum_kern = program['kernel_channel_sum']
    sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    div_kern = program['kernel_channel_div']
    div_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    loss_forward = program['SoftmaxLossForward']
    loss_forward.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem)

    class SoftmaxLauncher(object):
        def compile(self):
            pass

        def launch(self, symbol_table):
            bottom = symbol_table[sources[0]]
            label = symbol_table[sources[1]]
            prob = symbol_table[sources[2]]
            top = symbol_table[sinks[0]]
            copy_kern(bottom.ocl_buf, prob.ocl_buf).on(queue, (count, ))
            max_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                     (num * spatial_dim, ))
            sub_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
            exp_kern(prob.ocl_buf, prob.ocl_buf).on(queue, (count, ))
            sum_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                     (num * spatial_dim, ))
            div_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
            loss_forward(prob.ocl_buf, label.ocl_buf,
                         loss.ocl_buf).on(queue, (num * spatial_dim, ))
            loss.sync_host()
            top[0] = np.sum(loss) / np.float32(num)
            top.sync_ocl()

    return SoftmaxLauncher()
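
# The kernel pipeline above (channel max, subtract, exp, channel sum, divide,
# then the per-example -log(prob[label])) reduces to the following NumPy
# sketch for the common (num, channels) case. The helper name and the
# simplified 2-D layout are ours:
def softmax_loss_reference(bottom, label):
    shifted = bottom - bottom.max(axis=1, keepdims=True)
    prob = np.exp(shifted)
    prob /= prob.sum(axis=1, keepdims=True)
    picked = prob[np.arange(bottom.shape[0]), label.astype(int)]
    picked = np.maximum(picked, np.finfo(np.float32).tiny)  # mirrors FLT_MIN clamp
    return -np.log(picked).sum() / bottom.shape[0]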
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    bottom = symbol_table[sources[0].name]
    num = bottom.shape[0]
    channels = bottom.shape[1]
    scale_shape = list(bottom.shape)
    scale_shape[1] = 1
    scale = hmarray(tuple(scale_shape))
    spatial_dim = int(np.prod(bottom.shape[2:]))
    count = np.prod(bottom.shape)
    kernels = Template("""
__kernel void kernel_copy(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_channel_max(global const float* data, global float* out) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
__kernel void kernel_channel_subtract(global const float* channel_max,
                                      global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
__kernel void kernel_exp(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = exp(data[index]);
  }
}
__kernel void kernel_channel_sum(global const float* data,
                                 global float* channel_sum) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
__kernel void kernel_channel_div(global const float* channel_sum,
                                 global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))
    program = cl.clCreateProgramWithSource(context, kernels).build()
    copy_kern = program['kernel_copy']
    copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    max_kern = program['kernel_channel_max']
    max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sub_kern = program['kernel_channel_subtract']
    sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    exp_kern = program['kernel_exp']
    exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sum_kern = program['kernel_channel_sum']
    sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    div_kern = program['kernel_channel_div']
    div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

    class SoftmaxLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            top = symbol_table[sinks[0].name]
            # Round global sizes up to a multiple of 16 for the launch.
            if count % 16:
                padded_count = (count + 15) & (~15)
            else:
                padded_count = count
            num_times_spatial = num * spatial_dim
            if num_times_spatial % 16:
                padded_num_times_spatial = (num_times_spatial + 15) & (~15)
            else:
                padded_num_times_spatial = num_times_spatial
            evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=wait_for)
            evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                queue, (padded_num_times_spatial, ), wait_for=evt)
            evt = sub_kern(scale.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            evt = exp_kern(top.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                queue, (padded_num_times_spatial, ), wait_for=evt)
            evt = div_kern(scale.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            return [evt]

    return SoftmaxLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    kernel_h, kernel_w = keywords['kernel_size']
    pad_h, pad_w = keywords['padding']
    stride_h, stride_w = keywords['stride']
    num, channels, height, width = symbol_table[sources[0].name].shape
    channels_col = channels * kernel_h * kernel_w
    # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    out_channels, height_col, width_col = symbol_table[
        sinks[0].name].shape[1:]
    is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
        stride_w == 1 and pad_h == 0 and pad_w == 0
    if not is_1x1:
        col_datas = [
            hmarray((channels_col, height_col * width_col))
            for _ in range(len(queues))
        ]
    bias_multiplier = hmarray(
        (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
    bias_multiplier.fill(1.0)
    bias_multiplier.sync_ocl()
    im2col_global_size = channels * height_col * width_col
    im2col = Template("""
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int h_index = index / $width_col;
    int w_out = index - h_index * $width_col;
    int channel_in = h_index / $height_col;
    int h_out = h_index - channel_in * $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    #pragma unroll
    for (int i = 0; i < $kernel_h; ++i) {
      #pragma unroll
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col, width_col=width_col)
    im2col = cl.clCreateProgramWithSource(context, im2col).build()['im2col']
    im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
    if im2col_global_size % 64:
        padded = (im2col_global_size + 63) & (~63)
    else:
        padded = im2col_global_size

    class ConvLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            bot_offset = np.prod(bottom.shape[1:])
            weights = symbol_table[sources[1].name]
            bias = symbol_table[sources[2].name]
            top = symbol_table[sinks[0].name]
            top_offset = np.prod(top.shape[1:])
            m = weights.shape[0]
            n = np.prod(top.shape[2:])
            k = np.prod(weights.shape[1:])
            # cl.clFinish(queues[0])
            evts = []
            if is_1x1:
                for i in range(bottom.shape[0]):
                    evt = sgemm(False, False, 1.0, weights, 0, k, bottom,
                                i * bot_offset, n, 0.0, top, i * top_offset,
                                n, m, n, k, queues[i % len(queues)],
                                wait_for=wait_for)
                    evt = sgemm(False, False, 1.0, bias, 0, 1,
                                bias_multiplier, 0, n, 1.0, top,
                                i * top_offset, n, m, n, 1,
                                queues[i % len(queues)], wait_for=evt)
                    evts.append(evt)
            else:
                for i in range(bottom.shape[0]):
                    evt = im2col(bottom.ocl_buf,
                                 col_datas[i % len(queues)].ocl_buf,
                                 i * bot_offset).on(
                        queues[i % len(queues)], (padded, ),
                        wait_for=wait_for)
                    evt = sgemm(False, False, 1.0, weights, 0, k,
                                col_datas[i % len(queues)], 0, n, 0.0, top,
                                i * top_offset, n, m, n, k,
                                queues[i % len(queues)], wait_for=evt)
                    evt = sgemm(False, False, 1.0, bias, 0, 1,
                                bias_multiplier, 0, n, 1.0, top,
                                i * top_offset, n, m, n, 1,
                                queues[i % len(queues)], wait_for=evt)
                    evts.append(evt)
            return evts
            # for q in queues:
            #     cl.clFinish(q)

    return ConvLauncher(sources, sinks)
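
# In the 1x1 / stride-1 / no-padding fast path above, the im2col step is
# skipped because the column matrix is just the flattened input. A minimal
# NumPy sketch of what each per-image sgemm pair then computes (the helper
# name is ours, not part of the launcher):
def conv_1x1_reference(bottom_i, weights, bias):
    # bottom_i: (channels, H, W); weights: (out_channels, channels, 1, 1)
    m = weights.shape[0]
    k = weights.shape[1]
    h, w = bottom_i.shape[1:]
    out = weights.reshape(m, k).dot(bottom_i.reshape(k, h * w))
    out += bias[:, np.newaxis]  # broadcast the bias over spatial positions
    return out.reshape(m, h, w)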
conv3 = hmarray.zeros(caffe_net.blobs["conv3"].data.shape)
conv4_filters = caffe_net.params["conv4"][0].data.view(hmarray)
conv4_bias = caffe_net.params["conv4"][1].data.view(hmarray)
conv4 = hmarray.zeros(caffe_net.blobs["conv4"].data.shape)
conv5_filters = caffe_net.params["conv5"][0].data.view(hmarray)
conv5_bias = caffe_net.params["conv5"][1].data.view(hmarray)
conv5 = hmarray.zeros(caffe_net.blobs["conv5"].data.shape)
pool5 = hmarray.zeros(caffe_net.blobs["pool5"].data.shape)
pool5_mask = hmarray.zeros(pool5.shape)
fc6_filters = caffe_net.params["fc6"][0].data.view(hmarray)
fc6_bias = caffe_net.params["fc6"][1].data.view(hmarray)
fc6_bias_multiplier = hmarray((1, pool5.shape[0]))
fc6_bias_multiplier.fill(1)
fc6_bias_multiplier.sync_ocl()
fc6 = hmarray.zeros(caffe_net.blobs["fc6"].data.shape)
fc7_filters = caffe_net.params["fc7"][0].data.view(hmarray)
fc7_bias = caffe_net.params["fc7"][1].data.view(hmarray)
fc7_bias_multiplier = hmarray((1, fc6.shape[0]))
fc7_bias_multiplier.fill(1)
fc7_bias_multiplier.sync_ocl()
fc7 = hmarray.zeros(caffe_net.blobs["fc7"].data.shape)
fc8_filters = caffe_net.params["fc8"][0].data.view(hmarray)
fc8_bias = caffe_net.params["fc8"][1].data.view(hmarray)
fc8_bias_multiplier = hmarray((1, fc7.shape[0]))
fc8_bias_multiplier.fill(1)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    bottom = symbol_table[sources[0].name]
    num = bottom.shape[0]
    channels = bottom.shape[1]
    scale_shape = list(bottom.shape)
    scale_shape[1] = 1
    scale = hmarray(tuple(scale_shape))
    spatial_dim = int(np.prod(bottom.shape[2:]))
    count = np.prod(bottom.shape)
    kernels = Template("""
#include <float.h>
#include <math.h>
void kernel_copy(float* data, float* out) {
  #pragma omp parallel for
  for (int index = 0; index < $count; index++) {
    out[index] = data[index];
  }
}
void kernel_channel_max(float* data, float* out) {
  #pragma omp parallel for
  for (int index = 0; index < $num_times_spatial; index++) {
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = fmax(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
void kernel_channel_subtract(float* channel_max, float* data) {
  #pragma omp parallel for
  for (int index = 0; index < $count; index++) {
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
void kernel_exp(float* data, float* out) {
  #pragma omp parallel for
  for (int index = 0; index < $count; index++) {
    out[index] = exp(data[index]);
  }
}
void kernel_channel_sum(float* data, float* channel_sum) {
  #pragma omp parallel for
  for (int index = 0; index < $num_times_spatial; index++) {
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
void kernel_channel_div(float* channel_sum, float* data) {
  #pragma omp parallel for
  for (int index = 0; index < $count; index++) {
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))
    lib = hm_compile_and_load(kernels)
    copy_kern = lib.kernel_copy
    max_kern = lib.kernel_channel_max
    sub_kern = lib.kernel_channel_subtract
    exp_kern = lib.kernel_exp
    sum_kern = lib.kernel_channel_sum
    div_kern = lib.kernel_channel_div

    class SoftmaxLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            top = symbol_table[sinks[0].name]
            copy_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [bottom, top])
            copy_kern(bottom, top)
            max_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [top, scale])
            max_kern(top, scale)
            sub_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [scale, top])
            sub_kern(scale, top)
            exp_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [top, top])
            exp_kern(top, top)
            sum_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [top, scale])
            sum_kern(top, scale)
            div_kern.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [scale, top])
            div_kern(scale, top)

    return SoftmaxLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    bottom = symbol_table[sources[0].name]
    num = bottom.shape[0]
    channels = bottom.shape[1]
    scale_shape = list(bottom.shape)
    scale_shape[1] = 1
    scale = hmarray(tuple(scale_shape))
    spatial_dim = int(np.prod(bottom.shape[2:]))
    count = np.prod(bottom.shape)
    kernels = Template("""
__kernel void kernel_copy(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_channel_max(global const float* data, global float* out) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
__kernel void kernel_channel_subtract(global const float* channel_max,
                                      global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
__kernel void kernel_exp(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = exp(data[index]);
  }
}
__kernel void kernel_channel_sum(global const float* data,
                                 global float* channel_sum) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
__kernel void kernel_channel_div(global const float* channel_sum,
                                 global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))
    program = cl.clCreateProgramWithSource(context, kernels).build()
    copy_kern = program['kernel_copy']
    copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    max_kern = program['kernel_channel_max']
    max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sub_kern = program['kernel_channel_subtract']
    sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    exp_kern = program['kernel_exp']
    exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    sum_kern = program['kernel_channel_sum']
    sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
    div_kern = program['kernel_channel_div']
    div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

    class SoftmaxLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            top = symbol_table[sinks[0].name]
            # Round global sizes up to a multiple of 16 for the launch.
            if count % 16:
                padded_count = (count + 15) & (~15)
            else:
                padded_count = count
            num_times_spatial = num * spatial_dim
            if num_times_spatial % 16:
                padded_num_times_spatial = (num_times_spatial + 15) & (~15)
            else:
                padded_num_times_spatial = num_times_spatial
            evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=wait_for)
            evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                queue, (padded_num_times_spatial, ), wait_for=evt)
            evt = sub_kern(scale.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            evt = exp_kern(top.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                queue, (padded_num_times_spatial, ), wait_for=evt)
            evt = div_kern(scale.ocl_buf, top.ocl_buf).on(
                queue, (padded_count, ), wait_for=evt)
            return [evt]

    return SoftmaxLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    kernel_h, kernel_w = keywords['kernel_size']
    pad_h, pad_w = keywords['padding']
    stride_h, stride_w = keywords['stride']
    num, channels, height, width = symbol_table[sources[0].name].shape
    channels_col = channels * kernel_h * kernel_w
    # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    out_channels, height_col, width_col = symbol_table[
        sinks[0].name].shape[1:]
    is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
        stride_w == 1 and pad_h == 0 and pad_w == 0
    if not is_1x1:
        col_datas = [hmarray((channels_col, height_col * width_col))
                     for _ in range(len(queues))]
    bias_multiplier = hmarray(
        (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
    bias_multiplier.fill(1.0)
    bias_multiplier.sync_ocl()
    im2col_global_size = channels * height_col * width_col
    im2col = Template("""
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int h_index = index / $width_col;
    int w_out = index - h_index * $width_col;
    int channel_in = h_index / $height_col;
    int h_out = h_index - channel_in * $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    #pragma unroll
    for (int i = 0; i < $kernel_h; ++i) {
      #pragma unroll
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col, width_col=width_col)
    im2col = cl.clCreateProgramWithSource(
        context, im2col
    ).build()['im2col']
    im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
    if im2col_global_size % 64:
        padded = (im2col_global_size + 63) & (~63)
    else:
        padded = im2col_global_size

    class ConvLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            bot_offset = np.prod(bottom.shape[1:])
            weights = symbol_table[sources[1].name]
            bias = symbol_table[sources[2].name]
            top = symbol_table[sinks[0].name]
            top_offset = np.prod(top.shape[1:])
            m = weights.shape[0]
            n = np.prod(top.shape[2:])
            k = np.prod(weights.shape[1:])
            # cl.clFinish(queues[0])
            evts = []
            if is_1x1:
                for i in range(bottom.shape[0]):
                    evt = sgemm(False, False, 1.0, weights, 0, k, bottom,
                                i * bot_offset, n, 0.0, top, i * top_offset,
                                n, m, n, k, queues[i % len(queues)],
                                wait_for=wait_for)
                    evt = sgemm(False, False, 1.0, bias, 0, 1,
                                bias_multiplier, 0, n, 1.0, top,
                                i * top_offset, n, m, n, 1,
                                queues[i % len(queues)], wait_for=evt)
                    evts.append(evt)
            else:
                for i in range(bottom.shape[0]):
                    evt = im2col(bottom.ocl_buf,
                                 col_datas[i % len(queues)].ocl_buf,
                                 i * bot_offset
                                 ).on(queues[i % len(queues)], (padded, ),
                                      wait_for=wait_for)
                    evt = sgemm(False, False, 1.0, weights, 0, k,
                                col_datas[i % len(queues)], 0, n, 0.0, top,
                                i * top_offset, n, m, n, k,
                                queues[i % len(queues)], wait_for=evt)
                    evt = sgemm(False, False, 1.0, bias, 0, 1,
                                bias_multiplier, 0, n, 1.0, top,
                                i * top_offset, n, m, n, 1,
                                queues[i % len(queues)], wait_for=evt)
                    evts.append(evt)
            return evts
            # for q in queues:
            #     cl.clFinish(q)

    return ConvLauncher(sources, sinks)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    kernel_h, kernel_w = keywords['kernel_size']
    pad_h, pad_w = keywords['padding']
    stride_h, stride_w = keywords['stride']
    num, channels, height, width = symbol_table[sources[0].name].shape
    channels_col = channels * kernel_h * kernel_w
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    out_channels, height_col, width_col = symbol_table[
        sinks[0].name].shape[1:]
    col_data = hmarray((channels_col, height_col * width_col))
    bias_multiplier = hmarray(
        (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
    bias_multiplier.fill(1.0)
    bias_multiplier.sync_ocl()
    im2col_global_size = channels * height_col * width_col
    im2col = Template("""
void im2col(float* data_im, float* data_col, int bot_offset) {
  #pragma omp parallel for
  for (int index = 0; index < $global_size; index++) {
    int h_index = index / $width_col;
    int w_out = index - h_index * $width_col;
    int channel_in = h_index / $height_col;
    int h_out = h_index - channel_in * $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col, width_col=width_col)
    lib = hm_compile_and_load(im2col)
    im2col = lib.im2col

    class ConvLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            bot_offset = np.prod(bottom.shape[1:])
            weights = symbol_table[sources[1].name]
            bias = symbol_table[sources[2].name]
            top = symbol_table[sinks[0].name]
            im2col.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [bottom, col_data]) + (ct.c_int, )
            if len(weights.shape) > 2:
                weights = weights.reshape(weights.shape[0],
                                          np.prod(weights.shape[1:]))
            for i in range(bottom.shape[0]):
                im2col(bottom, col_data, i * bot_offset)
                top[i] = weights.dot(col_data).reshape(weights.shape[0],
                                                       height_col, width_col)
                top[i] += bias[:, np.newaxis, np.newaxis]

    return ConvLauncher(sources, sinks)
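
# For reference, the column matrix produced by the im2col kernel above lays
# out row (c * kernel_h + i) * kernel_w + j as the input value at channel c
# and kernel offset (i, j) for every output position, so the convolution
# becomes the single weights.dot(col_data) call in launch(). A minimal NumPy
# sketch for one image (helper name and explicit-loop style are ours):
def im2col_reference(image, kernel_h, kernel_w, pad_h, pad_w,
                     stride_h, stride_w):
    channels, height, width = image.shape
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    col = np.zeros((channels * kernel_h * kernel_w, height_col * width_col),
                   dtype=image.dtype)
    padded = np.pad(image, ((0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                    mode='constant')
    for c in range(channels):
        for i in range(kernel_h):
            for j in range(kernel_w):
                row = (c * kernel_h + i) * kernel_w + j
                patch = padded[c, i:i + stride_h * height_col:stride_h,
                               j:j + stride_w * width_col:stride_w]
                col[row] = patch.reshape(-1)
    return col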
conv3 = hmarray.zeros(caffe_net.blobs['conv3'].data.shape)
conv4_filters = caffe_net.params['conv4'][0].data.view(hmarray)
conv4_bias = caffe_net.params['conv4'][1].data.view(hmarray)
conv4 = hmarray.zeros(caffe_net.blobs['conv4'].data.shape)
conv5_filters = caffe_net.params['conv5'][0].data.view(hmarray)
conv5_bias = caffe_net.params['conv5'][1].data.view(hmarray)
conv5 = hmarray.zeros(caffe_net.blobs['conv5'].data.shape)
pool5 = hmarray.zeros(caffe_net.blobs['pool5'].data.shape)
pool5_mask = hmarray.zeros(pool5.shape)
fc6_filters = caffe_net.params['fc6'][0].data.view(hmarray)
fc6_bias = caffe_net.params['fc6'][1].data.view(hmarray)
fc6_bias_multiplier = hmarray((1, pool5.shape[0]))
fc6_bias_multiplier.fill(1)
fc6 = hmarray.zeros(caffe_net.blobs['fc6'].data.shape)
fc7_filters = caffe_net.params['fc7'][0].data.view(hmarray)
fc7_bias = caffe_net.params['fc7'][1].data.view(hmarray)
fc7_bias_multiplier = hmarray((1, fc6.shape[0]))
fc7_bias_multiplier.fill(1)
fc7 = hmarray.zeros(caffe_net.blobs['fc7'].data.shape)
fc8_filters = caffe_net.params['fc8'][0].data.view(hmarray)
fc8_bias = caffe_net.params['fc8'][1].data.view(hmarray)
fc8_bias_multiplier = hmarray((1, fc7.shape[0]))
fc8_bias_multiplier.fill(1)
fc8 = hmarray.zeros(caffe_net.blobs['fc8'].data.shape)
def get_launcher(cls, sources, sinks, keywords, symbol_table):
    kernel_h, kernel_w = keywords['kernel_size']
    pad_h, pad_w = keywords['padding']
    stride_h, stride_w = keywords['stride']
    num, channels, height, width = symbol_table[sources[0].name].shape
    channels_col = channels * kernel_h * kernel_w
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    out_channels, height_col, width_col = symbol_table[
        sinks[0].name].shape[1:]
    col_data = hmarray((channels_col, height_col * width_col))
    bias_multiplier = hmarray(
        (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
    bias_multiplier.fill(1.0)
    bias_multiplier.sync_ocl()
    im2col_global_size = channels * height_col * width_col
    im2col = Template("""
void im2col(float* data_im, float* data_col, int bot_offset) {
  #pragma omp parallel for
  for (int index = 0; index < $global_size; index++) {
    int h_index = index / $width_col;
    int w_out = index - h_index * $width_col;
    int channel_in = h_index / $height_col;
    int h_out = h_index - channel_in * $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col, width_col=width_col)
    lib = hm_compile_and_load(im2col)
    im2col = lib.im2col

    class ConvLauncher(object):
        def __init__(self, sources, sinks):
            self.sources = sources
            self.sinks = sinks

        def compile(self):
            pass

        def launch(self, symbol_table, wait_for):
            bottom = symbol_table[sources[0].name]
            bot_offset = np.prod(bottom.shape[1:])
            weights = symbol_table[sources[1].name]
            bias = symbol_table[sources[2].name]
            top = symbol_table[sinks[0].name]
            im2col.argtypes = tuple(
                np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                for p in [bottom, col_data]) + (ct.c_int, )
            if len(weights.shape) > 2:
                weights = weights.reshape(weights.shape[0],
                                          np.prod(weights.shape[1:]))
            for i in range(bottom.shape[0]):
                im2col(bottom, col_data, i * bot_offset)
                top[i] = weights.dot(col_data).reshape(
                    weights.shape[0], height_col, width_col)
                top[i] += bias[:, np.newaxis, np.newaxis]

    return ConvLauncher(sources, sinks)