예제 #1
0
        def compile(self):
            if self.kernel is None:
                params = set(self.sources) | set(self.sinks)
                seen_decls = set()
                seen_params = set()
                decls = []
                filtered = set()
                for param in params:
                    if param.level == 'register':
                        if param.name not in seen_decls:
                            seen_decls.add(param.name)
                            decls.append('float {}'.format(param.name))
                    else:
                        if param.name not in seen_params:
                            seen_params.add(param.name)
                            filtered.add(param)
                self.params = list(filtered)
                self.params.sort(key=lambda x: x.name)
                params = []
                sinks = set(sink.name for sink in self.sinks)
                for param in self.params:
                    if param.name in sinks:
                        str = "global float* {}".format(param.name)
                    else:
                        str = "global const float* {}".format(param.name)
                    params.append(str)
                params_str = ", ".join(params)
                decls = ";\n\t\t\t".join(decls) + ";\n"
                kernel_name = get_unique_kernel_name()
                kernel = Template("""
    __kernel void $name($params) {
        int index = get_global_id(0);
        if (index < $num_work_items) {
            $decls
$body
        }
    }
        """).substitute(name=kernel_name,
                        params=params_str,
                        body=self.body,
                        decls=decls,
                        num_work_items=self.launch_parameters[0])
                # print([p.name for p in self.params])
                print(kernel)
                self.kernel_str = kernel
                kernel = cl.clCreateProgramWithSource(
                    context, kernel).build()[kernel_name]
                kernel.argtypes = tuple(cl.cl_mem for _ in self.params)
                self.kernel = kernel
예제 #2
0
        def compile(self):
            if self.kernel is None:
                params = set(self.sources) | set(self.sinks)
                seen_decls = set()
                seen_params = set()
                decls = []
                filtered = set()
                for param in params:
                    if param.level == 'register':
                        if param.name not in seen_decls:
                            seen_decls.add(param.name)
                            decls.append('float {}'.format(param.name))
                    else:
                        if param.name not in seen_params:
                            seen_params.add(param.name)
                            filtered.add(param)
                self.params = list(filtered)
                self.params.sort(key=lambda x: x.name)
                params = []
                sinks = set(sink.name for sink in self.sinks)
                for param in self.params:
                    if param.name in sinks:
                        str = "global float* {}".format(param.name)
                    else:
                        str = "global const float* {}".format(param.name)
                    params.append(str)
                params_str = ", ".join(params)
                decls = ";\n\t\t\t".join(decls) + ";\n"
                kernel_name = get_unique_kernel_name()
                kernel = Template("""
    __kernel void $name($params) {
        int index = get_global_id(0);
        if (index < $num_work_items) {
            $decls
$body
        }
    }
        """).substitute(name=kernel_name, params=params_str, body=self.body, decls=decls,
                        num_work_items=self.launch_parameters[0])
                # print([p.name for p in self.params])
                print(kernel)
                self.kernel_str = kernel
                kernel = cl.clCreateProgramWithSource(
                    context, kernel).build()[kernel_name]
                kernel.argtypes = tuple(cl.cl_mem for _ in self.params)
                self.kernel = kernel
예제 #3
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [hmarray((channels_col, height_col * width_col))
                            for _ in range(len(queues))]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size, stride_h=stride_h,
                    stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                    kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                    height=height, height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(
                context, im2col
            ).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        bottom, i * bot_offset, n, 0.0,
                                        top, i * top_offset, n, m, n,
                                        k, queues[i % len(queues)], wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0, top, i *
                                        top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            evt = im2col(bottom.ocl_buf,
                                        col_datas[i % len(queues)].ocl_buf,
                                        i * bot_offset
                                        ).on(queues[i % len(queues)], (padded, ),
                                            wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        col_datas[i % len(queues)],
                                        0, n, 0.0, top, i * top_offset, n, m, n,
                                        k, queues[i % len(queues)], wait_for=evt)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0, top, i *
                                        top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)
            return ConvLauncher(sources, sinks)
예제 #4
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[
                sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [
                    hmarray((channels_col, height_col * width_col))
                    for _ in range(len(queues))
                ]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size,
                    stride_h=stride_h,
                    stride_w=stride_w,
                    pad_h=pad_h,
                    pad_w=pad_w,
                    kernel_h=kernel_h,
                    kernel_w=kernel_w,
                    width=width,
                    height=height,
                    height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(context,
                                                  im2col).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        bottom,
                                        i * bot_offset,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            evt = im2col(bottom.ocl_buf,
                                         col_datas[i % len(queues)].ocl_buf,
                                         i * bot_offset).on(
                                             queues[i % len(queues)],
                                             (padded, ),
                                             wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        col_datas[i % len(queues)],
                                        0,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)

            return ConvLauncher(sources, sinks)