def setup_opencl(data, cube_size):
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)

        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()

        score_matrix = program['score_matrix_to_rms']
        score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_mem, pycl.cl_mem, pycl.cl_int,
                                 pycl.cl_int)

    sub_divisions = cube_size**3

    with timeify("Creating buffers"):
        in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue,
                                                     data['in_r'],
                                                     blocking=blocking)
        in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue,
                                                     data['in_g'],
                                                     blocking=blocking)
        in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue,
                                                     data['in_b'],
                                                     blocking=blocking)

        out_r = data['out_r']
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue,
                                                      out_r,
                                                      blocking=blocking)

        score = array.array('f', [0 for x in range(sub_divisions)])
        score_buf, in_evt5 = pycl.buffer_from_pyarray(queue,
                                                      score,
                                                      blocking=blocking)

    with timeify("Run kernel r"):
        run_evt = score_matrix(
            #in_r_buf, in_g_buf, in_b_buf, out_r_buf, score_buf,
            in_r_buf,
            in_g_buf,
            in_b_buf,
            in_r_buf,
            score_buf,
            len(data['in_r']),
            cube_size,
            wait_for=[in_evt1, in_evt2, in_evt3, in_evt4,
                      in_evt5]).on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, evt = pycl.buffer_to_pyarray(queue,
                                                     score_buf,
                                                     wait_for=run_evt,
                                                     like=score)

    return score_from_gpu
示例#2
0
 def finalize(self, tree, program_config):
     arg_cfg = program_config[0]
     fn = WarpImg2DConcreteOcl()
     program = clCreateProgramWithSource(fn.context,
                                         tree.codegen()).build()
     ptr = program[self.entry_point]
     return fn.finalize(ptr, arg_cfg[0][1])
示例#3
0
 def _gen_reduce_for_loop(self, loop, var, size):
     looplen1 = loop.test.right
     loopincr = loop.incr.value.value
     kernel_name = self._gen_unique_kernel_name()
     kernel_src = StringTemplate("""
       __kernel void $kernel_name(__global float * $arr) {
         int x = get_global_id(0);
         float sum = $arr[x];
         #pragma unroll
         for (int i = 1; i < $batch_size; ++ i) {
           sum += $arr[i * $size + x];
         }
         $arr[x] = sum;
       }
     """, {'batch_size': C.Constant(self.batch_size),
           'arr': C.SymbolRef(var),
           'size': C.Constant(size),
           'kernel_name': C.SymbolRef(kernel_name)})
     program = cl.clCreateProgramWithSource(
         latte.config.cl_ctx, kernel_src.codegen()).build()
     kernel = program[kernel_name]
     self.kernels[kernel_name] = kernel
     kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem))
     return StringTemplate(
         """
         size_t global_size_{kernel_name}[1] = {{{looplen1}}};
         clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
         clFinish(queue);
         """.format(
             kernel_name=kernel_name, 
             looplen1=size)
     )
示例#4
0
    def finalize(self, tree, program_config):
        arg_cfg, tune_cfg = program_config
        param_types = [
            np.ctypeslib.ndpointer(arg.dtype, arg.ndim, arg.shape)
            for arg in arg_cfg + (self.output, )
        ]

        if self.backend == StencilOclTransformer:
            param_types.append(param_types[0])
            fn = OclStencilFunction()
            program = clCreateProgramWithSource(fn.context,
                                                tree.codegen()).build()
            stencil_kernel_ptr = program['stencil_kernel']
            global_size = tuple(dim - 2 * self.kernel.ghost_depth
                                for dim in arg_cfg[0].shape)
            finalized = fn.finalize(stencil_kernel_ptr, global_size,
                                    self.kernel.ghost_depth, self.output)
        else:
            param_types.append(POINTER(c_float))
            kernel_sig = CFUNCTYPE(c_void_p, *param_types)
            fn = StencilFunction()
            finalized = fn.finalize(tree, "stencil_kernel", kernel_sig,
                                    self.output)
        self.output = None
        return finalized
示例#5
0
文件: pyr_up.py 项目: lowks/hindemith
 def finalize(self, tree, program_config):
     arg_cfg, tune_cfg = program_config
     len_x = arg_cfg[0][1][0]
     len_y = arg_cfg[0][1][1]
     fn = OclFunc2()
     program = clCreateProgramWithSource(fn.context, tree.codegen()).build()
     ptr = program[self.entry_point]
     return fn.finalize(ptr, (len_x, len_y))
示例#6
0
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        bottom_diff = symbol_table[sinks[0]]
        count = np.prod(bottom_diff.shape)
        num = bottom_diff.shape[0]
        spatial_dim = int(np.prod(bottom_diff.shape[2:]))
        kernels = Template("""
__kernel void kernel_copy(global const float* data,
                                 global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_scale(float scale, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] *= scale;
  }
}
__kernel void SoftmaxLossBackwardGPU(global const float* label,
    global float* bottom_diff) {
  int index = get_global_id(0);
  if (index < $global_size) {
    const int n = index / $spatial_dim;
    const int s = index % $spatial_dim;
    const int label_value = (int)label[n * $spatial_dim + s];
    bottom_diff[n * $dim + label_value * $spatial_dim + s] -= 1;
  }
}""").substitute(count=count, spatial_dim=spatial_dim,
                 dim=np.prod(bottom_diff.shape[1:]),
                 global_size=num*spatial_dim)

        program = cl.clCreateProgramWithSource(context, kernels).build()
        copy = program['kernel_copy']
        copy.argtypes = (cl.cl_mem, cl.cl_mem)
        scale = program['kernel_scale']
        scale.argtypes = (cl.cl_float, cl.cl_mem)
        backward = program['SoftmaxLossBackwardGPU']
        backward.argtypes = (cl.cl_mem, cl.cl_mem)

        class Launcher(object):
            def compile(self):
                pass

            def launch(self, symbol_table):
                bottom_diff = symbol_table[sinks[0]]
                # top_diff = symbol_table[sources[0]]
                label = symbol_table[sources[1]]
                prob = symbol_table[sources[2]]
                copy(prob.ocl_buf, bottom_diff.ocl_buf).on(
                    queue, (np.prod(prob.shape),))
                backward(label.ocl_buf, bottom_diff.ocl_buf).on(
                    queue, (num * spatial_dim), )
                loss_weight = 1.0
                scale(np.float32(loss_weight / float(num)),
                      bottom_diff.ocl_buf).on(queue, (np.prod(prob.shape), ))
        return Launcher()
示例#7
0
 def finalize(self, tree, program_config):
     arg_cfg, tune_cfg = program_config
     fn = OclFunc()
     program = clCreateProgramWithSource(
         fn.context, tree.codegen()).build()
     ptr = program[self.entry_point]
     return fn.finalize(
         ptr, (int(arg_cfg[0][2][1] / 2), int(arg_cfg[0][2][0] / 2)), self.output_name
     )
示例#8
0
    def transform(self, py_ast, program_config):
        """
        Convert the Python AST to a C AST according to the directions
        given in program_config.
        """
        A = program_config[0]
        len_A = np.prod(A._shape_)
        inner_type = A._dtype_.type()
        # browser_show_ast(py_ast,'tmp.png')
        apply_one = PyBasicConversions().visit(py_ast.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type

        apply_kernel = FunctionDecl(
            None,
            "apply_kernel",
            params=[SymbolRef("A", A()).set_global()],
            defn=[
                Assign(SymbolRef("i", ct.c_int()), get_global_id(0)),
                If(Lt(SymbolRef("i"), Constant(len_A)), [
                    Assign(
                        ArrayRef(SymbolRef("A"), SymbolRef("i")),
                        FunctionCall(
                            SymbolRef("apply"),
                            [ArrayRef(SymbolRef("A"), SymbolRef("i"))])),
                ], []),
            ]).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(
            r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif
        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf) {
            size_t global = $n;
            size_t local = 32;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);

        }
        """, {'n': Constant(len_A + 32 - (len_A % 32))})

        proj = Project([kernel, CFile("generated", [control])])
        fn = OpFunction()

        program = cl.clCreateProgramWithSource(fn.context,
                                               kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel,
                                  cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
示例#9
0
    def finalize(self, transform_result, program_config):
        ocl_kernel = transform_result[0]
        c_controller = transform_result[1]
        proj = Project([ocl_kernel, c_controller])
        fn = ConcreteReduction()                        # define the ConcreteSpecializeFunction subclass to use

        program = cl.clCreateProgramWithSource(fn.context, ocl_kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']
        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
示例#10
0
    def finalize(self, transform_result, program_config):
        kernel, cfile = transform_result
        proj = Project([kernel, cfile])
        fn = OpFunction()

        program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
示例#11
0
文件: core.py 项目: lowks/hindemith
    def finalize(self, tree, entry_type, entry_point):
        fn = DLAConcreteOCL(self.output)
        self.output = None
        self.fusable_nodes = []
        kernel = tree.files[-1]
        program = cl.clCreateProgramWithSource(fn.context,
                                               kernel.codegen()).build()
        kernel_ptr = program[kernel.body[0].name]

        return fn.finalize(tree, ct.CFUNCTYPE(*entry_type), entry_point,
                           kernel_ptr)
示例#12
0
    def finalize(self, transform_result, program_config):
        kernel, cfile = transform_result
        proj = Project([kernel, cfile])
        fn = OpFunction()

        program = cl.clCreateProgramWithSource(fn.context,
                                               kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel,
                                  cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
示例#13
0
        def compile(self):
            if self.kernel is None:
                params = set(self.sources) | set(self.sinks)
                seen_decls = set()
                seen_params = set()
                decls = []
                filtered = set()
                for param in params:
                    if param.level == 'register':
                        if param.name not in seen_decls:
                            seen_decls.add(param.name)
                            decls.append('float {}'.format(param.name))
                    else:
                        if param.name not in seen_params:
                            seen_params.add(param.name)
                            filtered.add(param)
                self.params = list(filtered)
                self.params.sort(key=lambda x: x.name)
                params = []
                sinks = set(sink.name for sink in self.sinks)
                for param in self.params:
                    if param.name in sinks:
                        str = "global float* {}".format(param.name)
                    else:
                        str = "global const float* {}".format(param.name)
                    params.append(str)
                params_str = ", ".join(params)
                decls = ";\n\t\t\t".join(decls) + ";\n"
                kernel_name = get_unique_kernel_name()
                kernel = Template("""
    __kernel void $name($params) {
        int index = get_global_id(0);
        if (index < $num_work_items) {
            $decls
$body
        }
    }
        """).substitute(name=kernel_name,
                        params=params_str,
                        body=self.body,
                        decls=decls,
                        num_work_items=self.launch_parameters[0])
                # print([p.name for p in self.params])
                print(kernel)
                self.kernel_str = kernel
                kernel = cl.clCreateProgramWithSource(
                    context, kernel).build()[kernel_name]
                kernel.argtypes = tuple(cl.cl_mem for _ in self.params)
                self.kernel = kernel
示例#14
0
    def old_transform(self, tree, program_config):
        call_args = program_config[0]

        base_size = call_args.base_shape[0] * call_args.base_shape[1]
        border = call_args.border

        body = StringTemplate("""
            void __kernel matrix_powers_copy_base_layer(__global const $type* input, __global $type* output) {
                int x = get_global_id(0);
                int y = get_global_id(1);

                output[y * $len_x + x] = input[y * $len_x + x];
            }
            void __kernel matrix_powers_compute_next_step(__global $type* matrix, const int power) {
                int x = get_global_id(0);
                int y = get_global_id(1);

                matrix[(power+1) * $base_size + y * $len_x + x] =
                    0.1f * matrix[
                        power * $base_size + clamp(y-1, $border, $len_y-$border-1) * $len_x +  clamp(x, $border, $len_x-$border-1)
                    ] +
                    0.1f * matrix[
                        power * $base_size + clamp(y+1, $border, $len_y-$border-1) * $len_x +  clamp(x, $border, $len_x-$border-1)
                    ] +
                    0.4f * matrix[
                        power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x +  clamp(x-1, $border, $len_x-$border-1)
                    ] +
                    0.4f * matrix[
                        power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x +  clamp(x+1, $border, $len_x-$border-1)
                    ] +
                    1.0f * matrix[
                        power * $base_size + clamp(y, $border, $len_y-$border-1) * $len_x +  clamp(x, $border, $len_x-$border-1)
                    ];
            }
        """, {
            'type': SymbolRef('float'),
            'len_x': Constant(call_args.base_shape[1]),
            'len_y': Constant(call_args.base_shape[0]),
            'base_size': Constant(base_size),
            'border': Constant(border),
        })

        fn = OclMatrixPowers()
        kernel = OclFile("kernel", [body])
        # print(kernel.codegen())
        program = clCreateProgramWithSource(fn.context, kernel.codegen()).build()
        ptr = program['matrix_powers_copy_base_layer']
        ptr2 = program['matrix_powers_compute_next_step']
        return fn.finalize(ptr, ptr2, (call_args.base_shape[1], call_args.base_shape[0]))
示例#15
0
        def compile(self):
            if self.kernel is None:
                params = set(self.sources) | set(self.sinks)
                seen_decls = set()
                seen_params = set()
                decls = []
                filtered = set()
                for param in params:
                    if param.level == 'register':
                        if param.name not in seen_decls:
                            seen_decls.add(param.name)
                            decls.append('float {}'.format(param.name))
                    else:
                        if param.name not in seen_params:
                            seen_params.add(param.name)
                            filtered.add(param)
                self.params = list(filtered)
                self.params.sort(key=lambda x: x.name)
                params = []
                sinks = set(sink.name for sink in self.sinks)
                for param in self.params:
                    if param.name in sinks:
                        str = "global float* {}".format(param.name)
                    else:
                        str = "global const float* {}".format(param.name)
                    params.append(str)
                params_str = ", ".join(params)
                decls = ";\n\t\t\t".join(decls) + ";\n"
                kernel_name = get_unique_kernel_name()
                kernel = Template("""
    __kernel void $name($params) {
        int index = get_global_id(0);
        if (index < $num_work_items) {
            $decls
$body
        }
    }
        """).substitute(name=kernel_name, params=params_str, body=self.body, decls=decls,
                        num_work_items=self.launch_parameters[0])
                # print([p.name for p in self.params])
                print(kernel)
                self.kernel_str = kernel
                kernel = cl.clCreateProgramWithSource(
                    context, kernel).build()[kernel_name]
                kernel.argtypes = tuple(cl.cl_mem for _ in self.params)
                self.kernel = kernel
示例#16
0
    def build_kernel(self, kernel_src, kernel_name, kernel_args):
        kernel_src = C.CFile('generated', [StringTemplate(
"""
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
"""
            ), kernel_src])
        try:
            program = cl.clCreateProgramWithSource(
                latte.config.cl_ctx, kernel_src.codegen()).build()
            kernel = program[kernel_name]
        except cl.BuildProgramFailureError as e:
            logger.error("Failed build program:\n %s", kernel_src.codegen())
            raise e
        self.kernels[kernel_name] = kernel
        for index, arg in enumerate(kernel_args):
            kernel.setarg(index, self.cl_buffers[arg], ctypes.sizeof(cl.cl_mem))
        logger.debug(kernel_src)
示例#17
0
def setup_opencl(data, cube_size):
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices = devices)
        queue = pycl.clCreateCommandQueue(ctx)

        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()

        score_matrix = program['score_matrix_to_rms']
        score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_mem, pycl.cl_mem, pycl.cl_int, pycl.cl_int)

    sub_divisions = cube_size**3

    with timeify("Creating buffers"):
        in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue, data['in_r'], blocking = blocking)
        in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue, data['in_g'], blocking = blocking)
        in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue, data['in_b'], blocking = blocking)

        out_r = data['out_r']
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue, out_r, blocking = blocking)

        score = array.array('f', [0 for x in range(sub_divisions)])
        score_buf, in_evt5 = pycl.buffer_from_pyarray(queue, score, blocking = blocking)


    with timeify("Run kernel r"):
        run_evt = score_matrix(
            #in_r_buf, in_g_buf, in_b_buf, out_r_buf, score_buf,
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for = [in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]).on(queue,
                                                                         sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, evt = pycl.buffer_to_pyarray(queue, score_buf,
                                                     wait_for=run_evt,
                                                     like=score)

    return score_from_gpu
示例#18
0
        def get_launcher(cls, sources, sinks, keyword, symbol_table):
            bottoms = sources
            concat_kern = Template("""
    __kernel void concat(global const float* bottom,
                         global float* top, int top_offset, int bot_offset) {
        int index = get_global_id(0);
        top[index + top_offset] = bottom[index + bot_offset];
    }
            """).substitute()
            program = cl.clCreateProgramWithSource(context,
                                                   concat_kern).build()
            kernel = program['concat']
            kernel.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int, cl.cl_int)

            class Launcher():
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    top = symbol_table[sinks[0].name]
                    bots = [symbol_table[b.name] for b in bottoms]
                    evts = []
                    concat_off = 0
                    for i in range(len(bottoms)):
                        count = np.prod(bots[i].shape[1:])
                        for n in range(bots[i].shape[0]):
                            top_offset = n * np.prod(
                                top.shape[1:]) + concat_off * np.prod(
                                    top.shape[2:])
                            evt = kernel(bots[i].ocl_buf, top.ocl_buf,
                                         top_offset,
                                         n * count).on(queues[n % len(queues)],
                                                       (count, ),
                                                       wait_for=wait_for)
                            evts.append(evt)
                        concat_off += bots[i].shape[1]
                    return evts

            return Launcher(sources, sinks)
示例#19
0
def ocl_init( ocl_src ):
    platforms = cl.clGetPlatformIDs()
    use_devices = None
    for platform in platforms:
        try:
            devices = cl.clGetDeviceIDs(platform,device_type=cl.CL_DEVICE_TYPE_GPU)
            use_devices = devices[0:1] # arbitraily choose first device
        except cl.DeviceNotFoundError:
            pass
        if use_devices is not None: break
    if use_devices is None: raise ValueError( "no GPU openCL device found" )
    assert use_devices is not None
    print( "OpenCL use_devices: " + str(use_devices) )

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)

    prog = cl.clCreateProgramWithSource( context, ocl_src ).build()
    print prog
    #run_mxplusb( prog, queue )
    run_conv( prog, queue )
示例#20
0
    def transform(self, tree, program_config):

        kernelFunc = program_config[0]
        kernelPath = os.path.join(os.getcwd(), "..", "templates",
                                  "trainingkernels.tmpl.c")
        kernelInserts = {
            "kernelFunc": SymbolRef(kernelFunc),
        }
        kernel = OclFile("training_kernel",
                         [FileTemplate(kernelPath, kernelInserts)])

        wrapperPath = os.path.join(os.getcwd(), "..", "templates",
                                   "ocltrain.tmpl.c")
        wrapperInserts = {
            "kernel_path": kernel.get_generated_path_ref(),
            "kernelFunc": SymbolRef(kernelFunc)
        }
        wrapper = CFile("train", [FileTemplate(wrapperPath, wrapperInserts)])
        fn = OclTrainFunction()
        program = cl.clCreateProgramWithSource(fn.context,
                                               kernel.codegen()).build()
        return fn.finalize(program, Project([kernel, wrapper]), "train")
示例#21
0
        def get_launcher(cls, sources, sinks, keyword, symbol_table):
            bottoms = sources
            concat_kern = Template("""
    __kernel void concat(global const float* bottom,
                         global float* top, int top_offset, int bot_offset) {
        int index = get_global_id(0);
        top[index + top_offset] = bottom[index + bot_offset];
    }
            """).substitute()
            program = cl.clCreateProgramWithSource(context, concat_kern).build()
            kernel = program['concat']
            kernel.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int, cl.cl_int)

            class Launcher():
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    top = symbol_table[sinks[0].name]
                    bots = [symbol_table[b.name] for b in bottoms]
                    evts = []
                    concat_off = 0
                    for i in range(len(bottoms)):
                        count = np.prod(bots[i].shape[1:])
                        for n in range(bots[i].shape[0]):
                            top_offset = n * np.prod(top.shape[1:]) + concat_off * np.prod(top.shape[2:])
                            evt = kernel(
                                bots[i].ocl_buf, top.ocl_buf, top_offset, n * count).on(
                                    queues[n % len(queues)], (count, ), wait_for=wait_for)
                            evts.append(evt)
                        concat_off += bots[i].shape[1]
                    return evts


            return Launcher(sources, sinks)
示例#22
0
def ocl_init(ocl_src):
    platforms = cl.clGetPlatformIDs()
    use_devices = None
    for platform in platforms:
        try:
            devices = cl.clGetDeviceIDs(platform, device_type=cl.CL_DEVICE_TYPE_GPU)
            use_devices = devices[0:1]  # arbitraily choose first device
        except cl.DeviceNotFoundError:
            pass
        if use_devices is not None:
            break
    if use_devices is None:
        raise ValueError("no GPU openCL device found")
    assert use_devices is not None
    print ("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)

    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    print prog
    # run_mxplusb( prog, queue )
    run_conv(prog, queue)
示例#23
0
 def transform(self, tree, program_config):
     # TODO: Have to flip indices, figure out why
     arg_cfg, tune_cfg = program_config
     output_name = unique_name()
     params = [
         SymbolRef(self.array_name, POINTER(c_float)(), _global=True,
                   _const=True),
         SymbolRef(arg_cfg[0][0], POINTER(c_float)(), _global=True,
                   _const=True),
         SymbolRef(output_name, POINTER(c_float)(), _global=True)
     ]
     defn = []
     defn.extend([
         Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d))
         for d in range(len(arg_cfg[0][2]))
     ])
     index = StringTemplate('element_id1 * $len_x + element_id0',
                            {'len_x': Constant(arg_cfg[0][2][1])})
     defn.append(
         Assign(
             ArrayRef(SymbolRef(params[-1].name), index),
             tree(
                 ArrayRef(SymbolRef(params[0].name), index),
                 ArrayRef(SymbolRef(params[1].name), index),
             )
         )
     )
     entry_point = unique_kernel_name()
     tree = FunctionDecl(None, entry_point, params, defn)
     tree.set_kernel()
     fn = ArrayOpConcrete(self.array, self.generate_output(output_name))
     kernel = OclFile("kernel", [tree])
     program = clCreateProgramWithSource(
         fn.context, kernel.codegen()
     ).build()
     ptr = program[entry_point]
     return fn.finalize(ptr, (arg_cfg[0][2][1], arg_cfg[0][2][0]))
示例#24
0
    def visit_KernelCall(self, node):
        args = []
        for i, arg in enumerate(node.args):
            size = SizeOf(SymbolRef(arg.name))
            setter = clSetKernelArg(node.name, i, size, Ref(SymbolRef(arg.name)))
            setter.lift(params=arg._lift_params)
            args.append(setter)

        kernel_decl = SymbolRef(node.name, cl.cl_kernel())
        kernel_symbol = kernel_decl.copy()
        call = clEnqueueNDRangeKernel(node.location.symbol.copy(), kernel_symbol, work_dim=Constant(1), global_size=node.global_size, local_size=node.local_size)

        kernel = RefConverter().visit(node.kernel)
        for param in kernel.params:
            param.type = param.type.ptr_type
        kernel.defn.insert(0, Assign(SymbolRef("i", c_int()), get_global_id(0)))
        kernel_src = kernel.codegen()
        call.body.append(CppComment(kernel_src))

        context = node.location.queue.context
        kernel_ptr = cl.clCreateProgramWithSource(context, kernel_src).build()[node.name]
        call.lift(params=[(kernel_decl, kernel_ptr)])

        return args + [call]
示例#25
0
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        kernel_h, kernel_w = keywords['kernel_size']
        pad_h, pad_w = keywords['padding']
        stride_h, stride_w = keywords['stride']
        num, channels, height, width = symbol_table[sources[0]].shape
        channels_col = channels * kernel_h * kernel_w
        height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
        width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
        col_data = hmarray((channels_col, height_col * width_col))
        bias_multiplier = hmarray(
            (1, np.prod(symbol_table[sinks[0]].shape[2:])))
        bias_multiplier.fill(1.0)
        bias_multiplier.sync_ocl()

        im2col_global_size = channels * height_col * width_col
        col2im_global_size = channels * height * width

        kernels = Template("""
__kernel void col2im(global float* data_col, global float* data_im,
                     int im_offset) {
  if (get_global_id(0) < $col2im_global_size) {
    int index = get_global_id(0);
    float val = 0;
    int w = index % $width + $pad_w;
    int h = (index / $width) % $height + $pad_h;
    int c = index / ($width * $height);
    // compute the start and end of the output
    int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1;
    int w_col_end = min(w / $stride_w + 1, $width_col);
    int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1;
    int h_col_end = min(h / $stride_h + 1, $height_col);
    // equivalent implementation
    int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \
          $height_col * $width_col;
    int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \
          $width_col;
    int coeff_w_col = (1 - $stride_w * $height_col * $width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[im_offset + index] = val;
  }
}
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int w_out = index % $width_col;
    int h_index = index / $width_col;
    int h_out = h_index % $height_col;
    int channel_in = h_index / $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col,
                width_col=width_col, col2im_global_size=col2im_global_size)

        program = cl.clCreateProgramWithSource(context, kernels).build()
        im2col = program['im2col']
        im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
        col2im = program['col2im']
        col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

        class ConvLauncher(object):
            def __init__(self, sources, sinks):
                self.sources = [ast.Name(s, ast.Load()) for s in sources]
                self.sinks = [ast.Name(s, ast.Load()) for s in sinks]

            def compile(self):
                pass

            def launch(self, symbol_table, wait_for):
                queue = queues[0]
                bottom = symbol_table[sources[0]]
                bot_offset = np.prod(bottom.shape[1:])
                top_diff = symbol_table[sources[1]]
                top_offset = np.prod(top_diff.shape[1:])
                weights = symbol_table[sources[2]]
                bottom_diff = symbol_table[sinks[0]]
                bottom_diff.fill(0)
                bottom_diff.sync_ocl()
                weights_diff = symbol_table[sinks[1]]
                weights_diff.fill(0)
                weights_diff.sync_ocl()
                bias_diff = symbol_table[sinks[2]]
                bias_diff.fill(0)
                bias_diff.sync_ocl()
                for i in range(bottom.shape[0]):
                    n = np.prod(top_diff.shape[2:])
                    sgemv(False, top_diff.shape[1],
                          n, 1.0, top_diff, i *
                          top_offset, n, bias_multiplier, 0, 1, 1.0,
                          bias_diff, 0, 1)
                    im2col(bottom.ocl_buf, col_data.ocl_buf, i
                           * bot_offset).on(queue, im2col_global_size)
                    m = top_diff.shape[1]
                    n = col_data.shape[0]
                    k = col_data.shape[1]

                    sgemm(False, True, 1.0, top_diff, i *
                          top_offset, k, col_data, 0, k, 1.0,
                          weights_diff, 0, n, m, n, k)

                    m = weights.shape[1]
                    n = col_data.shape[1]
                    k = weights.shape[0]

                    sgemm(True, False, 1.0, weights, 0, m,
                          top_diff, i * top_offset, n, 0.0,
                          col_data, 0, n,
                          m, n, k)
                    col2im(col_data.ocl_buf,
                           bottom_diff.ocl_buf, i *
                           bot_offset).on(queue, col2im_global_size)

        return ConvLauncher(sources, sinks)
示例#26
0
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        bottom = symbol_table[sources[0]]
        num = bottom.shape[0]
        channels = bottom.shape[1]
        scale_shape = list(bottom.shape)
        scale_shape[1] = 1
        scale = hmarray(tuple(scale_shape))
        loss = hmarray(scale_shape)
        spatial_dim = int(np.prod(bottom.shape[2:]))
        count = np.prod(bottom.shape)

        kernels = Template("""
// @begin=cl@
__kernel void kernel_copy(global const float* data,
                          global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_channel_max(global const float* data,
                                 global float* out) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
__kernel void kernel_channel_subtract(global const float* channel_max,
                                      global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
__kernel void kernel_exp(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = exp(data[index]);
  }
}
__kernel void kernel_channel_sum(global const float* data,
                                 global float* channel_sum) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
__kernel void kernel_channel_div(global const float* channel_sum,
                                 global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
__kernel void SoftmaxLossForward(global const float* prob_data,
    global const float* label, global float* loss) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    const int n = index / $spatial_dim;
    const int s = index % $spatial_dim;
    const int label_value = (int) label[n * $spatial_dim + s];
    loss[index] = -log(
        max(prob_data[n * $dim + label_value * $spatial_dim + s],
            FLT_MIN));
  }
}
// @end=cl@
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))

        program = cl.clCreateProgramWithSource(context, kernels).build()
        copy_kern = program['kernel_copy']
        copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        max_kern = program['kernel_channel_max']
        max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        sub_kern = program['kernel_channel_subtract']
        sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        exp_kern = program['kernel_exp']
        exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        sum_kern = program['kernel_channel_sum']
        sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        div_kern = program['kernel_channel_div']
        div_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        loss_forward = program['SoftmaxLossForward']
        loss_forward.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem)

        class SoftmaxLauncher(object):
            def compile(self):
                pass

            def launch(self, symbol_table):
                bottom = symbol_table[sources[0]]
                label = symbol_table[sources[1]]
                prob = symbol_table[sources[2]]
                top = symbol_table[sinks[0]]
                copy_kern(bottom.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                max_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                         (num * spatial_dim, ))
                sub_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                exp_kern(prob.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                sum_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                         (num * spatial_dim,))
                div_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                loss_forward(prob.ocl_buf, label.ocl_buf,
                             loss.ocl_buf).on(queue, (num * spatial_dim, ))
                loss.sync_host()
                top[0] = np.sum(loss) / np.float32(num)
                top.sync_ocl()

        return SoftmaxLauncher()
示例#27
0
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        kernel_h, kernel_w = keywords['kernel_size']
        pad_h, pad_w = keywords['padding']
        stride_h, stride_w = keywords['stride']
        num, channels, height, width = symbol_table[sources[0]].shape
        channels_col = channels * kernel_h * kernel_w
        height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
        width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
        col_data = hmarray((channels_col, height_col * width_col))
        bias_multiplier = hmarray(
            (1, np.prod(symbol_table[sinks[0]].shape[2:])))
        bias_multiplier.fill(1.0)
        bias_multiplier.sync_ocl()

        im2col_global_size = channels * height_col * width_col
        col2im_global_size = channels * height * width

        kernels = Template("""
__kernel void col2im(global float* data_col, global float* data_im,
                     int im_offset) {
  if (get_global_id(0) < $col2im_global_size) {
    int index = get_global_id(0);
    float val = 0;
    int w = index % $width + $pad_w;
    int h = (index / $width) % $height + $pad_h;
    int c = index / ($width * $height);
    // compute the start and end of the output
    int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1;
    int w_col_end = min(w / $stride_w + 1, $width_col);
    int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1;
    int h_col_end = min(h / $stride_h + 1, $height_col);
    // equivalent implementation
    int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \
          $height_col * $width_col;
    int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \
          $width_col;
    int coeff_w_col = (1 - $stride_w * $height_col * $width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[im_offset + index] = val;
  }
}
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int w_out = index % $width_col;
    int h_index = index / $width_col;
    int h_out = h_index % $height_col;
    int channel_in = h_index / $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size,
                stride_h=stride_h,
                stride_w=stride_w,
                pad_h=pad_h,
                pad_w=pad_w,
                kernel_h=kernel_h,
                kernel_w=kernel_w,
                width=width,
                height=height,
                height_col=height_col,
                width_col=width_col,
                col2im_global_size=col2im_global_size)

        program = cl.clCreateProgramWithSource(context, kernels).build()
        im2col = program['im2col']
        im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
        col2im = program['col2im']
        col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

        class ConvLauncher(object):
            def __init__(self, sources, sinks):
                self.sources = [ast.Name(s, ast.Load()) for s in sources]
                self.sinks = [ast.Name(s, ast.Load()) for s in sinks]

            def compile(self):
                pass

            def launch(self, symbol_table, wait_for):
                queue = queues[0]
                bottom = symbol_table[sources[0]]
                bot_offset = np.prod(bottom.shape[1:])
                top_diff = symbol_table[sources[1]]
                top_offset = np.prod(top_diff.shape[1:])
                weights = symbol_table[sources[2]]
                bottom_diff = symbol_table[sinks[0]]
                bottom_diff.fill(0)
                bottom_diff.sync_ocl()
                weights_diff = symbol_table[sinks[1]]
                weights_diff.fill(0)
                weights_diff.sync_ocl()
                bias_diff = symbol_table[sinks[2]]
                bias_diff.fill(0)
                bias_diff.sync_ocl()
                for i in range(bottom.shape[0]):
                    n = np.prod(top_diff.shape[2:])
                    sgemv(False, top_diff.shape[1], n, 1.0, top_diff,
                          i * top_offset, n, bias_multiplier, 0, 1, 1.0,
                          bias_diff, 0, 1)
                    im2col(bottom.ocl_buf, col_data.ocl_buf,
                           i * bot_offset).on(queue, im2col_global_size)
                    m = top_diff.shape[1]
                    n = col_data.shape[0]
                    k = col_data.shape[1]

                    sgemm(False, True, 1.0, top_diff, i * top_offset, k,
                          col_data, 0, k, 1.0, weights_diff, 0, n, m, n, k)

                    m = weights.shape[1]
                    n = col_data.shape[1]
                    k = weights.shape[0]

                    sgemm(True, False, 1.0, weights, 0, m, top_diff,
                          i * top_offset, n, 0.0, col_data, 0, n, m, n, k)
                    col2im(col_data.ocl_buf, bottom_diff.ocl_buf,
                           i * bot_offset).on(queue, col2im_global_size)

        return ConvLauncher(sources, sinks)
示例#28
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[
                sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [
                    hmarray((channels_col, height_col * width_col))
                    for _ in range(len(queues))
                ]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size,
                    stride_h=stride_h,
                    stride_w=stride_w,
                    pad_h=pad_h,
                    pad_w=pad_w,
                    kernel_h=kernel_h,
                    kernel_w=kernel_w,
                    width=width,
                    height=height,
                    height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(context,
                                                  im2col).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        bottom,
                                        i * bot_offset,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            evt = im2col(bottom.ocl_buf,
                                         col_datas[i % len(queues)].ocl_buf,
                                         i * bot_offset).on(
                                             queues[i % len(queues)],
                                             (padded, ),
                                             wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        col_datas[i % len(queues)],
                                        0,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)

            return ConvLauncher(sources, sinks)
示例#29
0
    def finalize(self, transform_result, program_config):
        project = Project(transform_result)
        arg_config, tuner_config = program_config

        self.output = self.generate_output(program_config)
        param_types = [
            np.ctypeslib.ndpointer(arg.dtype, arg.ndim, arg.shape)
            for arg in arg_config + (self.output, )
        ]
        if self.backend == StencilOclTransformer:
            entry_point = "stencil_control"
            param_types.append(param_types[0])
            entry_type = [c_int32, cl.cl_command_queue, cl.cl_kernel]
            if self.kernel.is_copied:
                for _ in range(self.kernel.dim):
                    entry_type.append(cl.cl_kernel)
            entry_type.extend(cl_mem for _ in range(len(arg_config) + 1))
            entry_type = CFUNCTYPE(*entry_type)
        else:
            entry_point = "stencil_kernel"
            param_types.append(POINTER(c_float))
            entry_type = CFUNCTYPE(c_int32, *param_types)

        if self.backend == StencilOclTransformer:
            concrete_function = OclStencilFunction()
            if self.kernel.is_copied:
                args = [
                    project, entry_type, entry_point,
                ]
                kernels = []
                for index, kernel in enumerate(project.find_all(OclFile)):
                    # print("XXX index {} kernel {}".format(index, kernel.name))
                    print("Kernel Codegen\n".format(kernel.codegen()))
                    program = clCreateProgramWithSource(
                        concrete_function.context, kernel.codegen()).build()
                    if index == 0:
                        ocl_kernel_name = 'stencil_kernel'
                    else:
                        ocl_kernel_name = kernel.name
                    kernel_ptr = program[ocl_kernel_name]
                    kernels.append(kernel_ptr)
                args.append(kernels)
                args.append(self.output)

                finalized = concrete_function.finalize(*args)
            else:
                kernel = project.find(OclFile)
                program = clCreateProgramWithSource(concrete_function.context,
                                                    kernel.codegen()).build()
                stencil_kernel_ptr = program['stencil_kernel']
                finalized = concrete_function.finalize(
                    project, entry_type, entry_point,
                    stencil_kernel_ptr,
                    self.output
                )
        else:
            concrete_function = ConcreteStencil()
            finalized = concrete_function.finalize(project, entry_point,
                                                   entry_type, self.output)
        self.output = None
        self.fusable_nodes = []
        return finalized

        concrete_function = ConcreteStencil()
        return concrete_function.finalize(entry_point, project, entry_type)
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    __kernel void kernel_copy(global const float* data,
                              global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = data[index];
      }
    }
    __kernel void kernel_channel_max(global const float* data,
                                     global float* out) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    __kernel void kernel_channel_subtract(global const float* channel_max,
                                          global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    __kernel void kernel_exp(global const float* data, global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = exp(data[index]);
      }
    }
    __kernel void kernel_channel_sum(global const float* data,
                                     global float* channel_sum) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    __kernel void kernel_channel_div(global const float* channel_sum,
                                     global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count, num_times_spatial=num * spatial_dim,
                    channels=channels, spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))

            program = cl.clCreateProgramWithSource(context, kernels).build()
            copy_kern = program['kernel_copy']
            copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            max_kern = program['kernel_channel_max']
            max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sub_kern = program['kernel_channel_subtract']
            sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            exp_kern = program['kernel_exp']
            exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sum_kern = program['kernel_channel_sum']
            sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            div_kern = program['kernel_channel_div']
            div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sources

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    if count % 16:
                        padded_count = (count + 15) & (~15)
                    else:
                        padded_count = count
                    num_times_spatial = num * spatial_dim
                    if num_times_spatial % 16:
                        padded_num_times_spatial = (num_times_spatial + 15) & (~15)
                    else:
                        padded_num_times_spatial = num_times_spatial
                    evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count,), wait_for=wait_for)
                    evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = sub_kern(scale.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    evt = exp_kern(top.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = div_kern(scale.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    return [evt]

            return SoftmaxLauncher(sources, sinks)
示例#31
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [hmarray((channels_col, height_col * width_col))
                            for _ in range(len(queues))]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size, stride_h=stride_h,
                    stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                    kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                    height=height, height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(
                context, im2col
            ).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        bottom, i * bot_offset, n, 0.0,
                                        top, i * top_offset, n, m, n,
                                        k, queues[i % len(queues)], wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0, top, i *
                                        top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            evt = im2col(bottom.ocl_buf,
                                        col_datas[i % len(queues)].ocl_buf,
                                        i * bot_offset
                                        ).on(queues[i % len(queues)], (padded, ),
                                            wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        col_datas[i % len(queues)],
                                        0, n, 0.0, top, i * top_offset, n, m, n,
                                        k, queues[i % len(queues)], wait_for=evt)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0, top, i *
                                        top_offset, n, m, n, 1, queues[i % len(queues)], wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)
            return ConvLauncher(sources, sinks)
示例#32
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    __kernel void kernel_copy(global const float* data,
                              global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = data[index];
      }
    }
    __kernel void kernel_channel_max(global const float* data,
                                     global float* out) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    __kernel void kernel_channel_subtract(global const float* channel_max,
                                          global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    __kernel void kernel_exp(global const float* data, global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = exp(data[index]);
      }
    }
    __kernel void kernel_channel_sum(global const float* data,
                                     global float* channel_sum) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    __kernel void kernel_channel_div(global const float* channel_sum,
                                     global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count,
                    num_times_spatial=num * spatial_dim,
                    channels=channels,
                    spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))

            program = cl.clCreateProgramWithSource(context, kernels).build()
            copy_kern = program['kernel_copy']
            copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            max_kern = program['kernel_channel_max']
            max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sub_kern = program['kernel_channel_subtract']
            sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            exp_kern = program['kernel_exp']
            exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sum_kern = program['kernel_channel_sum']
            sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            div_kern = program['kernel_channel_div']
            div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sources

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    if count % 16:
                        padded_count = (count + 15) & (~15)
                    else:
                        padded_count = count
                    num_times_spatial = num * spatial_dim
                    if num_times_spatial % 16:
                        padded_num_times_spatial = (num_times_spatial +
                                                    15) & (~15)
                    else:
                        padded_num_times_spatial = num_times_spatial
                    evt = copy_kern(bottom.ocl_buf,
                                    top.ocl_buf).on(queue, (padded_count, ),
                                                    wait_for=wait_for)
                    evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = sub_kern(scale.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    evt = exp_kern(top.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = div_kern(scale.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    return [evt]

            return SoftmaxLauncher(sources, sinks)
示例#33
0
    def transform(self, tree, program_config):
        call_args = program_config[0]

        base_size = call_args.base_shape[0] * call_args.base_shape[1]
        border = call_args.border

        c_float_type = c_float
        c_int_type = c_int

        transformer = PyBasicConversions()

        output = unique_name()

        init_entry_point = unique_kernel_name()
        init_params = [
            SymbolRef('input', POINTER(c_float_type)(), _global=True, _const=True),
            SymbolRef(output, POINTER(c_float_type)(), _global=True),
        ]

        init_defn = []
        init_defn.extend([
            Assign(SymbolRef('x', c_int()), get_global_id(0)),
            Assign(SymbolRef('y', c_int()), get_global_id(1)),
        ])

        body = """{output}[y * {len_x} + x] = input[y * {len_x} + x]""".format(
            output=output, len_x=call_args.base_shape[0]
        )
        print(body)
        tree_body = ast.parse(body).body

        init_defn.extend(tree_body)

        init_tree = FunctionDecl(None, init_entry_point, init_params, init_defn)
        init_tree.set_kernel()
        init_kernel = OclFile('kernel', [init_tree])
        init_kernel = transformer.visit(init_kernel)
        print("init kernel codegen")
        print(init_kernel.codegen())

        compute_entry_point = unique_kernel_name()
        compute_params = [
            SymbolRef(output, POINTER(c_float_type)(), _global=True),
            SymbolRef('power', c_int(), _const=True),
        ]
        compute_defn = []
        compute_defn.extend([
            Assign(SymbolRef('x', c_int()), get_global_id(0)),
            Assign(SymbolRef('y', c_int()), get_global_id(1)),
        ])

        body = """{matrix}[(power+1) * {base_size} + y * {len_x} + x] =
                    0.1 * {matrix}[
                        power * {base_size} + clamp(y-1, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ] +
                    0.1 * {matrix}[
                        power * {base_size} + clamp(y+1, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ] +
                    0.4 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x-1, {border}, {len_x}-{border}-1)
                    ] +
                    0.4 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x+1, {border}, {len_x}-{border}-1)
                    ] +
                    1.0 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ]
        """.format(
            matrix=output,
            base_size=base_size,
            len_y=call_args.base_shape[0],
            len_x=call_args.base_shape[1],
            border=border,
        )

        body = re.sub("""\s\s*""", " ", body)
        print(body)
        tree_body = ast.parse(body).body

        compute_defn.extend(tree_body)

        compute_tree = FunctionDecl(None, compute_entry_point, compute_params, compute_defn)
        compute_tree.set_kernel()
        compute_kernel = OclFile('kernel', [compute_tree])
        compute_kernel = transformer.visit(compute_kernel)
        print("compute kernel codegen")
        print(compute_kernel.codegen())


        fn = OclMatrixPowers()
        init_program = clCreateProgramWithSource(fn.context, init_kernel.codegen()).build()
        init_ptr = init_program[init_entry_point]

        compute_program = clCreateProgramWithSource(fn.context, compute_kernel.codegen()).build()
        compute_ptr = compute_program[compute_entry_point]

        return fn.finalize(init_ptr, compute_ptr, (call_args.base_shape[1], call_args.base_shape[0]))
示例#34
0
    def transform(self, tree, program_config):
        A = program_config[0]
        len_A = np.prod(A.shape)
        inner_type = A.dtype.type()
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type
        apply_one.params[1].type = inner_type


        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("len", ct.c_int())
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),                          # getting the group id for this work group
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),                        # getting the global id for this work item
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),                          # getting the local id for this work item
                                        For(Assign(SymbolRef('i', ct.c_int()), Constant(1)),                                # for(int i=1; i<WORK_GROUP_SIZE; i *= 2)
                                            Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)),                                  
                                            MulAssign(SymbolRef('i'), Constant(2)),
                                            [
                                                If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))),     # if statement checks 
                                                          Constant(0)),
                                                       Lt(Add(SymbolRef('globalId'), SymbolRef('i')),
                                                          SymbolRef("len"))),
                                                   [
                                                       Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')),
                                                              FunctionCall(SymbolRef('apply'),
                                                                           [
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        SymbolRef('globalId')),
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        Add(SymbolRef('globalId'),
                                                                                            SymbolRef('i')))
                                                                           ])),
                                                   ]
                                                ),
                                                FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')])
                                            ]
                                        ),
                                        If(Eq(SymbolRef('localId'), Constant(0)),
                                           [
                                               Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')),
                                                      ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
                                           ]
                                        )
                                    ]
        ).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $n;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            for (int runs = 0; runs < $run_limit ; runs++){
                clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
                clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
                clSetKernelArg(kernel, 2, sizeof(intptr_t), &len);
                clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
                swap = buf;
                buf = out_buf;
                out_buf = swap;
                len  = len/local + (len % local != 0);
            }
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)),
              'length': Constant(len_A),
              'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE)))
        })

        proj = Project([kernel, CFile("generated", [control])])
        fn = ConcreteXorReduction()

        program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
示例#35
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            num, channels, height, width = symbol_table[sources[0].name].shape
            local_size = keywords['local_size']
            alpha = keywords['alpha']
            k = keywords['k']
            beta = keywords['beta']
            compute_global = (num * channels * height * width, )
            fill_global = (num * height * width, )
            kernel = Template("""
    // @begin=cl@
    __kernel void LRNFillScale(global const float* in, global float* scale) {
      if (get_global_id(0) < $fill_global) {
        int index = get_global_id(0);
        int w = index % $width;
        int h = (index / $width) % $height;
        int n = index / $width / $height;
        int offset = (n * $channels * $height + h) * $width + w;
        int step = $height * $width;
        in += offset;
        scale += offset;
        int head = 0;
        int pre_pad = ($local_size - 1) / 2;
        int post_pad = $local_size - pre_pad - 1;
        float accum_scale = 0;
        // fill the scale at [n, :, h, w]
        // accumulate values
        while (head < post_pad && head < $channels) {
          accum_scale += in[head * step] * in[head * step];
          ++head;
        }
        // both add and subtract
        while (head < $channels) {
          accum_scale += in[head * step] * in[head * step];
          if (head - $local_size >= 0) {
            accum_scale -= in[(head - $local_size) * step] * \
                in[(head - $local_size) * step];
          }
          scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size;
          ++head;
        }
        // subtract only
        while (head < $channels + post_pad) {
          if (head - $local_size >= 0) {
            accum_scale -= in[(head - $local_size) * step] * \
                in[(head - $local_size) * step];
          }
          scale[(head - post_pad) * step] = $k + accum_scale * $alpha_over_size;
          ++head;
        }
      }
    }
    __kernel void LRNComputeOutput(global const float* in,
                                   global const float* scale,
                                   global float* out) {
      if (get_global_id(0) < $compute_global) {
        int index = get_global_id(0);
        out[index] = in[index] * pow(scale[index], (float)$negative_beta);
      }
    }
    // @end=cl@
    """).substitute(width=width, height=height, channels=channels,
                    local_size=local_size,
                    alpha_over_size=float(alpha) / local_size,
                    k=k, negative_beta=-beta, fill_global=fill_global[0],
                    compute_global=compute_global[0])
            program = cl.clCreateProgramWithSource(
                context, kernel
            ).build()
            fill_kern = program['LRNFillScale']
            fill_kern.argtypes = (cl.cl_mem, cl.cl_mem)

            compute_kern = program['LRNComputeOutput']
            compute_kern.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem)

            class LrnLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    scale = symbol_table[sinks[1].name]
                    if fill_global[0] % 16:
                        padded = (fill_global[0] + 15) & (~15)
                    else:
                        padded = fill_global[0]
                    evt = fill_kern(bottom.ocl_buf, scale.ocl_buf).on(queue, (padded,), wait_for=wait_for)
                    if compute_global[0] % 16:
                        padded = (compute_global[0] + 15) & (~15)
                    else:
                        padded = compute_global[0]
                    evt = compute_kern(bottom.ocl_buf, scale.ocl_buf,
                                       top.ocl_buf).on(queue, (padded,), wait_for=evt)
                    return [evt]
            return LrnLauncher(sources, sinks)