Exemplo n.º 1
0
    def gen_index_in_bounds_conditional(self, body, is_low_side=True):
        """
        provide bounds checking if the virtual grid size differs from grid size
        :param body:
        :return:
        """
        def is_conditional_required_for(d):
            if self.virtual_global_size[d] == self.global_size[d]:
                return False
            if d == self.dimension and is_low_side:
                return False
            return True

        conditional = None
        for dim in range(self.dimensions):
            if is_conditional_required_for(dim):
                if conditional is None:
                    conditional = Lt(get_global_id(dim), Constant(self.global_size[dim]))
                else:
                    conditional = And(conditional, Lt(get_global_id(dim), Constant(self.global_size[dim])))

        if conditional is None:
            return body
        else:
            return If(conditional, body)
Exemplo n.º 2
0
 def visit_SymbolRef(self, node):
     if node.name == self.loop_var:
         index = get_global_id(self.ndim - 1)
         for d in reversed(range(self.ndim - 1)):
             index = Add(
                 Mul(
                     index,
                     Constant(self.shape[d])
                 ), get_global_id(d)
             )
         return index
     return node
Exemplo n.º 3
0
    def transform(self, tree, program_config):
        arg_cfg = program_config[0]
        self.entry_point = unique_kernel_name()
        ctypeObject = c_float()
        ctype = c_float
        len_x = arg_cfg[0][1][0]
        len_y = arg_cfg[0][1][1]
        output = unique_name()
        params = [
            SymbolRef("input", POINTER(ctype)(), _global=True, _const=True),
            SymbolRef(output, POINTER(ctype)(), _global=True)
        ]
        defn = []
        defn.extend([
            Assign(SymbolRef('x', c_int()), get_global_id(0)),
            Assign(SymbolRef('y', c_int()), get_global_id(1)),
            Assign(SymbolRef('temp', ctypeObject), Constant(0)),
        ])
        body = \
            """
temp = .5 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y +
                  clamp(y/2, 0, (len_y / 2) - 1)]
if (x & 0x1):
    temp += .25 * input[clamp(x/2 + 1, 0, (len_x / 2) - 1) * len_y +
                        clamp(y/2, 0, (len_y /  2) - 1)]
else:
    temp += .25 * input[clamp(x/2 - 1, 0, (len_x / 2) - 1) * len_y +
                        clamp(y/2, 0, (len_y / 2) - 1)]
if (y & 0x1):
    temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y +
                        clamp(y/2 + 1, 0, (len_y / 2) - 1)]
else:
    temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) *len_y +
                        clamp(y/2 - 1, 0, (len_y / 2) - 1)]
output[x * len_y + y] = temp
"""
        body = ast.parse(body).body
        name_dict = {
            'output': output
        }
        const_dict = {
            'len_x': len_x,
            'len_y': len_y,
        }
        transformation = PyBasicConversions(name_dict, const_dict)
        defn.extend(body)
        tree = FunctionDecl(None, self.entry_point, params, defn)
        tree.set_kernel()
        kernel = OclFile("kernel", [tree])
        kernel = transformation.visit(kernel)
        return kernel
Exemplo n.º 4
0
 def gen_global_index(self):
     dim = self.output_grid.ndim
     index = get_global_id(dim - 1)
     for d in reversed(range(dim - 1)):
         stride = self.output_grid.strides[d] // \
             self.output_grid.itemsize
         index = Add(
             index,
             Mul(
                 get_global_id(d),
                 Constant(stride)
             )
         )
     return index
Exemplo n.º 5
0
 def get_semantic_tree(self, arg, output_name):
     params = [
         SymbolRef(self.array_name, POINTER(c_float)(), _global=True,
                   _const=True),
         SymbolRef(arg.name, POINTER(c_float)(), _global=True, _const=True),
         SymbolRef(output_name, POINTER(c_float)(), _global=True)
     ]
     defn = []
     defn.extend([
         Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d))
         for d in range(len(arg.data.shape))
     ])
     index = StringTemplate('element_id1 * $len_x + element_id0',
                            {'len_x': Constant(arg.data.shape[1])})
     defn.append(
         Assign(
             ArrayRef(SymbolRef(params[-1].name), index),
             self.original_tree(
                 ArrayRef(SymbolRef(params[0].name), index),
                 ArrayRef(SymbolRef(params[1].name), index),
                 )
         )
     )
     entry_point = unique_kernel_name()
     tree = FunctionDecl(None, entry_point, params, defn)
     tree.set_kernel()
     kernel = OclFile("kernel", [tree])
     return kernel
Exemplo n.º 6
0
 def gen_global_index(self):
     dim = self.dimensions
     index = get_global_id(dim - 1)
     for d in reversed(range(dim - 1)):
         stride = self.grid.strides[d] // \
             self.grid.itemsize
         index = Add(
             index,
             Mul(
                 Add(
                     get_global_id(d),
                     Constant(self.global_offset[d])
                 ),
                 Constant(stride)
             )
         )
     return index
Exemplo n.º 7
0
 def gen_global_index_with_halo_offset(self):
     dim = self.dimensions
     index = get_global_id(dim - 1)
     if dim - 1 == self.dimension:
         index = Add(index, Constant(self.shape[dim-1] - self.halo[dim-1]))
     for d in reversed(range(dim - 1)):
         stride = self.grid.strides[d] // \
             self.grid.itemsize
         add_amount = Add(get_global_id(d), Constant(self.global_offset[d]))
         if d == self.dimension:
             add_amount = Add(add_amount, Constant(self.shape[self.dimension] - self.halo[self.dimension]))
         index = Add(
             index,
             Mul(
                 add_amount,
                 Constant(stride)
             )
         )
     return index
Exemplo n.º 8
0
 def transform(self, tree, program_config):
     # TODO: Have to flip indices, figure out why
     arg_cfg, tune_cfg = program_config
     output_name = unique_name()
     params = [
         SymbolRef(self.array_name, POINTER(c_float)(), _global=True,
                   _const=True),
         SymbolRef(arg_cfg[0][0], POINTER(c_float)(), _global=True,
                   _const=True),
         SymbolRef(output_name, POINTER(c_float)(), _global=True)
     ]
     defn = []
     defn.extend([
         Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d))
         for d in range(len(arg_cfg[0][2]))
     ])
     index = StringTemplate('element_id1 * $len_x + element_id0',
                            {'len_x': Constant(arg_cfg[0][2][1])})
     defn.append(
         Assign(
             ArrayRef(SymbolRef(params[-1].name), index),
             tree(
                 ArrayRef(SymbolRef(params[0].name), index),
                 ArrayRef(SymbolRef(params[1].name), index),
             )
         )
     )
     entry_point = unique_kernel_name()
     tree = FunctionDecl(None, entry_point, params, defn)
     tree.set_kernel()
     fn = ArrayOpConcrete(self.array, self.generate_output(output_name))
     kernel = OclFile("kernel", [tree])
     program = clCreateProgramWithSource(
         fn.context, kernel.codegen()
     ).build()
     ptr = program[entry_point]
     return fn.finalize(ptr, (arg_cfg[0][2][1], arg_cfg[0][2][0]))
Exemplo n.º 9
0
    def transform(self, tree, program_config):
        A = program_config[0]
        len_A = np.prod(A.shape)
        inner_type = A.dtype.type()
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type
        apply_one.params[1].type = inner_type


        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("len", ct.c_int())
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),                          # getting the group id for this work group
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),                        # getting the global id for this work item
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),                          # getting the local id for this work item
                                        For(Assign(SymbolRef('i', ct.c_int()), Constant(1)),                                # for(int i=1; i<WORK_GROUP_SIZE; i *= 2)
                                            Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)),                                  
                                            MulAssign(SymbolRef('i'), Constant(2)),
                                            [
                                                If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))),     # if statement checks 
                                                          Constant(0)),
                                                       Lt(Add(SymbolRef('globalId'), SymbolRef('i')),
                                                          SymbolRef("len"))),
                                                   [
                                                       Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')),
                                                              FunctionCall(SymbolRef('apply'),
                                                                           [
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        SymbolRef('globalId')),
                                                                               ArrayRef(SymbolRef('A'),
                                                                                        Add(SymbolRef('globalId'),
                                                                                            SymbolRef('i')))
                                                                           ])),
                                                   ]
                                                ),
                                                FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')])
                                            ]
                                        ),
                                        If(Eq(SymbolRef('localId'), Constant(0)),
                                           [
                                               Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')),
                                                      ArrayRef(SymbolRef('A'), SymbolRef('globalId')))
                                           ]
                                        )
                                    ]
        ).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $n;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            for (int runs = 0; runs < $run_limit ; runs++){
                clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
                clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
                clSetKernelArg(kernel, 2, sizeof(intptr_t), &len);
                clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
                swap = buf;
                buf = out_buf;
                out_buf = swap;
                len  = len/local + (len % local != 0);
            }
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)),
              'length': Constant(len_A),
              'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE)))
        })

        proj = Project([kernel, CFile("generated", [control])])
        fn = ConcreteXorReduction()

        program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
Exemplo n.º 10
0
    def transform(self, tree, program_config):
        # TODO: Have to flip indices, figure out why
        arg_cfg = program_config[0]

        input_name = arg_cfg[0][0]
        self.output_name = unique_name()
        params = [
            SymbolRef(
                input_name, POINTER(c_float)(), _global=True, _const=True),
            SymbolRef(self.output_name, POINTER(c_float)(), _global=True)
        ]
        defn = []
        defn.extend([
            Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d))
            for d in range(len(arg_cfg[0][2]))
        ])
        out_index = StringTemplate('element_id1 * $len_x + element_id0',
                                   {'len_x': Constant(arg_cfg[0][2][1])})
        defn.append(Assign(
            ArrayRef(SymbolRef(self.output_name), out_index),
            Div(
                Add(
                    ArrayRef(
                        SymbolRef(input_name),
                        StringTemplate(
                            '(element_id1 * 2) * $len_x + (element_id0 * 2)',
                            {'len_x': Constant(arg_cfg[0][2][1])})
                    ),
                    Add(
                        ArrayRef(
                            SymbolRef(input_name),
                            StringTemplate(
                                '(element_id1 * 2) * $len_x + \
                                 (element_id0 * 2 + 1)',
                                {'len_x': Constant(arg_cfg[0][2][1])})
                        ),
                        Add(
                            ArrayRef(
                                SymbolRef(input_name),
                                StringTemplate(
                                    '(element_id1 * 2 + 1) * $len_x + \
                                     (element_id0 * 2 + 1)',
                                    {'len_x': Constant(arg_cfg[0][2][1])})
                            ),
                            Add(
                                ArrayRef(
                                    SymbolRef(input_name),
                                    StringTemplate(
                                        '(element_id1 * 2 + 1) * $len_x + \
                                         (element_id0 * 2)',
                                        {'len_x': Constant(arg_cfg[0][2][1])})
                                ),
                            )
                        )
                    )
                ),
                Constant(4.0)
            )
        ))

        self.entry_point = unique_kernel_name()
        tree = FunctionDecl(None, self.entry_point, params, defn)
        tree.set_kernel()
        kernel = OclFile("kernel", [tree])
        return kernel
Exemplo n.º 11
0
    def visit_InteriorPointsLoop(self, node):
        dim = len(self.output_grid.shape)
        self.kernel_target = node.target
        condition = And(
            Lt(get_global_id(0),
               Constant(self.arg_cfg[0].shape[0] - self.ghost_depth[0])),
            GtE(get_global_id(0),
                Constant(self.ghost_depth[0]))
        )
        for d in range(1, len(self.arg_cfg[0].shape)):
            condition = And(
                condition,
                And(
                    Lt(get_global_id(d),
                       Constant(self.arg_cfg[0].shape[d] -
                                self.ghost_depth[d])),
                    GtE(get_global_id(d),
                        Constant(self.ghost_depth[d]))
                )
            )
        body = []

        self.macro_defns = [
            CppDefine("local_array_macro", ["d%d" % i for i in range(dim)],
                      self.gen_local_macro()),
            CppDefine("global_array_macro", ["d%d" % i for i in range(dim)],
                      self.gen_global_macro())
        ]
        body.extend(self.macro_defns)

        global_idx = 'global_index'
        self.output_index = global_idx
        body.append(Assign(SymbolRef('global_index', ct.c_int()),
                    self.gen_global_index()))

        self.load_mem_block = self.load_shared_memory_block(
            self.local_block, self.ghost_depth)
        body.extend(self.load_mem_block)
        body.append(FunctionCall(SymbolRef("barrier"),
                                 [SymbolRef("CLK_LOCAL_MEM_FENCE")]))
        for d in range(0, dim):
            body.append(Assign(SymbolRef('local_id%d' % d, ct.c_int()),
                               Add(get_local_id(d),
                                   Constant(self.ghost_depth[d]))))
            self.var_list.append("local_id%d" % d)

        for child in map(self.visit, node.body):
            if isinstance(child, list):
                self.stencil_op.extend(child)
            else:
                self.stencil_op.append(child)

        conditional = None
        for dim in range(len(self.output_grid.shape)):
            if self.virtual_global_size[dim] != self.global_size[dim]:
                if conditional is None:
                    conditional = Lt(get_global_id(dim),
                                     Constant(self.global_size[dim]))
                else:
                    conditional = And(conditional,
                                      Lt(get_global_id(dim),
                                         Constant(self.global_size[dim])))

        if conditional is not None:
            body.append(If(conditional, self.stencil_op))
        else:
            body.extend(self.stencil_op)

        # this does not help fix the failure
        # body.append(FunctionCall(SymbolRef("barrier"),
        #                          [SymbolRef("CLK_GLOBAL_MEM_FENCE")]))
        # body.extend(self.stencil_op)
        #
        # this line does seem to fix the problem, seems to suggest some timing
        # issue
        #
        # body.append(If(conditional,
        #                [StringTemplate("out_grid[global_index]+=0;")]))
        #
        # the following fixes the problem too, suggests timing issues
        #
        # body.append(FunctionCall(SymbolRef("printf"), [String("gid %d\\n"),
        #                                                SymbolRef("global_index")]))
        # from ctree.ocl.macros import get_group_id
        # body.append(
        #     FunctionCall(
        #         SymbolRef("printf"),
        #         [
        #             String("group_id %2d %2d gid %2d %2d %2d\\n"),
        #             get_global_id(0),
        #             get_group_id(1),
        #             get_global_id(0),
        #             get_global_id(1),
        #             SymbolRef('global_index'),
        #         ]
        #     )
        # )
        return body
Exemplo n.º 12
0
    def transform(self, tree, program_config):
        call_args = program_config[0]

        base_size = call_args.base_shape[0] * call_args.base_shape[1]
        border = call_args.border

        c_float_type = c_float
        c_int_type = c_int

        transformer = PyBasicConversions()

        output = unique_name()

        init_entry_point = unique_kernel_name()
        init_params = [
            SymbolRef('input', POINTER(c_float_type)(), _global=True, _const=True),
            SymbolRef(output, POINTER(c_float_type)(), _global=True),
        ]

        init_defn = []
        init_defn.extend([
            Assign(SymbolRef('x', c_int()), get_global_id(0)),
            Assign(SymbolRef('y', c_int()), get_global_id(1)),
        ])

        body = """{output}[y * {len_x} + x] = input[y * {len_x} + x]""".format(
            output=output, len_x=call_args.base_shape[0]
        )
        print(body)
        tree_body = ast.parse(body).body

        init_defn.extend(tree_body)

        init_tree = FunctionDecl(None, init_entry_point, init_params, init_defn)
        init_tree.set_kernel()
        init_kernel = OclFile('kernel', [init_tree])
        init_kernel = transformer.visit(init_kernel)
        print("init kernel codegen")
        print(init_kernel.codegen())

        compute_entry_point = unique_kernel_name()
        compute_params = [
            SymbolRef(output, POINTER(c_float_type)(), _global=True),
            SymbolRef('power', c_int(), _const=True),
        ]
        compute_defn = []
        compute_defn.extend([
            Assign(SymbolRef('x', c_int()), get_global_id(0)),
            Assign(SymbolRef('y', c_int()), get_global_id(1)),
        ])

        body = """{matrix}[(power+1) * {base_size} + y * {len_x} + x] =
                    0.1 * {matrix}[
                        power * {base_size} + clamp(y-1, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ] +
                    0.1 * {matrix}[
                        power * {base_size} + clamp(y+1, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ] +
                    0.4 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x-1, {border}, {len_x}-{border}-1)
                    ] +
                    0.4 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x+1, {border}, {len_x}-{border}-1)
                    ] +
                    1.0 * {matrix}[
                        power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} +  clamp(x, {border}, {len_x}-{border}-1)
                    ]
        """.format(
            matrix=output,
            base_size=base_size,
            len_y=call_args.base_shape[0],
            len_x=call_args.base_shape[1],
            border=border,
        )

        body = re.sub("""\s\s*""", " ", body)
        print(body)
        tree_body = ast.parse(body).body

        compute_defn.extend(tree_body)

        compute_tree = FunctionDecl(None, compute_entry_point, compute_params, compute_defn)
        compute_tree.set_kernel()
        compute_kernel = OclFile('kernel', [compute_tree])
        compute_kernel = transformer.visit(compute_kernel)
        print("compute kernel codegen")
        print(compute_kernel.codegen())


        fn = OclMatrixPowers()
        init_program = clCreateProgramWithSource(fn.context, init_kernel.codegen()).build()
        init_ptr = init_program[init_entry_point]

        compute_program = clCreateProgramWithSource(fn.context, compute_kernel.codegen()).build()
        compute_ptr = compute_program[compute_entry_point]

        return fn.finalize(init_ptr, compute_ptr, (call_args.base_shape[1], call_args.base_shape[0]))
Exemplo n.º 13
0
    def transform(self, tree, program_config):
        arg_cfg = program_config[0]

        len_x = Constant(arg_cfg[0][1][0])
        len_y = Constant(arg_cfg[0][1][1])

        self.entry_point = unique_kernel_name()

        body = FunctionDecl(
            None,
            self.entry_point,
            [
                SymbolRef('input', POINTER(c_float)(), _global=True,
                          _const=True),
                SymbolRef('u', POINTER(c_float)(), _global=True, _const=True),
                SymbolRef('v', POINTER(c_float)(), _global=True, _const=True),
                SymbolRef('output', POINTER(c_float)(), _global=True)
            ],
            [
                Assign(SymbolRef('x', c_int()), get_global_id(0)),
                Assign(SymbolRef('y', c_int()), get_global_id(1)),
                Assign(
                    SymbolRef('my_x', c_int()),
                    Cast(c_int(),
                         ArrayRef(SymbolRef('u'),
                                  Add(SymbolRef('x'),
                                      Mul(SymbolRef('y'), len_x))))),
                Assign(
                    SymbolRef('my_y', c_int()),
                    Cast(c_int(),
                         ArrayRef(SymbolRef('v'),
                                  Add(SymbolRef('x'),
                                      Mul(SymbolRef('y'), len_x))))),
                Assign(
                    SymbolRef('xfrac', c_float()),
                    Sub(ArrayRef(SymbolRef('u'),
                                 Add(SymbolRef('x'),
                                     Mul(len_x, SymbolRef('y')))),
                        SymbolRef('my_x'))),
                Assign(
                    SymbolRef('yfrac', c_float()),
                    Sub(ArrayRef(SymbolRef('v'),
                                 Add(SymbolRef('x'),
                                     Mul(len_x, SymbolRef('y')))),
                        SymbolRef('my_y'))),
                If(Lt(
                    ArrayRef(SymbolRef('u'),
                             Add(SymbolRef('x'),
                                 Mul(len_x, SymbolRef('y')))),
                    Constant(0.0)),
                    [
                        PostDec(SymbolRef('my_x')),
                        Assign(SymbolRef('xfrac'),
                               Add(Constant(1.0),
                                   SymbolRef('xfrac')))
                    ]),
                If(Lt(
                    ArrayRef(SymbolRef('v'),
                             Add(SymbolRef('x'),
                                 Mul(len_x, SymbolRef('y')))),
                    Constant(0.0)),
                    [
                        PostDec(SymbolRef('my_y')),
                        Assign(SymbolRef('yfrac'),
                               Add(Constant(1.0),
                                   SymbolRef('yfrac')))
                    ]),
                Assign(SymbolRef('tmp', c_float()), Constant(0.0)),
                If(
                    And(
                        And(
                            GtE(Add(SymbolRef('x'),
                                    SymbolRef('my_x')), Constant(0)),
                            Lt(Add(SymbolRef('x'),
                                   Add(SymbolRef('my_x'), Constant(1))), len_x)
                        ),
                        And(
                            GtE(Add(SymbolRef('y'),
                                    SymbolRef('my_y')), Constant(0)),
                            Lt(Add(SymbolRef('y'),
                                   Add(SymbolRef('my_y'), Constant(1))), len_y)
                        )
                    ),
                    [
                        AddAssign(
                            SymbolRef('tmp'),
                            Mul(
                                Mul(
                                    ArrayRef(
                                        SymbolRef('input'),
                                        Add(
                                            Add(
                                                SymbolRef('x'),
                                                SymbolRef('my_x')),
                                            Mul(
                                                len_x,
                                                Add(SymbolRef('y'),
                                                    SymbolRef('my_y'))))),
                                    Sub(Constant(1.0), SymbolRef('xfrac'))),
                                Sub(Constant(1.0), SymbolRef('yfrac')))),
                        AddAssign(
                            SymbolRef('tmp'),
                            Mul(
                                Mul(
                                    ArrayRef(
                                        SymbolRef('input'),
                                        Add(
                                            Add(
                                                Add(SymbolRef('x'),
                                                    SymbolRef('my_x')),
                                                Constant(1)),
                                            Mul(
                                                len_x,
                                                Add(SymbolRef('y'),
                                                    SymbolRef('my_y'))))),
                                    SymbolRef('xfrac')),
                                Sub(Constant(1.0), SymbolRef('yfrac')))),
                        AddAssign(
                            SymbolRef('tmp'),
                            Mul(
                                Mul(
                                    ArrayRef(
                                        SymbolRef('input'),
                                        Add(
                                            Add(
                                                SymbolRef('x'),
                                                SymbolRef('my_x')),
                                            Mul(
                                                len_x,
                                                Add(Add(SymbolRef('y'),
                                                        SymbolRef('my_y')),
                                                    Constant(1))))),
                                    Sub(Constant(1.0), SymbolRef('xfrac'))),
                                SymbolRef('yfrac'))),
                        AddAssign(
                            SymbolRef('tmp'),
                            Mul(
                                Mul(ArrayRef(
                                    SymbolRef('input'),
                                    Add(
                                        Add(
                                            Add(SymbolRef('x'),
                                                SymbolRef('my_x')),
                                            Constant(1)),
                                        Mul(
                                            len_x,
                                            Add(Add(SymbolRef('y'),
                                                    SymbolRef('my_y')),
                                                Constant(1))))),
                                    SymbolRef('xfrac')),
                                SymbolRef('yfrac'))),
                        ],
                    Assign(
                        SymbolRef('tmp'),
                        ArrayRef(
                            SymbolRef('input'),
                            Add(
                                FunctionCall(
                                    SymbolRef('clamp'),
                                    [
                                        Add(SymbolRef('x'), SymbolRef('my_x')),
                                        Constant(0),
                                        Sub(len_x, Constant(1))
                                    ]
                                ),
                                Mul(
                                    len_x,
                                    FunctionCall(SymbolRef('clamp'), [
                                        Add(SymbolRef('y'), SymbolRef('my_y')),
                                        Constant(0),
                                        Sub(len_y, Constant(1))
                                    ]
                                    ),
                                    )
                            )
                        )
                    )

                ),
                Assign(
                    ArrayRef(SymbolRef('output'),
                             Add(SymbolRef('x'),
                                 Mul(len_x, SymbolRef('y')))),
                    SymbolRef('tmp')
                )
            ]
        )

        body.set_kernel()
        kernel = OclFile("kernel", [body])
        return kernel
Exemplo n.º 14
0
    def transform(self, tree, program_config):
        dirname = self.config_to_dirname(program_config)
        A = program_config[0]
        len_A = np.prod(A.shape)
        data_type = get_c_type_from_numpy_dtype(A.dtype)        # Get the ctype class for the data type for the parameters
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree).find(FunctionDecl)
          
        apply_one.name = 'apply'                                # Naming our kernel method

        # Assigning a data_type instance for the  #
        # return type, and the parameter types... #
        apply_one.return_type = data_type()                     
        apply_one.params[0].type = data_type()
        apply_one.params[1].type = data_type()

        responsible_size = int(len_A / WORK_GROUP_SIZE)         # Get the appropriate number of threads for parallelizing
        
        # Creating our controller function (called "apply_kernel") to control #
        # the parallelizing of our computation, using ctree syntax...         #
        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("localData", pointer()).set_local()
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
                                        Assign(SymbolRef('localResult', (ct.c_int() if A.dtype is np.int32 else ct.c_float())),
                                               ArrayRef(SymbolRef('A'), SymbolRef('globalId'))
                                               ),
                                        For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)), Lt(SymbolRef('offset'), Constant(responsible_size)),
                                            PostInc(SymbolRef('offset')),
                                            [
                                                Assign(SymbolRef('localResult'),
                                                       FunctionCall(apply_one.name, [SymbolRef('localResult'),
                                                                              ArrayRef(SymbolRef('A'),
                                                                                       Add(SymbolRef('globalId'),
                                                                                           Mul(SymbolRef('offset'),
                                                                                               Constant(WORK_GROUP_SIZE))))])
                                                       ),
                                            ]
                                        ),
                                            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                                                SymbolRef('localResult')
                                               ),
                                            barrier(CLK_LOCAL_MEM_FENCE()),
                                        If(Eq(SymbolRef('globalId'), Constant(0)),
                                           [
                                                Assign(SymbolRef('localResult'), FunctionCall(SymbolRef(apply_one.name), [SymbolRef('localResult'),
                                                                                                                   ArrayRef(SymbolRef('localData'),Constant(x))]))
                                                for x in range(1, WORK_GROUP_SIZE)
                                           ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))]
                                        )
                                    ]
        ).set_kernel()

        # Hardcoded OpenCL code to compensate to begin execution of parallelized computation 
        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
              'length': Constant(len_A),
        })

        ocl_kernel = OclFile("kernel", [apply_one, apply_kernel])
        c_controller = CFile("generated", [control])
        return [ocl_kernel, c_controller]
Exemplo n.º 15
0
    def transform(self, tree, program_config):
        A = program_config[0]
        len_A = np.prod(A.shape)
        inner_type = get_c_type_from_numpy_dtype(A.dtype)()
        pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape)
        apply_one = PyBasicConversions().visit(tree.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type
        apply_one.params[1].type = inner_type
        responsible_size = int(len_A / WORK_GROUP_SIZE)
        apply_kernel = FunctionDecl(None, "apply_kernel",
                                    params=[SymbolRef("A", pointer()).set_global(),
                                            SymbolRef("output_buf", pointer()).set_global(),
                                            SymbolRef("localData", pointer()).set_local()
                                    ],
                                    defn=[
                                        Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)),
                                        Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)),
                                        Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)),
                                        Assign(SymbolRef('localResult', ct.c_int()),
                                               ArrayRef(SymbolRef('A'), SymbolRef('globalId'))
                                               )
                                        ] +
                                        [Assign(SymbolRef('localResult'),
                                                FunctionCall(SymbolRef('apply'),
                                                             [SymbolRef('localResult'), ArrayRef(SymbolRef('A'),Add(SymbolRef('globalId'), Constant(i * WORK_GROUP_SIZE)))]))
                                            for i in range(1, responsible_size)] +
                                        [
                                            Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')),
                                                SymbolRef('localResult')
                                               ),
                                            barrier(CLK_LOCAL_MEM_FENCE()),
                                        If(Eq(SymbolRef('globalId'), Constant(0)),
                                           [
                                                Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'),
                                                                                                                   ArrayRef(SymbolRef('localData'),Constant(x))]))
                                                for x in range(1, WORK_GROUP_SIZE)
                                           ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))]
                                        )
                                    ]
        ).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif

        #include <stdio.h>

        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) {
            size_t global = $local;
            size_t local = $local;
            intptr_t len = $length;
            cl_mem swap;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf);
            clSetKernelArg(kernel, 2, local * sizeof(int), NULL);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
        }
        """, {'local': Constant(WORK_GROUP_SIZE),
              'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2),
              'length': Constant(len_A)
        })

        c_controller = CFile("generated", [control])
        return [kernel, c_controller]