def gen_index_in_bounds_conditional(self, body, is_low_side=True): """ provide bounds checking if the virtual grid size differs from grid size :param body: :return: """ def is_conditional_required_for(d): if self.virtual_global_size[d] == self.global_size[d]: return False if d == self.dimension and is_low_side: return False return True conditional = None for dim in range(self.dimensions): if is_conditional_required_for(dim): if conditional is None: conditional = Lt(get_global_id(dim), Constant(self.global_size[dim])) else: conditional = And(conditional, Lt(get_global_id(dim), Constant(self.global_size[dim]))) if conditional is None: return body else: return If(conditional, body)
def visit_SymbolRef(self, node): if node.name == self.loop_var: index = get_global_id(self.ndim - 1) for d in reversed(range(self.ndim - 1)): index = Add( Mul( index, Constant(self.shape[d]) ), get_global_id(d) ) return index return node
def transform(self, tree, program_config): arg_cfg = program_config[0] self.entry_point = unique_kernel_name() ctypeObject = c_float() ctype = c_float len_x = arg_cfg[0][1][0] len_y = arg_cfg[0][1][1] output = unique_name() params = [ SymbolRef("input", POINTER(ctype)(), _global=True, _const=True), SymbolRef(output, POINTER(ctype)(), _global=True) ] defn = [] defn.extend([ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), Assign(SymbolRef('temp', ctypeObject), Constant(0)), ]) body = \ """ temp = .5 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y + clamp(y/2, 0, (len_y / 2) - 1)] if (x & 0x1): temp += .25 * input[clamp(x/2 + 1, 0, (len_x / 2) - 1) * len_y + clamp(y/2, 0, (len_y / 2) - 1)] else: temp += .25 * input[clamp(x/2 - 1, 0, (len_x / 2) - 1) * len_y + clamp(y/2, 0, (len_y / 2) - 1)] if (y & 0x1): temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) * len_y + clamp(y/2 + 1, 0, (len_y / 2) - 1)] else: temp += .25 * input[clamp(x/2, 0, (len_x / 2) - 1) *len_y + clamp(y/2 - 1, 0, (len_y / 2) - 1)] output[x * len_y + y] = temp """ body = ast.parse(body).body name_dict = { 'output': output } const_dict = { 'len_x': len_x, 'len_y': len_y, } transformation = PyBasicConversions(name_dict, const_dict) defn.extend(body) tree = FunctionDecl(None, self.entry_point, params, defn) tree.set_kernel() kernel = OclFile("kernel", [tree]) kernel = transformation.visit(kernel) return kernel
def gen_global_index(self): dim = self.output_grid.ndim index = get_global_id(dim - 1) for d in reversed(range(dim - 1)): stride = self.output_grid.strides[d] // \ self.output_grid.itemsize index = Add( index, Mul( get_global_id(d), Constant(stride) ) ) return index
def get_semantic_tree(self, arg, output_name): params = [ SymbolRef(self.array_name, POINTER(c_float)(), _global=True, _const=True), SymbolRef(arg.name, POINTER(c_float)(), _global=True, _const=True), SymbolRef(output_name, POINTER(c_float)(), _global=True) ] defn = [] defn.extend([ Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d)) for d in range(len(arg.data.shape)) ]) index = StringTemplate('element_id1 * $len_x + element_id0', {'len_x': Constant(arg.data.shape[1])}) defn.append( Assign( ArrayRef(SymbolRef(params[-1].name), index), self.original_tree( ArrayRef(SymbolRef(params[0].name), index), ArrayRef(SymbolRef(params[1].name), index), ) ) ) entry_point = unique_kernel_name() tree = FunctionDecl(None, entry_point, params, defn) tree.set_kernel() kernel = OclFile("kernel", [tree]) return kernel
def gen_global_index(self): dim = self.dimensions index = get_global_id(dim - 1) for d in reversed(range(dim - 1)): stride = self.grid.strides[d] // \ self.grid.itemsize index = Add( index, Mul( Add( get_global_id(d), Constant(self.global_offset[d]) ), Constant(stride) ) ) return index
def gen_global_index_with_halo_offset(self): dim = self.dimensions index = get_global_id(dim - 1) if dim - 1 == self.dimension: index = Add(index, Constant(self.shape[dim-1] - self.halo[dim-1])) for d in reversed(range(dim - 1)): stride = self.grid.strides[d] // \ self.grid.itemsize add_amount = Add(get_global_id(d), Constant(self.global_offset[d])) if d == self.dimension: add_amount = Add(add_amount, Constant(self.shape[self.dimension] - self.halo[self.dimension])) index = Add( index, Mul( add_amount, Constant(stride) ) ) return index
def transform(self, tree, program_config): # TODO: Have to flip indices, figure out why arg_cfg, tune_cfg = program_config output_name = unique_name() params = [ SymbolRef(self.array_name, POINTER(c_float)(), _global=True, _const=True), SymbolRef(arg_cfg[0][0], POINTER(c_float)(), _global=True, _const=True), SymbolRef(output_name, POINTER(c_float)(), _global=True) ] defn = [] defn.extend([ Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d)) for d in range(len(arg_cfg[0][2])) ]) index = StringTemplate('element_id1 * $len_x + element_id0', {'len_x': Constant(arg_cfg[0][2][1])}) defn.append( Assign( ArrayRef(SymbolRef(params[-1].name), index), tree( ArrayRef(SymbolRef(params[0].name), index), ArrayRef(SymbolRef(params[1].name), index), ) ) ) entry_point = unique_kernel_name() tree = FunctionDecl(None, entry_point, params, defn) tree.set_kernel() fn = ArrayOpConcrete(self.array, self.generate_output(output_name)) kernel = OclFile("kernel", [tree]) program = clCreateProgramWithSource( fn.context, kernel.codegen() ).build() ptr = program[entry_point] return fn.finalize(ptr, (arg_cfg[0][2][1], arg_cfg[0][2][0]))
def transform(self, tree, program_config): A = program_config[0] len_A = np.prod(A.shape) inner_type = A.dtype.type() pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_one.params[1].type = inner_type apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("len", ct.c_int()) ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), # getting the group id for this work group Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), # getting the global id for this work item Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), # getting the local id for this work item For(Assign(SymbolRef('i', ct.c_int()), Constant(1)), # for(int i=1; i<WORK_GROUP_SIZE; i *= 2) Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)), MulAssign(SymbolRef('i'), Constant(2)), [ If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))), # if statement checks Constant(0)), Lt(Add(SymbolRef('globalId'), SymbolRef('i')), SymbolRef("len"))), [ Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')), FunctionCall(SymbolRef('apply'), [ ArrayRef(SymbolRef('A'), SymbolRef('globalId')), ArrayRef(SymbolRef('A'), Add(SymbolRef('globalId'), SymbolRef('i'))) ])), ] ), FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')]) ] ), If(Eq(SymbolRef('localId'), Constant(0)), [ Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')), ArrayRef(SymbolRef('A'), SymbolRef('globalId'))) ] ) ] ).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $n; size_t local = $local; intptr_t len = $length; cl_mem swap; for (int runs = 0; runs < $run_limit ; runs++){ clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, sizeof(intptr_t), &len); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); swap = buf; buf = out_buf; out_buf = swap; len = len/local + (len % local != 0); } } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)), 'length': Constant(len_A), 'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE))) }) proj = Project([kernel, CFile("generated", [control])]) fn = ConcreteXorReduction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def transform(self, tree, program_config): # TODO: Have to flip indices, figure out why arg_cfg = program_config[0] input_name = arg_cfg[0][0] self.output_name = unique_name() params = [ SymbolRef( input_name, POINTER(c_float)(), _global=True, _const=True), SymbolRef(self.output_name, POINTER(c_float)(), _global=True) ] defn = [] defn.extend([ Assign(SymbolRef('element_id%d' % d, c_int()), get_global_id(d)) for d in range(len(arg_cfg[0][2])) ]) out_index = StringTemplate('element_id1 * $len_x + element_id0', {'len_x': Constant(arg_cfg[0][2][1])}) defn.append(Assign( ArrayRef(SymbolRef(self.output_name), out_index), Div( Add( ArrayRef( SymbolRef(input_name), StringTemplate( '(element_id1 * 2) * $len_x + (element_id0 * 2)', {'len_x': Constant(arg_cfg[0][2][1])}) ), Add( ArrayRef( SymbolRef(input_name), StringTemplate( '(element_id1 * 2) * $len_x + \ (element_id0 * 2 + 1)', {'len_x': Constant(arg_cfg[0][2][1])}) ), Add( ArrayRef( SymbolRef(input_name), StringTemplate( '(element_id1 * 2 + 1) * $len_x + \ (element_id0 * 2 + 1)', {'len_x': Constant(arg_cfg[0][2][1])}) ), Add( ArrayRef( SymbolRef(input_name), StringTemplate( '(element_id1 * 2 + 1) * $len_x + \ (element_id0 * 2)', {'len_x': Constant(arg_cfg[0][2][1])}) ), ) ) ) ), Constant(4.0) ) )) self.entry_point = unique_kernel_name() tree = FunctionDecl(None, self.entry_point, params, defn) tree.set_kernel() kernel = OclFile("kernel", [tree]) return kernel
def visit_InteriorPointsLoop(self, node): dim = len(self.output_grid.shape) self.kernel_target = node.target condition = And( Lt(get_global_id(0), Constant(self.arg_cfg[0].shape[0] - self.ghost_depth[0])), GtE(get_global_id(0), Constant(self.ghost_depth[0])) ) for d in range(1, len(self.arg_cfg[0].shape)): condition = And( condition, And( Lt(get_global_id(d), Constant(self.arg_cfg[0].shape[d] - self.ghost_depth[d])), GtE(get_global_id(d), Constant(self.ghost_depth[d])) ) ) body = [] self.macro_defns = [ CppDefine("local_array_macro", ["d%d" % i for i in range(dim)], self.gen_local_macro()), CppDefine("global_array_macro", ["d%d" % i for i in range(dim)], self.gen_global_macro()) ] body.extend(self.macro_defns) global_idx = 'global_index' self.output_index = global_idx body.append(Assign(SymbolRef('global_index', ct.c_int()), self.gen_global_index())) self.load_mem_block = self.load_shared_memory_block( self.local_block, self.ghost_depth) body.extend(self.load_mem_block) body.append(FunctionCall(SymbolRef("barrier"), [SymbolRef("CLK_LOCAL_MEM_FENCE")])) for d in range(0, dim): body.append(Assign(SymbolRef('local_id%d' % d, ct.c_int()), Add(get_local_id(d), Constant(self.ghost_depth[d])))) self.var_list.append("local_id%d" % d) for child in map(self.visit, node.body): if isinstance(child, list): self.stencil_op.extend(child) else: self.stencil_op.append(child) conditional = None for dim in range(len(self.output_grid.shape)): if self.virtual_global_size[dim] != self.global_size[dim]: if conditional is None: conditional = Lt(get_global_id(dim), Constant(self.global_size[dim])) else: conditional = And(conditional, Lt(get_global_id(dim), Constant(self.global_size[dim]))) if conditional is not None: body.append(If(conditional, self.stencil_op)) else: body.extend(self.stencil_op) # this does not help fix the failure # body.append(FunctionCall(SymbolRef("barrier"), # [SymbolRef("CLK_GLOBAL_MEM_FENCE")])) # body.extend(self.stencil_op) # # this line does seem to fix the problem, seems to suggest some timing # issue # # body.append(If(conditional, # [StringTemplate("out_grid[global_index]+=0;")])) # # the following fixes the problem too, suggests timing issues # # body.append(FunctionCall(SymbolRef("printf"), [String("gid %d\\n"), # SymbolRef("global_index")])) # from ctree.ocl.macros import get_group_id # body.append( # FunctionCall( # SymbolRef("printf"), # [ # String("group_id %2d %2d gid %2d %2d %2d\\n"), # get_global_id(0), # get_group_id(1), # get_global_id(0), # get_global_id(1), # SymbolRef('global_index'), # ] # ) # ) return body
def transform(self, tree, program_config): call_args = program_config[0] base_size = call_args.base_shape[0] * call_args.base_shape[1] border = call_args.border c_float_type = c_float c_int_type = c_int transformer = PyBasicConversions() output = unique_name() init_entry_point = unique_kernel_name() init_params = [ SymbolRef('input', POINTER(c_float_type)(), _global=True, _const=True), SymbolRef(output, POINTER(c_float_type)(), _global=True), ] init_defn = [] init_defn.extend([ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), ]) body = """{output}[y * {len_x} + x] = input[y * {len_x} + x]""".format( output=output, len_x=call_args.base_shape[0] ) print(body) tree_body = ast.parse(body).body init_defn.extend(tree_body) init_tree = FunctionDecl(None, init_entry_point, init_params, init_defn) init_tree.set_kernel() init_kernel = OclFile('kernel', [init_tree]) init_kernel = transformer.visit(init_kernel) print("init kernel codegen") print(init_kernel.codegen()) compute_entry_point = unique_kernel_name() compute_params = [ SymbolRef(output, POINTER(c_float_type)(), _global=True), SymbolRef('power', c_int(), _const=True), ] compute_defn = [] compute_defn.extend([ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), ]) body = """{matrix}[(power+1) * {base_size} + y * {len_x} + x] = 0.1 * {matrix}[ power * {base_size} + clamp(y-1, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] + 0.1 * {matrix}[ power * {base_size} + clamp(y+1, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] + 0.4 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x-1, {border}, {len_x}-{border}-1) ] + 0.4 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x+1, {border}, {len_x}-{border}-1) ] + 1.0 * {matrix}[ power * {base_size} + clamp(y, {border}, {len_y}-{border}-1) * {len_x} + clamp(x, {border}, {len_x}-{border}-1) ] """.format( matrix=output, base_size=base_size, len_y=call_args.base_shape[0], len_x=call_args.base_shape[1], border=border, ) body = re.sub("""\s\s*""", " ", body) print(body) tree_body = ast.parse(body).body compute_defn.extend(tree_body) compute_tree = FunctionDecl(None, compute_entry_point, compute_params, compute_defn) compute_tree.set_kernel() compute_kernel = OclFile('kernel', [compute_tree]) compute_kernel = transformer.visit(compute_kernel) print("compute kernel codegen") print(compute_kernel.codegen()) fn = OclMatrixPowers() init_program = clCreateProgramWithSource(fn.context, init_kernel.codegen()).build() init_ptr = init_program[init_entry_point] compute_program = clCreateProgramWithSource(fn.context, compute_kernel.codegen()).build() compute_ptr = compute_program[compute_entry_point] return fn.finalize(init_ptr, compute_ptr, (call_args.base_shape[1], call_args.base_shape[0]))
def transform(self, tree, program_config): arg_cfg = program_config[0] len_x = Constant(arg_cfg[0][1][0]) len_y = Constant(arg_cfg[0][1][1]) self.entry_point = unique_kernel_name() body = FunctionDecl( None, self.entry_point, [ SymbolRef('input', POINTER(c_float)(), _global=True, _const=True), SymbolRef('u', POINTER(c_float)(), _global=True, _const=True), SymbolRef('v', POINTER(c_float)(), _global=True, _const=True), SymbolRef('output', POINTER(c_float)(), _global=True) ], [ Assign(SymbolRef('x', c_int()), get_global_id(0)), Assign(SymbolRef('y', c_int()), get_global_id(1)), Assign( SymbolRef('my_x', c_int()), Cast(c_int(), ArrayRef(SymbolRef('u'), Add(SymbolRef('x'), Mul(SymbolRef('y'), len_x))))), Assign( SymbolRef('my_y', c_int()), Cast(c_int(), ArrayRef(SymbolRef('v'), Add(SymbolRef('x'), Mul(SymbolRef('y'), len_x))))), Assign( SymbolRef('xfrac', c_float()), Sub(ArrayRef(SymbolRef('u'), Add(SymbolRef('x'), Mul(len_x, SymbolRef('y')))), SymbolRef('my_x'))), Assign( SymbolRef('yfrac', c_float()), Sub(ArrayRef(SymbolRef('v'), Add(SymbolRef('x'), Mul(len_x, SymbolRef('y')))), SymbolRef('my_y'))), If(Lt( ArrayRef(SymbolRef('u'), Add(SymbolRef('x'), Mul(len_x, SymbolRef('y')))), Constant(0.0)), [ PostDec(SymbolRef('my_x')), Assign(SymbolRef('xfrac'), Add(Constant(1.0), SymbolRef('xfrac'))) ]), If(Lt( ArrayRef(SymbolRef('v'), Add(SymbolRef('x'), Mul(len_x, SymbolRef('y')))), Constant(0.0)), [ PostDec(SymbolRef('my_y')), Assign(SymbolRef('yfrac'), Add(Constant(1.0), SymbolRef('yfrac'))) ]), Assign(SymbolRef('tmp', c_float()), Constant(0.0)), If( And( And( GtE(Add(SymbolRef('x'), SymbolRef('my_x')), Constant(0)), Lt(Add(SymbolRef('x'), Add(SymbolRef('my_x'), Constant(1))), len_x) ), And( GtE(Add(SymbolRef('y'), SymbolRef('my_y')), Constant(0)), Lt(Add(SymbolRef('y'), Add(SymbolRef('my_y'), Constant(1))), len_y) ) ), [ AddAssign( SymbolRef('tmp'), Mul( Mul( ArrayRef( SymbolRef('input'), Add( Add( SymbolRef('x'), SymbolRef('my_x')), Mul( len_x, Add(SymbolRef('y'), SymbolRef('my_y'))))), Sub(Constant(1.0), SymbolRef('xfrac'))), Sub(Constant(1.0), SymbolRef('yfrac')))), AddAssign( SymbolRef('tmp'), Mul( Mul( ArrayRef( SymbolRef('input'), Add( Add( Add(SymbolRef('x'), SymbolRef('my_x')), Constant(1)), Mul( len_x, Add(SymbolRef('y'), SymbolRef('my_y'))))), SymbolRef('xfrac')), Sub(Constant(1.0), SymbolRef('yfrac')))), AddAssign( SymbolRef('tmp'), Mul( Mul( ArrayRef( SymbolRef('input'), Add( Add( SymbolRef('x'), SymbolRef('my_x')), Mul( len_x, Add(Add(SymbolRef('y'), SymbolRef('my_y')), Constant(1))))), Sub(Constant(1.0), SymbolRef('xfrac'))), SymbolRef('yfrac'))), AddAssign( SymbolRef('tmp'), Mul( Mul(ArrayRef( SymbolRef('input'), Add( Add( Add(SymbolRef('x'), SymbolRef('my_x')), Constant(1)), Mul( len_x, Add(Add(SymbolRef('y'), SymbolRef('my_y')), Constant(1))))), SymbolRef('xfrac')), SymbolRef('yfrac'))), ], Assign( SymbolRef('tmp'), ArrayRef( SymbolRef('input'), Add( FunctionCall( SymbolRef('clamp'), [ Add(SymbolRef('x'), SymbolRef('my_x')), Constant(0), Sub(len_x, Constant(1)) ] ), Mul( len_x, FunctionCall(SymbolRef('clamp'), [ Add(SymbolRef('y'), SymbolRef('my_y')), Constant(0), Sub(len_y, Constant(1)) ] ), ) ) ) ) ), Assign( ArrayRef(SymbolRef('output'), Add(SymbolRef('x'), Mul(len_x, SymbolRef('y')))), SymbolRef('tmp') ) ] ) body.set_kernel() kernel = OclFile("kernel", [body]) return kernel
def transform(self, tree, program_config): dirname = self.config_to_dirname(program_config) A = program_config[0] len_A = np.prod(A.shape) data_type = get_c_type_from_numpy_dtype(A.dtype) # Get the ctype class for the data type for the parameters pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree).find(FunctionDecl) apply_one.name = 'apply' # Naming our kernel method # Assigning a data_type instance for the # # return type, and the parameter types... # apply_one.return_type = data_type() apply_one.params[0].type = data_type() apply_one.params[1].type = data_type() responsible_size = int(len_A / WORK_GROUP_SIZE) # Get the appropriate number of threads for parallelizing # Creating our controller function (called "apply_kernel") to control # # the parallelizing of our computation, using ctree syntax... # apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("localData", pointer()).set_local() ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), Assign(SymbolRef('localResult', (ct.c_int() if A.dtype is np.int32 else ct.c_float())), ArrayRef(SymbolRef('A'), SymbolRef('globalId')) ), For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)), Lt(SymbolRef('offset'), Constant(responsible_size)), PostInc(SymbolRef('offset')), [ Assign(SymbolRef('localResult'), FunctionCall(apply_one.name, [SymbolRef('localResult'), ArrayRef(SymbolRef('A'), Add(SymbolRef('globalId'), Mul(SymbolRef('offset'), Constant(WORK_GROUP_SIZE))))]) ), ] ), Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')), SymbolRef('localResult') ), barrier(CLK_LOCAL_MEM_FENCE()), If(Eq(SymbolRef('globalId'), Constant(0)), [ Assign(SymbolRef('localResult'), FunctionCall(SymbolRef(apply_one.name), [SymbolRef('localResult'), ArrayRef(SymbolRef('localData'),Constant(x))])) for x in range(1, WORK_GROUP_SIZE) ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))] ) ] ).set_kernel() # Hardcoded OpenCL code to compensate to begin execution of parallelized computation control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $local; size_t local = $local; intptr_t len = $length; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, local * sizeof(int), NULL); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2), 'length': Constant(len_A), }) ocl_kernel = OclFile("kernel", [apply_one, apply_kernel]) c_controller = CFile("generated", [control]) return [ocl_kernel, c_controller]
def transform(self, tree, program_config): A = program_config[0] len_A = np.prod(A.shape) inner_type = get_c_type_from_numpy_dtype(A.dtype)() pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_one.params[1].type = inner_type responsible_size = int(len_A / WORK_GROUP_SIZE) apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("localData", pointer()).set_local() ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), Assign(SymbolRef('localResult', ct.c_int()), ArrayRef(SymbolRef('A'), SymbolRef('globalId')) ) ] + [Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'), ArrayRef(SymbolRef('A'),Add(SymbolRef('globalId'), Constant(i * WORK_GROUP_SIZE)))])) for i in range(1, responsible_size)] + [ Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')), SymbolRef('localResult') ), barrier(CLK_LOCAL_MEM_FENCE()), If(Eq(SymbolRef('globalId'), Constant(0)), [ Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'), ArrayRef(SymbolRef('localData'),Constant(x))])) for x in range(1, WORK_GROUP_SIZE) ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))] ) ] ).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $local; size_t local = $local; intptr_t len = $length; cl_mem swap; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, local * sizeof(int), NULL); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2), 'length': Constant(len_A) }) c_controller = CFile("generated", [control]) return [kernel, c_controller]