def gen_decls(dim, ghost_depth): thread_id = get_local_id(dim - 1) num_threads = get_local_size(dim - 1) block_size = Add( get_local_size(dim - 1), Constant(ghost_depth[dim - 1] * 2) ) for d in reversed(range(0, dim - 1)): base = get_local_size(dim - 1) for s in range(d, dim - 2): base = Mul(get_local_size(s + 1), base) thread_id = Add( Mul(get_local_id(d), base), thread_id ) num_threads = Mul(get_local_size(d), num_threads) block_size = Mul( Add(get_local_size(d), Constant(ghost_depth[d] * 2)), block_size ) return thread_id, num_threads, block_size
def local_array_macro(self, point): dim = len(self.output_grid.shape) index = get_local_id(dim) for d in reversed(range(dim)): index = Add( Mul( index, Add( get_local_size(d), Constant(2 * self.ghost_depth[d]) ), ), point[d] ) return FunctionCall(SymbolRef("local_array_macro"), point)
def transform(self, tree, program_config): A = program_config[0] len_A = np.prod(A.shape) inner_type = A.dtype.type() pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_one.params[1].type = inner_type apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("len", ct.c_int()) ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), # getting the group id for this work group Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), # getting the global id for this work item Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), # getting the local id for this work item For(Assign(SymbolRef('i', ct.c_int()), Constant(1)), # for(int i=1; i<WORK_GROUP_SIZE; i *= 2) Lt(SymbolRef('i'), Constant(WORK_GROUP_SIZE)), MulAssign(SymbolRef('i'), Constant(2)), [ If(And(Eq(Mod(SymbolRef('globalId'), Mul(SymbolRef('i'), Constant(2))), # if statement checks Constant(0)), Lt(Add(SymbolRef('globalId'), SymbolRef('i')), SymbolRef("len"))), [ Assign(ArrayRef(SymbolRef('A'), SymbolRef('globalId')), FunctionCall(SymbolRef('apply'), [ ArrayRef(SymbolRef('A'), SymbolRef('globalId')), ArrayRef(SymbolRef('A'), Add(SymbolRef('globalId'), SymbolRef('i'))) ])), ] ), FunctionCall(SymbolRef('barrier'), [SymbolRef('CLK_LOCAL_MEM_FENCE')]) ] ), If(Eq(SymbolRef('localId'), Constant(0)), [ Assign(ArrayRef(SymbolRef('output_buf'), SymbolRef('groupId')), ArrayRef(SymbolRef('A'), SymbolRef('globalId'))) ] ) ] ).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $n; size_t local = $local; intptr_t len = $length; cl_mem swap; for (int runs = 0; runs < $run_limit ; runs++){ clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, sizeof(intptr_t), &len); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); swap = buf; buf = out_buf; out_buf = swap; len = len/local + (len % local != 0); } } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant(len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE)), 'length': Constant(len_A), 'run_limit': Constant(ceil(log(len_A, WORK_GROUP_SIZE))) }) proj = Project([kernel, CFile("generated", [control])]) fn = ConcreteXorReduction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def visit_InteriorPointsLoop(self, node): dim = len(self.output_grid.shape) self.kernel_target = node.target condition = And( Lt(get_global_id(0), Constant(self.arg_cfg[0].shape[0] - self.ghost_depth[0])), GtE(get_global_id(0), Constant(self.ghost_depth[0])) ) for d in range(1, len(self.arg_cfg[0].shape)): condition = And( condition, And( Lt(get_global_id(d), Constant(self.arg_cfg[0].shape[d] - self.ghost_depth[d])), GtE(get_global_id(d), Constant(self.ghost_depth[d])) ) ) body = [] self.macro_defns = [ CppDefine("local_array_macro", ["d%d" % i for i in range(dim)], self.gen_local_macro()), CppDefine("global_array_macro", ["d%d" % i for i in range(dim)], self.gen_global_macro()) ] body.extend(self.macro_defns) global_idx = 'global_index' self.output_index = global_idx body.append(Assign(SymbolRef('global_index', ct.c_int()), self.gen_global_index())) self.load_mem_block = self.load_shared_memory_block( self.local_block, self.ghost_depth) body.extend(self.load_mem_block) body.append(FunctionCall(SymbolRef("barrier"), [SymbolRef("CLK_LOCAL_MEM_FENCE")])) for d in range(0, dim): body.append(Assign(SymbolRef('local_id%d' % d, ct.c_int()), Add(get_local_id(d), Constant(self.ghost_depth[d])))) self.var_list.append("local_id%d" % d) for child in map(self.visit, node.body): if isinstance(child, list): self.stencil_op.extend(child) else: self.stencil_op.append(child) conditional = None for dim in range(len(self.output_grid.shape)): if self.virtual_global_size[dim] != self.global_size[dim]: if conditional is None: conditional = Lt(get_global_id(dim), Constant(self.global_size[dim])) else: conditional = And(conditional, Lt(get_global_id(dim), Constant(self.global_size[dim]))) if conditional is not None: body.append(If(conditional, self.stencil_op)) else: body.extend(self.stencil_op) # this does not help fix the failure # body.append(FunctionCall(SymbolRef("barrier"), # [SymbolRef("CLK_GLOBAL_MEM_FENCE")])) # body.extend(self.stencil_op) # # this line does seem to fix the problem, seems to suggest some timing # issue # # body.append(If(conditional, # [StringTemplate("out_grid[global_index]+=0;")])) # # the following fixes the problem too, suggests timing issues # # body.append(FunctionCall(SymbolRef("printf"), [String("gid %d\\n"), # SymbolRef("global_index")])) # from ctree.ocl.macros import get_group_id # body.append( # FunctionCall( # SymbolRef("printf"), # [ # String("group_id %2d %2d gid %2d %2d %2d\\n"), # get_global_id(0), # get_group_id(1), # get_global_id(0), # get_global_id(1), # SymbolRef('global_index'), # ] # ) # ) return body
def transform(self, tree, program_config): dirname = self.config_to_dirname(program_config) A = program_config[0] len_A = np.prod(A.shape) data_type = get_c_type_from_numpy_dtype(A.dtype) # Get the ctype class for the data type for the parameters pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree).find(FunctionDecl) apply_one.name = 'apply' # Naming our kernel method # Assigning a data_type instance for the # # return type, and the parameter types... # apply_one.return_type = data_type() apply_one.params[0].type = data_type() apply_one.params[1].type = data_type() responsible_size = int(len_A / WORK_GROUP_SIZE) # Get the appropriate number of threads for parallelizing # Creating our controller function (called "apply_kernel") to control # # the parallelizing of our computation, using ctree syntax... # apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("localData", pointer()).set_local() ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), Assign(SymbolRef('localResult', (ct.c_int() if A.dtype is np.int32 else ct.c_float())), ArrayRef(SymbolRef('A'), SymbolRef('globalId')) ), For(Assign(SymbolRef('offset', ct.c_int()), Constant(1)), Lt(SymbolRef('offset'), Constant(responsible_size)), PostInc(SymbolRef('offset')), [ Assign(SymbolRef('localResult'), FunctionCall(apply_one.name, [SymbolRef('localResult'), ArrayRef(SymbolRef('A'), Add(SymbolRef('globalId'), Mul(SymbolRef('offset'), Constant(WORK_GROUP_SIZE))))]) ), ] ), Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')), SymbolRef('localResult') ), barrier(CLK_LOCAL_MEM_FENCE()), If(Eq(SymbolRef('globalId'), Constant(0)), [ Assign(SymbolRef('localResult'), FunctionCall(SymbolRef(apply_one.name), [SymbolRef('localResult'), ArrayRef(SymbolRef('localData'),Constant(x))])) for x in range(1, WORK_GROUP_SIZE) ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))] ) ] ).set_kernel() # Hardcoded OpenCL code to compensate to begin execution of parallelized computation control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $local; size_t local = $local; intptr_t len = $length; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, local * sizeof(int), NULL); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2), 'length': Constant(len_A), }) ocl_kernel = OclFile("kernel", [apply_one, apply_kernel]) c_controller = CFile("generated", [control]) return [ocl_kernel, c_controller]
def transform(self, tree, program_config): A = program_config[0] len_A = np.prod(A.shape) inner_type = get_c_type_from_numpy_dtype(A.dtype)() pointer = np.ctypeslib.ndpointer(A.dtype, A.ndim, A.shape) apply_one = PyBasicConversions().visit(tree.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_one.params[1].type = inner_type responsible_size = int(len_A / WORK_GROUP_SIZE) apply_kernel = FunctionDecl(None, "apply_kernel", params=[SymbolRef("A", pointer()).set_global(), SymbolRef("output_buf", pointer()).set_global(), SymbolRef("localData", pointer()).set_local() ], defn=[ Assign(SymbolRef('groupId', ct.c_int()), get_group_id(0)), Assign(SymbolRef('globalId', ct.c_int()), get_global_id(0)), Assign(SymbolRef('localId', ct.c_int()), get_local_id(0)), Assign(SymbolRef('localResult', ct.c_int()), ArrayRef(SymbolRef('A'), SymbolRef('globalId')) ) ] + [Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'), ArrayRef(SymbolRef('A'),Add(SymbolRef('globalId'), Constant(i * WORK_GROUP_SIZE)))])) for i in range(1, responsible_size)] + [ Assign(ArrayRef(SymbolRef('localData'), SymbolRef('globalId')), SymbolRef('localResult') ), barrier(CLK_LOCAL_MEM_FENCE()), If(Eq(SymbolRef('globalId'), Constant(0)), [ Assign(SymbolRef('localResult'), FunctionCall(SymbolRef('apply'), [SymbolRef('localResult'), ArrayRef(SymbolRef('localData'),Constant(x))])) for x in range(1, WORK_GROUP_SIZE) ] + [Assign(ArrayRef(SymbolRef('output_buf'), Constant(0)), SymbolRef('localResult'))] ) ] ).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate(r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <stdio.h> void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf, cl_mem out_buf) { size_t global = $local; size_t local = $local; intptr_t len = $length; cl_mem swap; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_buf); clSetKernelArg(kernel, 2, local * sizeof(int), NULL); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'local': Constant(WORK_GROUP_SIZE), 'n': Constant((len_A + WORK_GROUP_SIZE - (len_A % WORK_GROUP_SIZE))/2), 'length': Constant(len_A) }) c_controller = CFile("generated", [control]) return [kernel, c_controller]