def visit_FunctionDef(self, node):
    """Rewrite the stencil ``kernel`` function's argument list.

    Drops the leading ``self`` parameter and renames the remaining
    arguments — either to the caller-supplied ``self.arg_names`` or to
    fresh unique names — recording each old->new mapping in
    ``self.arg_name_map`` so later passes can translate references.

    :param node: the ``ast.FunctionDef`` node being visited.
    :return: whatever the superclass visitor returns for the
        (possibly modified) node.
    """
    if node.name == 'kernel':
        # The generated OpenCL kernel receives only the grid arguments,
        # so the bound-method `self` parameter is stripped.
        node.args.args = node.args.args[1:]
        if self.arg_names is not None:  # pragma no cover
            for index, arg in enumerate(node.args.args):
                new_name = self.arg_names[index]
                if sys.version_info >= (3, 0):
                    # BUG FIX: on Python 3 an ``ast.arg`` node stores its
                    # name in ``.arg``, not ``.id``.  The original code
                    # recorded the mapping from ``arg.arg`` but then wrote
                    # the new name to ``arg.id``, so the rename was never
                    # visible to the rest of the pipeline.
                    self.arg_name_map[arg.arg] = new_name
                    arg.arg = new_name
                else:
                    # Python 2: arguments are ``ast.Name`` nodes with ``.id``.
                    self.arg_name_map[arg.id] = new_name
                    arg.id = new_name
        else:
            for index, arg in enumerate(node.args.args):  # pragma no cover
                # No explicit names supplied: generate collision-free ones.
                name = SymbolRef.unique().name
                if sys.version_info >= (3, 0):
                    self.arg_name_map[arg.arg] = name
                    arg.arg = name
                else:
                    self.arg_name_map[arg.id] = name
                    arg.id = name
    return super(PythonToStencilModel, self).visit_FunctionDef(node)
def visit_FunctionDecl(self, node):
    """Turn the translated stencil kernel into a complete OpenCL launch.

    This visitor (a) computes global/local/virtual-global work sizes,
    (b) retypes the kernel's parameters as __global/__local float
    pointers and appends a __local scratch block, (c) emits the kernel
    (plus per-dimension boundary kernels when ``self.is_copied``) into
    ``self.project`` as OclFile entries, and (d) builds and returns a C
    ``stencil_control`` function that sets kernel arguments, enqueues
    the kernel(s), and returns the OpenCL error code.

    :param node: the FunctionDecl produced by the superclass transform.
    :return: a new ctree ``FunctionDecl`` named ``stencil_control``.
    """
    # This function grabs the input and output grid names which are used to
    # generate the proper array macros.
    self.local_block = SymbolRef.unique()
    arg_cfg = self.arg_cfg
    # The first argument's shape defines the global iteration space.
    global_size = arg_cfg[0].shape
    if self.testing:
        # Deterministic trivial work-group size for tests; avoids
        # querying a real OpenCL device.
        local_size = (1, 1, 1)
    else:
        # Pick the last reported device and let LocalSizeComputer choose
        # a work-group shape plus a padded ("virtual") global size.
        desired_device_number = -1
        device = cl.clGetDeviceIDs()[desired_device_number]
        lcs = LocalSizeComputer(global_size, device)
        local_size = lcs.compute_local_size_bulky()
        # NOTE(review): placement of this line relative to the if/else was
        # ambiguous in the original formatting; here it only runs on the
        # non-testing path (where ``lcs`` exists) — confirm that the
        # testing path is expected to never reach the uses below.
        virtual_global_size = lcs.compute_virtual_global_size(local_size)
    self.global_size = global_size
    self.local_size = local_size
    self.virtual_global_size = virtual_global_size
    # Let the base transformer rewrite the kernel body first.
    super(StencilOclTransformer, self).visit_FunctionDecl(node)
    # All parameters but the last are input grids: global const float*.
    for index, param in enumerate(node.params[:-1]):
        # TODO: Transform numpy type to ctype
        param.type = ct.POINTER(ct.c_float)()
        param.set_global()
        param.set_const()
    node.set_kernel()
    # Last declared parameter is the (writable) output grid.
    node.params[-1].set_global()
    node.params[-1].type = ct.POINTER(ct.c_float)()
    # Extra trailing parameter: the __local scratch buffer the kernel
    # body uses for tiling (size is passed at clSetKernelArg time below).
    node.params.append(SymbolRef(self.local_block.name,
                                 ct.POINTER(ct.c_float)()))
    node.params[-1].set_local()
    node.defn = node.defn[0]
    # if boundary handling is copy we have to generate a collection of
    # boundary kernels to handle the on-gpu boundary copy
    if self.is_copied:
        device = cl.clGetDeviceIDs()[-1]
        self.boundary_handlers = boundary_kernel_factory(
            self.ghost_depth, self.output_grid,
            node.params[0].name,
            node.params[-2].name,  # second last parameter is output
            device
        )
        # One boundary kernel per handler; they share the main kernel's
        # parameter list so argument indices line up in the control code.
        boundary_kernels = [
            FunctionDecl(
                name=boundary_handler.kernel_name,
                params=node.params,
                defn=boundary_handler.generate_ocl_kernel_body(),
            )
            for boundary_handler in self.boundary_handlers
        ]
        self.project.files.append(OclFile('kernel', [node]))
        for dim, boundary_kernel in enumerate(boundary_kernels):
            boundary_kernel.set_kernel()
            self.project.files.append(
                OclFile(kernel_dim_name(dim), [boundary_kernel]))
        self.boundary_kernels = boundary_kernels
        # ctree.browser_show_ast(node)
        # import ctree
        # ctree.browser_show_ast(boundary_kernels[0])
    else:
        self.project.files.append(OclFile('kernel', [node]))
    # print(self.project.files[0])
    # print(self.project.files[-1])
    # --- Build the C control function that launches the kernel(s). ---
    # size_t arrays for the launch dimensions, plus an error accumulator.
    defn = [
        ArrayDef(
            SymbolRef('global', ct.c_ulong()), arg_cfg[0].ndim,
            [Constant(d) for d in self.virtual_global_size]
        ),
        ArrayDef(
            SymbolRef('local', ct.c_ulong()), arg_cfg[0].ndim,
            [Constant(s) for s in local_size]
            # [Constant(s) for s in [512, 512]]  # use this line to force a
            # opencl local size error
        ),
        Assign(SymbolRef("error_code", ct.c_int()), Constant(0)),
    ]
    # clSetKernelArg for each grid buffer (inputs + output = len+1 args).
    setargs = [clSetKernelArg(
        SymbolRef('kernel'), Constant(d),
        FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
        Ref(SymbolRef('buf%d' % d))
    ) for d in range(len(arg_cfg) + 1)]
    from functools import reduce
    import operator
    # Bytes of __local scratch: product over dimensions of
    # (local_size + 2 * ghost_depth), times sizeof(cl_float) as the seed.
    local_mem_size = reduce(
        operator.mul,
        (size + 2 * self.kernel.ghost_depth[index]
         for index, size in enumerate(local_size)),
        ct.sizeof(cl.cl_float())
    )
    # Final argument: the __local buffer (size only, NULL value pointer).
    setargs.append(
        clSetKernelArg(
            # NOTE(review): every sibling call wraps the kernel name in
            # SymbolRef(...); a bare string here looks inconsistent —
            # verify clSetKernelArg accepts it.
            'kernel', len(arg_cfg) + 1,
            local_mem_size,
            NULL()
        )
    )
    defn.extend(setargs)
    enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [
        SymbolRef('queue'), SymbolRef('kernel'),
        Constant(self.kernel.dim), NULL(),
        SymbolRef('global'), SymbolRef('local'),
        Constant(0), NULL(), NULL()
    ])
    # Wrap the enqueue in error-code checking statements.
    defn.extend(check_ocl_error(enqueue_call, "clEnqueueNDRangeKernel"))
    params = [
        SymbolRef('queue', cl.cl_command_queue()),
        SymbolRef('kernel', cl.cl_kernel())
    ]
    if self.is_copied:
        # Mirror the main launch for each per-dimension boundary kernel:
        # its own size arrays, argument binding, and enqueue.
        for dim, boundary_kernel in enumerate(self.boundary_kernels):
            defn.extend([
                ArrayDef(
                    SymbolRef(global_for_dim_name(dim), ct.c_ulong()),
                    arg_cfg[0].ndim,
                    [Constant(d)
                     for d in self.boundary_handlers[dim].global_size]
                ),
                ArrayDef(
                    SymbolRef(local_for_dim_name(dim), ct.c_ulong()),
                    arg_cfg[0].ndim,
                    [Constant(s)
                     for s in self.boundary_handlers[dim].local_size]
                )
            ])
            setargs = [clSetKernelArg(
                SymbolRef(kernel_dim_name(dim)), Constant(d),
                FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]),
                Ref(SymbolRef('buf%d' % d))
            ) for d in range(len(arg_cfg) + 1)]
            setargs.append(
                clSetKernelArg(
                    SymbolRef(kernel_dim_name(dim)),
                    len(arg_cfg) + 1,
                    local_mem_size,
                    NULL()
                )
            )
            defn.extend(setargs)
            enqueue_call = FunctionCall(
                SymbolRef('clEnqueueNDRangeKernel'), [
                    SymbolRef('queue'), SymbolRef(kernel_dim_name(dim)),
                    Constant(self.kernel.dim), NULL(),
                    SymbolRef(global_for_dim_name(dim)),
                    SymbolRef(local_for_dim_name(dim)),
                    Constant(0), NULL(), NULL()
                ]
            )
            # NOTE: boundary enqueues are appended without the
            # check_ocl_error wrapper used for the main kernel.
            defn.append(enqueue_call)
            params.extend([
                SymbolRef(kernel_dim_name(dim), cl.cl_kernel())
            ])
    # finish_call = FunctionCall(SymbolRef('clFinish'),
    #                            [SymbolRef('queue')])
    # defn.append(finish_call)
    # finish_call = [
    #     Assign(
    #         SymbolRef("error_code", ct.c_int()),
    #         FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')])
    #     ),
    #     If(
    #         NotEq(SymbolRef("error_code"), Constant(0)),
    #         FunctionCall(
    #             SymbolRef("printf"),
    #             [
    #                 String("OPENCL KERNEL RETURNED ERROR CODE %d"),
    #                 SymbolRef("error_code")
    #             ]
    #         )
    #     )
    # ]
    # Block until all enqueued kernels complete, checking the result.
    finish_call = check_ocl_error(
        FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]),
        "clFinish"
    )
    defn.extend(finish_call)
    defn.append(Return(SymbolRef("error_code")))
    # One cl_mem parameter per grid buffer, matching the setargs above.
    params.extend(SymbolRef('buf%d' % d, cl.cl_mem())
                  for d in range(len(arg_cfg) + 1))
    control = FunctionDecl(ct.c_int32(), "stencil_control",
                           params=params, defn=defn)
    return control