def generate_body(self, kernel, codegen_state):
    """Build the kernel body: temporary declarations, then the loop code.

    :arg kernel: the kernel whose ``temporary_variables``, ``target`` and
        ``index_dtype`` are used to produce the declarations.
    :arg codegen_state: handed through unchanged to
        ``loopy.codegen.loop.set_up_hw_parallel_loops``.
    :returns: a tuple ``(body, implemented_domains)``. *body* is a
        ``cgen.Block``; *implemented_domains* is read off the result of
        ``set_up_hw_parallel_loops``.
    """
    from cgen import Block, Line
    from loopy.codegen.loop import set_up_hw_parallel_loops

    body = Block()

    # {{{ declare temporaries

    # Visit the temporaries sorted by name so that the order of the emitted
    # declarations does not depend on the ordering of the underlying dict.
    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        body.extend(
                idi.cgen_declarator
                for idi in tv.decl_info(
                    kernel.target,
                    is_written=True,
                    index_dtype=kernel.index_dtype))

    # }}}

    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

    # Blank line between the declarations and the generated code.
    body.append(Line())

    # Splice the contents of a generated Block into the body instead of
    # nesting one Block inside another.
    if isinstance(gen_code.ast, Block):
        body.extend(gen_code.ast.contents)
    else:
        body.append(gen_code.ast)

    return body, gen_code.implemented_domains
def generate_host_or_device_program(codegen_state, schedule_index):
    """Generate the host or device program that starts at *schedule_index*.

    The loop nest is built first. When device code is being generated, the
    schedule item at *schedule_index* must be a ``CallKernel`` and the nest is
    set up through ``set_up_hw_parallel_loops``, continuing at
    ``schedule_index + 1``.

    For device code and for entrypoints, the result is then merged with the
    top-of-body statements and the temporary declarations and wrapped in a
    function definition; both the definition and the body are run through
    ``ast_builder.process_ast``.

    :returns: the resulting codegen result.
    """
    builder = codegen_state.ast_builder
    temporaries = builder.get_temporary_decls(codegen_state, schedule_index)

    from functools import partial

    from loopy.codegen.control import build_loop_nest

    if not codegen_state.is_generating_device_code:
        result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        assert isinstance(
                codegen_state.kernel.linearization[schedule_index], CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops
        rest_of_nest = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        result = set_up_hw_parallel_loops(
                codegen_state, schedule_index, next_func=rest_of_nest)

    # NOTE(review): indentation was lost in the source this was restored
    # from. Everything below is assumed to apply only to device code and
    # entrypoints -- confirm against the original layout.
    if (not codegen_state.is_generating_device_code
            and not codegen_state.is_entrypoint):
        return result

    pieces = (
            builder.generate_top_of_body(codegen_state)
            + temporaries
            + [result])
    result = merge_codegen_results(codegen_state, pieces, collapse=False)

    program = result.current_program(codegen_state)
    body = program.ast

    declaration = builder.get_function_declaration(
            codegen_state, result, schedule_index)
    definition = builder.get_function_definition(
            codegen_state, result, schedule_index, declaration, body)

    processed_definition = builder.process_ast(definition)
    processed_body = builder.process_ast(body)

    return result.with_new_program(
            codegen_state,
            program.copy(ast=processed_definition, body_ast=processed_body))
def generate_host_or_device_program(codegen_state, schedule_index):
    """Generate the host or device program that starts at *schedule_index*.

    The loop nest is built first. When device code is being generated, the
    schedule item at *schedule_index* must be a ``CallKernel`` and the nest is
    set up through ``set_up_hw_parallel_loops``, continuing at
    ``schedule_index + 1``.

    The result is then merged with the top-of-body statements and the
    temporary declarations, and the merged body is wrapped in a function
    definition.

    :returns: the codegen result whose current program carries the function
        definition as ``ast`` and the unwrapped body as ``body_ast``.
    """
    builder = codegen_state.ast_builder
    temporaries = builder.get_temporary_decls(codegen_state, schedule_index)

    from functools import partial

    from loopy.codegen.control import build_loop_nest

    if codegen_state.is_generating_device_code:
        from loopy.schedule import CallKernel
        assert isinstance(
                codegen_state.kernel.schedule[schedule_index], CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops
        continue_nest = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        nest = set_up_hw_parallel_loops(
                codegen_state, schedule_index, next_func=continue_nest)
    else:
        nest = build_loop_nest(codegen_state, schedule_index)

    # Prologue statements, then temporaries, then the generated loop nest.
    parts = (
            builder.generate_top_of_body(codegen_state)
            + temporaries
            + [nest])
    merged = merge_codegen_results(codegen_state, parts, collapse=False)

    program = merged.current_program(codegen_state)
    body = program.ast

    declaration = builder.get_function_declaration(
            codegen_state, merged, schedule_index)
    definition = builder.get_function_definition(
            codegen_state, merged, schedule_index, declaration, body)

    return merged.with_new_program(
            codegen_state,
            program.copy(ast=definition, body_ast=body))
def generate_body(self, kernel, codegen_state):
    """Build the kernel body: temporary declarations, then the loop code.

    Temporaries are visited in order of their name. A temporary without
    ``base_storage`` gets its own declaration (an array declaration if it has
    a shape). A temporary with ``base_storage`` is declared as a
    ``*const restrict`` pointer initialized to an offset into a shared
    ``int8`` array named after the base storage. Each such array is declared
    once, sized to the largest ``nbytes`` among the temporaries that use it,
    and wrapped in an ``AlignedAttribute`` carrying the largest alignment
    computed for them.

    :arg kernel: the kernel whose ``temporary_variables`` and ``index_dtype``
        are used to produce the declarations.
    :arg codegen_state: handed through unchanged to
        ``loopy.codegen.loop.set_up_hw_parallel_loops``.
    :returns: a tuple ``(body, implemented_domains)``. *body* is a
        ``cgen.Block``; *implemented_domains* is read off the result of
        ``set_up_hw_parallel_loops``.
    """
    from cgen import (
            AlignedAttribute, ArrayOf, Block, Initializer, Line, Pointer)
    from loopy.codegen import POD  # uses the correct complex type
    from loopy.codegen.loop import set_up_hw_parallel_loops
    from loopy.kernel.array import VectorArrayDimTag
    from pytools import product, single_valued

    class ConstRestrictPointer(Pointer):
        def get_decl_pair(self):
            sub_tp, sub_decl = self.subdecl.get_decl_pair()
            return sub_tp, ("*const restrict %s" % sub_decl)

    body = Block()
    temp_decls = []

    # {{{ declare temporaries

    # Per base-storage name: one list entry per temporary aliasing it.
    base_storage_sizes = {}
    base_storage_to_is_local = {}
    base_storage_to_align_bytes = {}

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                temp_var_decl = POD(self, idi.dtype, idi.name)

                if idi.shape:
                    temp_var_decl = ArrayOf(
                            temp_var_decl,
                            " * ".join(str(s) for s in idi.shape))

                temp_decls.append(
                        self.wrap_temporary_decl(temp_var_decl, tv.is_local))

            continue

        # Byte offset of the next declaration within this base storage.
        offset = 0

        base_storage_sizes.setdefault(tv.base_storage, []).append(
                tv.nbytes)
        base_storage_to_is_local.setdefault(tv.base_storage, []).append(
                tv.is_local)

        # Start from the item size and scale by the length of every
        # vector-tagged axis.
        align_size = tv.dtype.itemsize
        for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
            if isinstance(dim_tag, VectorArrayDimTag):
                align_size *= axis_len

        base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                align_size)

        for idi in decl_info:
            cast_decl = POD(self, idi.dtype, "")
            temp_var_decl = POD(self, idi.dtype, idi.name)

            cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
            temp_var_decl = self.wrap_temporary_decl(
                    temp_var_decl, tv.is_local)

            # The 'restrict' part of this is a complete lie--of course
            # all these temporaries are aliased. But we're promising to
            # not use them to shovel data from one representation to the
            # other. That counts, right?

            cast_decl = ConstRestrictPointer(cast_decl)
            temp_var_decl = ConstRestrictPointer(temp_var_decl)

            cast_tp, cast_d = cast_decl.get_decl_pair()
            temp_var_decl = Initializer(
                    temp_var_decl,
                    "(%s %s) (%s + %s)" % (
                        " ".join(cast_tp), cast_d,
                        tv.base_storage,
                        offset))

            temp_decls.append(temp_var_decl)

            offset += idi.dtype.itemsize * product(idi.shape)

    for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
        bs_var_decl = POD(self, np.int8, bs_name)

        # wrap_temporary_decl() takes one is_local flag, but a list of flags
        # was collected per base storage. Passing the list itself would
        # always be truthy, so reduce it to the single value that all
        # temporaries sharing this storage must agree on.
        bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl,
                single_valued(base_storage_to_is_local[bs_name]))

        bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        body.append(bs_var_decl)

    # The pointer declarations refer to the base storage arrays, so they
    # are emitted after them.
    body.extend(temp_decls)

    # }}}

    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

    # Blank line between the declarations and the generated code.
    body.append(Line())

    # Splice the contents of a generated Block into the body instead of
    # nesting one Block inside another.
    if isinstance(gen_code.ast, Block):
        body.extend(gen_code.ast.contents)
    else:
        body.append(gen_code.ast)

    return body, gen_code.implemented_domains