def visit_FunctionDecl(self, node): super(StencilCTransformer, self).visit_FunctionDecl(node) for index, arg in enumerate(self.input_grids + (self.output_grid, )): defname = "_%s_array_macro" % node.params[index].name params = ','.join(["_d" + str(x) for x in range(arg.ndim)]) params = "(%s)" % params calc = "((_d%d)" % (arg.ndim - 1) for x in range(arg.ndim - 1): ndim = str(int(arg.strides[x] / arg.itemsize)) calc += "+((_d%s) * %s)" % (str(x), ndim) calc += ")" params = ["_d" + str(x) for x in range(arg.ndim)] node.defn.insert(0, CppDefine(defname, params, calc)) abs_decl = FunctionDecl(c_int(), SymbolRef('abs'), [SymbolRef('n', c_int())]) macro = CppDefine( "min", [SymbolRef('_a'), SymbolRef('_b')], TernaryOp(Lt(SymbolRef('_a'), SymbolRef('_b')), SymbolRef('_a'), SymbolRef('_b'))) node.params.append(SymbolRef('duration', POINTER(c_float))) start_time = Assign(StringTemplate('clock_t start_time'), FunctionCall(SymbolRef('clock'))) node.defn.insert(0, start_time) end_time = Assign( Deref(SymbolRef('duration')), Div(Sub(FunctionCall(SymbolRef('clock')), SymbolRef('start_time')), SymbolRef('CLOCKS_PER_SEC'))) node.defn.append(end_time) return [StringTemplate("#include <time.h>"), abs_decl, macro, node]
def _gen_reduce_for_loop(self, loop, var, size): looplen1 = loop.test.right loopincr = loop.incr.value.value kernel_name = self._gen_unique_kernel_name() kernel_src = StringTemplate(""" __kernel void $kernel_name(__global float * $arr) { int x = get_global_id(0); float sum = $arr[x]; #pragma unroll for (int i = 1; i < $batch_size; ++ i) { sum += $arr[i * $size + x]; } $arr[x] = sum; } """, {'batch_size': C.Constant(self.batch_size), 'arr': C.SymbolRef(var), 'size': C.Constant(size), 'kernel_name': C.SymbolRef(kernel_name)}) program = cl.clCreateProgramWithSource( latte.config.cl_ctx, kernel_src.codegen()).build() kernel = program[kernel_name] self.kernels[kernel_name] = kernel kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem)) return StringTemplate( """ size_t global_size_{kernel_name}[1] = {{{looplen1}}}; clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL); clFinish(queue); """.format( kernel_name=kernel_name, looplen1=size) )
def transform(self, py_ast, program_cfg): arg_cfg, tune_cfg = program_cfg tree = PyBasicConversions().visit(py_ast) param_dict = {} tree.body[0].params.append(C.SymbolRef("retval", arg_cfg[0]())) # Annotate arguments for param, type in zip(tree.body[0].params, arg_cfg): param.type = type() param_dict[param.name] = type._dtype_ length = np.prod(arg_cfg[0]._shape_) transformer = MapTransformer("i", param_dict, "retval") body = list(map(transformer.visit, tree.body[0].defn)) tree.body[0].defn = [C.For( C.Assign(C.SymbolRef("i", ct.c_int()), C.Constant(0)), C.Lt(C.SymbolRef("i"), C.Constant(length)), C.PostInc(C.SymbolRef("i")), body=body, pragma="ivdep" )] tree = DeclarationFiller().visit(tree) defns = [] tree = HwachaVectorize(param_dict, defns).visit(tree) file_body = [ StringTemplate("#include <stdlib.h>"), StringTemplate("#include <stdint.h>"), StringTemplate("#include <assert.h>"), StringTemplate("extern \"C\" void __hwacha_body(void);"), ] file_body.extend(defns) file_body.append(tree) return [CFile("generated", file_body)]
def _gen_reduce_for_loop(self, loop, var, size): looplen1 = loop.test.right loopincr = loop.incr.value.value return StringTemplate(""" //{ // ContinueNode *$reduce_node = new ContinueNode(&graph, [=]() { parallel_for(0,$size, [=](int low, int high) { #pragma simd for (int x = low; x < high; ++x) { float sum = _$arr[x]; #pragma unroll for (int i = 1; i < $batch_size; ++ i) { sum += _$arr[i * $size + x]; } _$arr[x] = sum; } }); // }); // for (int i = 0; i < $looplen1; i+=$loopincr) { // make_edge($node_list[i], $reduce_node); // } //}; """, {'size': C.Constant(size), 'batch_size': C.Constant(self.batch_size), 'arr': C.SymbolRef(var), 'node_list': C.SymbolRef("node_list_"), 'reduce_node': C.SymbolRef("reduce_node_"), 'looplen1': C.Constant(looplen1.value), 'loopincr': C.Constant(loopincr) })
def insert_malloc(body, shape, name, dtype, _global=False): shape_str = "".join("[{}]".format(d) for d in shape[1:]) size = 1 for d in shape: size *= d body.insert( 0, StringTemplate( "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;", { "arg_name0": C.SymbolRef(name), "arg_name1": C.FunctionCall(C.SymbolRef('_mm_malloc'), [ C.Mul( C.Constant(size), C.FunctionCall(C.SymbolRef('sizeof'), [ ctree.types.codegen_type( ctree.types.get_c_type_from_numpy_dtype(dtype) ()) ])), C.Constant(64) ]), "shape": C.SymbolRef(shape_str), "cast": C.SymbolRef(shape_str), "type": C.SymbolRef( ctree.types.codegen_type( ctree.types.get_c_type_from_numpy_dtype(dtype)())), "global": C.SymbolRef("__global " if _global else "") }))
def test_child_nested_list(self): d = {'stmts': [[Constant(1)], Constant(2)]} t = """$stmts""" tree = StringTemplate(t, d) self._check(tree, """\ 1; 2;""")
def test_child_single_list(self): d = { 'stmts': [Constant(1)] } t = """$stmts""" tree = StringTemplate(t, d) self._check(tree, "1")
def transform(self, py_ast, program_config): """ Convert the Python AST to a C AST according to the directions given in program_config. """ tune_cfg = program_config[1] x, y = tune_cfg['x'], tune_cfg['y'] template_args = { 'min_x': Constant(self.min_x), 'min_y': Constant(self.min_y), 'x': Constant(x), 'y': Constant(y), } cfile = CFile("generated", [ StringTemplate( """\ // paraboloid with global min of 1 at ($min_x, $min_y) float get_height() { return ($x-$min_x)*($x-$min_x) + ($y-$min_y)*($y-$min_y) + 1.0; }""", template_args) ]) entry_typesig = FuncType(Float(), []) return Project([cfile]), entry_typesig.as_ctype()
def insert_free(name): return (StringTemplate( "$free;", { "free": C.FunctionCall(C.SymbolRef('_mm_free'), [C.SymbolRef(name)]), }))
def _gen_k_rank1_updates(self, rx, ry, cx, cy, unroll, lda): stmts = [StringTemplate("// do K rank-1 updates")] for i in range(ry / 4): stmts.append(SymbolRef("a%d" % i, m256d())) stmts.append(SymbolRef("b", m256d())) stmts.extend( self._gen_rank1_update(i, rx, ry, cx, cy, lda) for i in range(unroll)) return Block(stmts)
def scalar_init(scalar): name = "".join(number_dict[digit] for digit in str(scalar)) return StringTemplate(""" union {{ float f; uint32_t i; }} {name}; {name}.f = {scalar}f; """.format(name=name, scalar=scalar))
def transform(self, py_ast, program_config): """ Convert the Python AST to a C AST according to the directions given in program_config. """ A = program_config[0] len_A = np.prod(A._shape_) inner_type = A._dtype_.type() # browser_show_ast(py_ast,'tmp.png') apply_one = PyBasicConversions().visit(py_ast.body[0]) apply_one.return_type = inner_type apply_one.params[0].type = inner_type apply_kernel = FunctionDecl( None, "apply_kernel", params=[SymbolRef("A", A()).set_global()], defn=[ Assign(SymbolRef("i", ct.c_int()), get_global_id(0)), If(Lt(SymbolRef("i"), Constant(len_A)), [ Assign( ArrayRef(SymbolRef("A"), SymbolRef("i")), FunctionCall( SymbolRef("apply"), [ArrayRef(SymbolRef("A"), SymbolRef("i"))])), ], []), ]).set_kernel() kernel = OclFile("kernel", [apply_one, apply_kernel]) control = StringTemplate( r""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf) { size_t global = $n; size_t local = 32; clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); } """, {'n': Constant(len_A + 32 - (len_A % 32))}) proj = Project([kernel, CFile("generated", [control])]) fn = OpFunction() program = cl.clCreateProgramWithSource(fn.context, kernel.codegen()).build() apply_kernel_ptr = program['apply_kernel'] entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel, cl.cl_mem) return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
def visit_CFile(self, node): self.defns = [] self.includes = set() node = super(Backend, self).generic_visit(node) for defn in self.defns: node.body.insert(0, defn) for include in self.includes: node.body.insert(0, StringTemplate("#include <{}>".format(include))) return node
def test_indent_1(self): d = {'cond': Constant(1)} t = """\ while($cond) printf("hello"); """ tree = StringTemplate(t, d) self._check(tree, """\ while(1) printf("hello");""")
def test_template_parent_pointers(self): from ctree.c.nodes import SymbolRef symbol = SymbolRef("hello") template = "char *str = $val" template_args = { 'val': symbol, } node = StringTemplate(template, template_args) self.assertIs(symbol.parent, node)
def get_load_a_block(self, transpose, template_args): if transpose: raise NotImplementedError() else: return StringTemplate( """ // make a local aligned copy of A's block for( int j = 0; j < K; j++ ) for( int i = 0; i < M; i++ ) a[i+j*$CY] = A[i+j*$lda]; """, template_args)
def transform(self, tree, program_cfg): arg_cfg, tune_cfg = program_cfg # tree = Desugar().visit(tree) inliner = InlineEnvironment(self.symbol_table) tree = inliner.visit(tree) tree = PyBasicConversions().visit(tree) tree.body = inliner.files + tree.body # tree.find(C.For).pragma = 'omp parallel for' tree.name = self.original_tree.body[0].name tree.body.insert(0, StringTemplate("#include <math.h>")) # print(tree) return [tree]
def _gen_store_c_block(self, rx, ry, lda): """ Return a subtree that loads a block of 'c'. """ stmts = [StringTemplate("// Store the c block")] for j in range(rx): for i in range(ry / 4): stmt = mm256_storeu_pd( Add(SymbolRef("C"), Constant(i * 4 + j * lda)), MultiArrayRef("c", i, j)) stmts.append(stmt) return Block(stmts)
def test_indent_0(self): d = {'cond': Constant(1)} t = """\ while($cond) printf("hello"); """ tree = While(Constant(0), [StringTemplate(t, d)]) self._check(tree, """\ while (0) { while(1) printf("hello"); }""")
def _gen_load_c_block(self, rx, ry, lda): """ Return a subtree that loads a block of 'c'. """ stmts = [StringTemplate("// Load a block of c", {})] for j in range(rx): for i in range(ry / 4): stmt = Assign( MultiArrayRef("c", i, j), mm256_loadu_pd( Add(SymbolRef("C"), Constant(i * 4 + j * lda)))) stmts.append(stmt) return Block(stmts)
def transform(self, tree, program_cfg): arg_cfg, tune_cfg = program_cfg channels, height, width = arg_cfg[0] cfg = { 'pad_h': C.Constant(self.pad_h), 'pad_w': C.Constant(self.pad_w), 'stride_h': C.Constant(self.stride_h), 'stride_w': C.Constant(self.stride_w), 'kernel_h': C.Constant(self.kernel_h), 'kernel_w': C.Constant(self.kernel_w), 'channels': C.Constant(channels), 'height': C.Constant(height), 'width': C.Constant(width), } im2col = C.FunctionDecl( None, C.SymbolRef("im2col"), [C.SymbolRef("data_im", arg_cfg[1]()), C.SymbolRef("data_col", arg_cfg[1]())], [StringTemplate(""" int stride_h = $stride_h; int stride_w = $stride_w; int pad_h = $pad_h; int pad_w = $pad_w; int kernel_h = $kernel_h; int kernel_w = $kernel_w; int channels = $channels; int height = $height; int width = $width; int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; for (int c = 0; c < channels_col; ++c) { int w_offset = c % kernel_w; int h_offset = (c / kernel_w) % kernel_h; int c_im = c / kernel_h / kernel_w; for (int h = 0; h < height_col; ++h) { for (int w = 0; w < width_col; ++w) { int h_pad = h * stride_h - pad_h + h_offset; int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) data_col[(c * height_col + h) * width_col + w] = data_im[(c_im * height + h_pad) * width + w_pad]; else data_col[(c * height_col + h) * width_col + w] = 0; } } } """, cfg)]) return [C.CFile('im2col', [im2col])]
def get_asm_body(node, scalars, refs, type_map): body = """ __asm__ volatile ( ".align 3\\n" "__hwacha_body:\\n" """ asm_body = [] translator = HwachaASMTranslator(scalars, refs, asm_body, type_map) for s in node.body: translator.visit(s) for s in asm_body: body += "\"" + s + "\"\n" body += "\" vstop\\n\"\n" body += " );" return StringTemplate(body)
def test_template_with_transformer(self): from ctree.visitors import NodeTransformer from ctree.c.nodes import String, SymbolRef template = "char *str = $val" template_args = { 'val': SymbolRef("hello"), } tree = StringTemplate(template, template_args) self._check(tree, 'char *str = hello') class SymbolsToStrings(NodeTransformer): def visit_SymbolRef(self, node): return String(node.name) tree = SymbolsToStrings().visit(tree) self._check(tree, 'char *str = "hello"')
def test_template_parent_pointers_with_transformer(self): from ctree.visitors import NodeTransformer from ctree.c.nodes import String, SymbolRef template = "char *str = $val" template_args = { 'val': SymbolRef("hello"), } class SymbolsToStrings(NodeTransformer): def visit_SymbolRef(self, node): return String(node.name) tree = StringTemplate(template, template_args) tree = SymbolsToStrings().visit(tree) template_node, string = tree, tree.val self.assertIs(string.parent, template_node)
def build_kernel(self, kernel_src, kernel_name, kernel_args): kernel_src = C.CFile('generated', [StringTemplate( """ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) """ ), kernel_src]) try: program = cl.clCreateProgramWithSource( latte.config.cl_ctx, kernel_src.codegen()).build() kernel = program[kernel_name] except cl.BuildProgramFailureError as e: logger.error("Failed build program:\n %s", kernel_src.codegen()) raise e self.kernels[kernel_name] = kernel for index, arg in enumerate(kernel_args): kernel.setarg(index, self.cl_buffers[arg], ctypes.sizeof(cl.cl_mem)) logger.debug(kernel_src)
def visit_FunctionCall(self, node): if isinstance(node.func, C.SymbolRef): if node.func.name == "rand": if "OPENCL" in latte.config.parallel_strategy: import struct platform_c_maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1 return StringTemplate("((({} + get_global_id(0)) * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1)) >> 16".format(int(random.random() * platform_c_maxint))) return C.Div(node, C.Cast(ctypes.c_float(), C.SymbolRef("RAND_MAX"))) #ANAND: 10/11/2016 Adding following uutility to convert python max to c max if isinstance(node.func, C.SymbolRef): if node.func.name == "max": return C.FunctionCall(C.SymbolRef("MAX"), [node.args[0],node.args[1]]) return node
def transform(self, py_ast, program_config): # Get the initial data input_data = program_config[0] length = np.prod(input_data.size) pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim, input_data.shape) data_type = get_c_type_from_numpy_dtype(input_data.dtype)() scalar_data_type = get_c_type_from_numpy_dtype( np.dtype(input_data.scalar_type))() apply_one = PyBasicConversions().visit(py_ast.body[0]) apply_one.name = 'apply' apply_one.params[0].type = data_type apply_one.params[1].type = scalar_data_type apply_one.return_type = data_type # TODO: figure out which data type to actually preserve # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE) array_add_template = StringTemplate( r""" #pragma omp parallel for for (int i = 0; i < $length; i++) { output[i] = apply(arr[i], scalar); } """, {'length': Constant(length)}) array_op = CFile("generated", [ CppInclude("omp.h"), CppInclude("stdio.h"), apply_one, FunctionDecl(None, FUNC_NAME, params=[ SymbolRef("arr", pointer()), SymbolRef("scalar", scalar_data_type), SymbolRef("output", pointer()) ], defn=[array_add_template]) ], 'omp') return [array_op]
def insert_cast(body, shape, name, dtype, _global=False): shape_str = "".join("[{}]".format(d) for d in shape) body.insert( 0, StringTemplate( "$global$type (* $arg_name)$shape = ($global$type (*)$cast) _$arg_name;", { "arg_name": C.SymbolRef(name), "shape": C.SymbolRef(shape_str), "cast": C.SymbolRef(shape_str), "type": C.SymbolRef( ctree.types.codegen_type( ctree.types.get_c_type_from_numpy_dtype(dtype)())), "global": C.SymbolRef("__global " if _global else "") }))
def _gen_reduce_for_loop(self, loop, var, size): looplen1 = loop.test.right loopincr = loop.incr.value.value return StringTemplate(""" #pragma omp parallel for simd for (int x = 0; x < $size; ++x) { float sum = _$arr[x]; #pragma unroll for (int i = 1; i < $batch_size; ++ i) { sum += _$arr[i * $size + x]; } _$arr[x] = sum; } """, {'size': C.Constant(size), 'batch_size': C.Constant(self.batch_size), 'arr': C.SymbolRef(var), 'node_list': C.SymbolRef("node_list_"), 'reduce_node': C.SymbolRef("reduce_node_"), 'looplen1': C.Constant(looplen1.value), 'loopincr': C.Constant(loopincr) })
def transform(self, py_ast, program_config): # Get the initial data input_data = program_config[0] length = np.prod(input_data.size) pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim, input_data.shape) data_type = get_c_type_from_numpy_dtype(input_data.dtype)() apply_one = PyBasicConversions().visit(py_ast.body[0]) apply_one.name = 'apply' apply_one.params[0].type = data_type apply_one.params[1].type = data_type apply_one.return_type = data_type array_add_template = StringTemplate(r""" #pragma omp parallel for for (int i = 0; i < $length; i++) { output[i] = apply(input1[i], input2[i]); } """, { 'length': Constant(length) }) array_op = CFile("generated", [ CppInclude("omp.h"), CppInclude("stdio.h"), apply_one, FunctionDecl(None, FUNC_NAME, params=[ SymbolRef("input1", pointer()), SymbolRef("input2", pointer()), SymbolRef("output", pointer()) ], defn=[ array_add_template ]) ], 'omp') return [array_op]
def test_dotgen(self): tree = StringTemplate("return $one $two", {"one": Constant(1), "two": Constant(2)}) dot = tree.to_dot()
def test_dotgen(self): tree = StringTemplate("return $one $two", { 'one': Constant(1), 'two': Constant(2), }) dot = tree.to_dot()