def _generate_kernel_scatter(self):
    """Build the post-kernel scatter code and store it as ``LIB_KERNEL_SCATTER``.

    One ``DSLStrideScatter`` is emitted for every entry of ``self._dat_dict``
    whose object is a ``data.ParticleDat`` accessed with a writing mode.
    These scatters, followed by an increment of the ``_shpx`` counter, form
    the body handed to the ``CELL_LIST_ITER`` component together with the
    ``_sgpx`` symbol and the ``LIB_CELL_INDEX_0`` component.

    The stored module consists of a header comment, the declaration of
    ``_shpx`` as an ``INT64`` initialised to ``0`` and the iteration.
    """
    components = self._components
    cell_index = components['LIB_CELL_INDEX_0']
    gathered_sym = '_sgpx'
    counter_sym = '_shpx'

    scatter_body = []
    for symbol, access in self._dat_dict.items():
        obj = access[0]
        mode = access[1]
        # Only ParticleDats that the kernel writes to are scattered.
        if issubclass(type(obj), data.ParticleDat) and mode.write:
            partition_sym = components['PARTICLE_DAT_PARTITION'].idict[symbol]
            scatter_body.append(
                DSLStrideScatter(partition_sym, symbol, obj.ncomp,
                                 counter_sym, gathered_sym,
                                 components['CCC_MAX']))

    # A single increment of the counter closes the iteration body.
    scatter_body.append(cgen.Line(counter_sym + '++;'))

    iteration = components['CELL_LIST_ITER'](
        gathered_sym, cell_index, cgen.Module(scatter_body))

    components['LIB_KERNEL_SCATTER'] = cgen.Module([
        cgen.Comment('#### Post kernel scatter ####'),
        cgen.Initializer(cgen.Value('INT64', counter_sym), '0'),
        iteration,
    ])
def _dump_storage(self, iet, storage):
    """Return ``iet`` with the allocations and frees in ``storage`` injected.

    ``storage`` maps nodes to their storage description.  A key with
    ``is_Expression`` set is mapped straight to its value.  Any other key is
    rebuilt with its body wrapped in a ``List`` whose header holds the
    allocations and whose footer holds the frees.  The ``pallocs`` and
    ``pfrees`` entries are each wrapped in a region created through
    ``self.lang.Region._make_header`` that first initialises the thread id
    from ``self.lang['thread-num']``.

    The substitutions are applied with a nested ``Transformer``.
    """
    def parallel_regions(entries):
        # One region per thread-id object: header + block that initialises
        # the thread id and then runs the grouped statements.
        regions = []
        grouped = as_mapper(entries, itemgetter(0), itemgetter(1))
        for tid, body in grouped.items():
            header = self.lang.Region._make_header(tid.symbolic_size)
            init = c.Initializer(c.Value(tid._C_typedata, tid.name),
                                 self.lang['thread-num'])
            regions.append(c.Module((header, c.Block([init] + body))))
        return regions

    mapper = {}
    for node, site in storage.items():
        # Expr -> LocalExpr ?
        if node.is_Expression:
            mapper[node] = site
            continue

        # Plain allocations first, then the per-thread ones.
        allocs = flatten(site.allocs)
        allocs.extend(parallel_regions(site.pallocs))
        if allocs:
            # Blank line separating the allocations from the body.
            allocs.append(c.Line())

        # Per-thread frees first, then the plain ones.
        frees = parallel_regions(site.pfrees)
        frees.extend(flatten(site.frees))
        if frees:
            # Blank line separating the body from the frees.
            frees.insert(0, c.Line())

        wrapped = List(header=allocs, body=node.body, footer=frees)
        mapper[node] = node._rebuild(body=wrapped, **node.args_frozen)

    return Transformer(mapper, nested=True).visit(iet)
def _generate_lib_inner_loop_block(self):
    """Build the pre-kernel "j gather" code and store it as ``J_GATHER``.

    One ``DSLStrideGather`` is emitted for every ``data.ParticleDat`` in
    ``self._dat_dict`` (regardless of access mode), using the symbol found in
    the ``jdict`` of the ``PARTICLE_DAT_PARTITION`` component.  The gathers,
    followed by an increment of the ``CCC_1`` counter, form the body passed
    to the ``CELL_LIST_ITER`` component together with the ``_tmp_jgpx``
    symbol and the ``LIB_CELL_INDEX_1`` component.

    The stored module consists of a header comment, the declaration of the
    ``CCC_1`` counter as an ``INT64`` initialised to ``0`` and the iteration.
    """
    components = self._components
    cell_index = components['LIB_CELL_INDEX_1']
    source_sym = '_tmp_jgpx'
    counter_sym = components['CCC_1']

    gather_body = []
    for symbol, access in self._dat_dict.items():
        obj = access[0]
        if isinstance(obj, data.ParticleDat):
            partition_sym = components['PARTICLE_DAT_PARTITION'].jdict[symbol]
            gather_body.append(
                DSLStrideGather(symbol, partition_sym, obj.ncomp,
                                source_sym, counter_sym,
                                components['CCC_MAX']))

    # A single increment of the counter closes the iteration body.
    gather_body.append(cgen.Line(counter_sym + '++;'))

    iteration = components['CELL_LIST_ITER'](
        source_sym, cell_index, cgen.Module(gather_body))

    components['J_GATHER'] = cgen.Module([
        cgen.Comment('#### Pre kernel j gather ####'),
        cgen.Initializer(cgen.Value('INT64', counter_sym), '0'),
        iteration,
    ])
def visit_Iteration(self, o):
    """Lower the Iteration node ``o`` to a ``c.For`` loop.

    The loop runs over ``o.index`` between ``o.limits[0]`` and
    ``o.limits[1]`` (both inclusive) with step ``o.limits[2]``; when
    ``o.direction`` is ``Backward`` it starts from the upper limit and counts
    down.  Each entry of ``o.uindices`` adds an initialisation to the loop
    header and an update to the loop increment: a plain assignment when the
    entry has ``is_Modulo`` set, ``+=`` otherwise.

    Returns the ``c.For``, or a ``c.Module`` of ``o.pragmas`` followed by the
    loop when pragmas are present.
    """
    body = flatten(self._visit(child) for child in o.children)

    lower, upper = o.limits[0], o.limits[1]

    # A backward loop walks from the upper bound down to the lower one.
    if o.direction == Backward:
        first, last, compare, step = upper, lower, '>=', '-='
    else:
        first, last, compare, step = lower, upper, '<=', '+='
    loop_init = 'int %s = %s' % (o.index, ccode(first))
    loop_cond = '%s %s %s' % (o.index, compare, ccode(last))
    loop_inc = '%s %s %s' % (o.index, step, o.limits[2])

    # Unbounded indices share the loop header with the main index.
    if o.uindices:
        uinit = ['%s = %s' % (u.name, ccode(u.symbolic_min))
                 for u in o.uindices]
        loop_init = c.Line(', '.join([loop_init] + uinit))

        ustep = ['%s %s %s' % (u.name,
                               '=' if u.is_Modulo else '+=',
                               ccode(u.symbolic_incr))
                 for u in o.uindices]
        loop_inc = c.Line(', '.join([loop_inc] + ustep))

    loop = c.For(loop_init, loop_cond, loop_inc, c.Block(body))

    if not o.pragmas:
        return loop
    # Pragmas go right in front of the loop they decorate.
    return c.Module(o.pragmas + (loop,))
def _generate_kernel_call(self):
    """Build the kernel invocation code and store it as ``LIB_KERNEL_CALL``.

    The call arguments are the names of the kernel's static arguments (if
    any) followed by the ``kernel_arg`` of each ``PARTICLE_DAT_C`` component
    entry, in ``self._dat_dict`` order.  For every dat the entry's
    ``kernel_create_j_arg`` is appended to the call module, while its
    ``kernel_create_i_arg`` and ``kernel_create_i_scatter`` are accumulated
    onto the ``KERNEL_GATHER`` and ``KERNEL_SCATTER`` components with ``+=``.

    The emitted call has the form ``k_<kernel name>(<arg>,<arg>,...);``.
    """
    kernel_call = cgen.Module(
        [cgen.Comment('#### Kernel call arguments ####')])
    call_args = []

    static_args = self._kernel.static_args
    if static_args is not None:
        call_args.extend(name for name, _ in static_args.items())

    for symbol, _ in self._dat_dict.items():
        dat_code = self._components['PARTICLE_DAT_C'][symbol]
        call_args.append(dat_code.kernel_arg)
        kernel_call.append(dat_code.kernel_create_j_arg)
        # The i-side gather/scatter snippets are collected on shared
        # components rather than emitted next to the call.
        self._components['KERNEL_GATHER'] += dat_code.kernel_create_i_arg
        self._components['KERNEL_SCATTER'] += dat_code.kernel_create_i_scatter

    kernel_call.append(cgen.Comment('#### Kernel call ####'))
    kernel_call.append(
        cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
def visit_Operator(self, o):
    """Lower the Operator node ``o`` to a ``c.Module`` holding a whole file.

    The module is laid out as: the lines from ``o._headers``, the
    non-system includes from ``o._includes``, the type declarations of the
    parameters (``_C_typedecl``, filtered and sorted by ``tpname``; plus an
    ``extern "C"`` declaration of the kernel when the compiler's source
    extension is ``'cpp'``), the signatures of the local elemental
    functions, the kernel itself (ending in ``return 0``) and finally the
    bodies of the local elemental functions.
    """
    blankline = c.Line("")

    # Kernel signature and body
    body = flatten(self._visit(child) for child in o.children)
    decls = self._args_decl(o.parameters)
    signature = c.FunctionDeclaration(c.Value(o.retval, o.name), decls)
    kernel = c.FunctionBody(signature,
                            c.Block(body + [c.Statement("return 0")]))

    # Elemental functions: only those flagged as local are emitted here.
    esigns = []
    efuncs = [blankline]
    for entry in o._func_table.values():
        if not entry.local:
            continue
        root = entry.root
        esigns.append(
            c.FunctionDeclaration(c.Value(root.retval, root.name),
                                  self._args_decl(root.parameters)))
        efuncs.extend([root.ccode, blankline])

    # Header lines and include directives
    header = [c.Line(line) for line in o._headers]
    includes = [c.Include(path, system=False) for path in o._includes]
    includes.append(blankline)

    # Type declarations required by the parameters
    cdefs = [p._C_typedecl for p in o.parameters
             if p._C_typedecl is not None]
    cdefs = filter_sorted(cdefs, key=lambda decl: decl.tpname)
    if o._compiler.src_ext == 'cpp':
        cdefs += [c.Extern('C', signature)]
    # Interleave a blank line after every declaration.
    spaced_cdefs = []
    for decl in cdefs:
        spaced_cdefs.extend((decl, blankline))

    return c.Module(header + includes + spaced_cdefs + esigns +
                    [blankline, kernel] + efuncs)
def visit_AugmentedExpression(self, o):
    """Lower an augmented assignment ``o`` to ``<lhs> <op>= <rhs>``.

    Both sides of ``o.expr`` are printed with ``ccode`` using ``o.dtype``.
    Returns the ``c.Statement``, or a ``c.Module`` of ``o.pragmas`` followed
    by the statement when pragmas are present.
    """
    lhs = ccode(o.expr.lhs, dtype=o.dtype)
    rhs = ccode(o.expr.rhs, dtype=o.dtype)
    statement = c.Statement("%s %s= %s" % (lhs, o.op, rhs))

    if not o.pragmas:
        return statement
    return c.Module(list(o.pragmas) + [statement])
def get_cpp_pre_loop_code_ast(self):
    """
    Return the code to place before the loop.

    The returned ``cgen.Module`` holds one line that declares
    ``_loop_timer_t0`` and assigns it the current
    ``std::chrono::high_resolution_clock`` time point.
    """
    declaration = 'std::chrono::high_resolution_clock::time_point _loop_timer_t0 ='
    value = ' std::chrono::high_resolution_clock::now(); \n'
    return cgen.Module([cgen.Line(declaration + value)])
def _generate_lib_outer_loop(self):
    """Build the outer loop and store it as ``LIB_OUTER_LOOP``.

    The generated code sets the OpenMP thread count to ``_NUM_THREADS`` and
    opens an ``omp parallel default(none)`` region that shares the symbols
    listed in the ``OMP_SHARED_SYMS`` component.  Inside the region,
    ``_thread_start`` and ``_thread_end`` are declared and filled in by a
    call to ``get_thread_decomp`` with ``_N_LOCAL``; a ``for`` loop over the
    ``LIB_PAIR_INDEX_0`` index then runs the ``LIB_KERNEL_CALL`` component
    from ``_thread_start`` up to (excluding) ``_thread_end``.
    """
    kernel_block = cgen.Block([self._components['LIB_KERNEL_CALL']])
    index = self._components['LIB_PAIR_INDEX_0']

    shared = ','.join(self._components['OMP_SHARED_SYMS'])
    pragma = cgen.Pragma(
        'omp parallel default(none) shared(' + shared + ')')

    parallel_region = cgen.Block((
        cgen.Value('int', '_thread_start'),
        cgen.Value('int', '_thread_end'),
        cgen.Line(
            'get_thread_decomp((int)_N_LOCAL, &_thread_start, &_thread_end);'
        ),
        cgen.For('int ' + index + '= _thread_start',
                 index + '< _thread_end',
                 index + '++',
                 kernel_block),
    ))

    self._components['LIB_OUTER_LOOP'] = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        pragma,
        parallel_region,
    ])
def gen_op(op_name, attrs, inputs, outputs, output_shapes, kernel_class_name, kernel_header):
    """Assemble the source module that registers an op and its kernel.

    The module contains, in order: non-system includes of three TensorFlow
    framework headers and of ``kernel_header``; ``using namespace
    tensorflow``; the registration macro built by ``gen_reg_op_macro_str``
    from the op description and the shape function obtained from
    ``gen_shape_fn(output_shapes)``; the items returned by
    ``gen_op_kernel_class_defn(kernel_class_name)``; and the kernel build
    macro from ``gen_kernel_build_macro``.

    Returns:
        A ``c.Module`` with the contents described above.
    """
    header_paths = (
        "tensorflow/core/framework/op.h",
        "tensorflow/core/framework/shape_inference.h",
        "tensorflow/core/framework/op_kernel.h",
        kernel_header,
    )
    contents = [c.Include(path, system=False) for path in header_paths]

    # Namespace declaration, framed by blank lines
    contents += [c.Line(), c.Statement("using namespace tensorflow"), c.Line()]

    # Registration macro
    shape_fn = gen_shape_fn(output_shapes)
    reg_macro = gen_reg_op_macro_str(op_name, attrs, inputs, outputs, shape_fn)
    contents += [c.Statement(reg_macro), c.Line()]

    # Kernel class definition
    contents.extend(gen_op_kernel_class_defn(kernel_class_name))
    contents.append(c.Line())

    # Kernel build macro
    kernel_build_macro = gen_kernel_build_macro(op_name, kernel_class_name)
    contents.append(c.Statement(kernel_build_macro))

    return c.Module(contents)
def visit_Operator(self, o, mode='all'):
    """Lower the Operator node ``o`` to a ``c.Module`` holding a whole file.

    The module is laid out as: one ``#define`` per entry of ``o._headers``,
    the includes from ``self._operator_includes``, the type declarations
    from ``self._operator_typedecls(o, mode)``, the signatures of the local
    elemental functions, the kernel (ending in ``return 0``) and finally the
    visited local elemental functions.

    ``mode`` is forwarded to ``_operator_typedecls``.  When it is ``'all'``
    or ``'public'`` and the compiler's source extension is ``'cpp'`` or
    ``'cu'``, an ``extern "C"`` declaration of the kernel is added to the
    type declarations.
    """
    # Kernel signature and body
    body = flatten(self._visit(child) for child in o.children)
    decls = self._args_decl(o.parameters)
    signature = c.FunctionDeclaration(c.Value(o.retval, o.name), decls)
    epilogue = [c.Line(), c.Statement("return 0")]
    kernel = c.FunctionBody(signature, c.Block(body + epilogue))

    # Elemental functions: only those flagged as local are emitted here.
    esigns = []
    efuncs = [blankline]
    for entry in o._func_table.values():
        if not entry.local:
            continue
        root = entry.root
        # The return type is preceded by the function's prefix qualifiers.
        qualified_retval = ' '.join(root.prefix + (root.retval, ))
        esigns.append(
            c.FunctionDeclaration(c.Value(qualified_retval, root.name),
                                  self._args_decl(root.parameters)))
        efuncs.extend([self._visit(root), blankline])

    # Definitions
    headers = [c.Define(*definition) for definition in o._headers]
    headers = headers + [blankline]

    # Header files
    includes = self._operator_includes(o) + [blankline]

    # Type declarations
    typedecls = self._operator_typedecls(o, mode)
    needs_extern = (mode in ('all', 'public')
                    and o._compiler.src_ext in ('cpp', 'cu'))
    if needs_extern:
        typedecls.append(c.Extern('C', signature))
    # Interleave a blank line after every declaration.
    spaced_typedecls = []
    for decl in typedecls:
        spaced_typedecls.extend((decl, blankline))

    return c.Module(headers + includes + spaced_typedecls + esigns +
                    [blankline, kernel] + efuncs)
def _generate_lib_outer_loop(self):
    """Build the outer loop and store it as ``LIB_OUTER_LOOP``.

    The loop runs the ``LIB_PAIR_INDEX_0`` index from ``0`` up to (excluding)
    ``_N_LOCAL`` and executes, in order, the ``LIB_KERNEL_GATHER``,
    ``LIB_INNER_LOOP`` and ``LIB_KERNEL_SCATTER`` components.  It is preceded
    by a call setting the OpenMP thread count to ``_NUM_THREADS`` and by an
    ``omp parallel for schedule(static)`` pragma.

    The ``default(shared) shared(...)`` clause is placed after ``//`` in the
    pragma text.  When ``runtime.OMP_NUM_THREADS`` is ``None`` the whole
    pragma is emitted as a comment instead.
    """
    loop_body = cgen.Block([
        self._components['LIB_KERNEL_GATHER'],
        self._components['LIB_INNER_LOOP'],
        self._components['LIB_KERNEL_SCATTER'],
    ])
    index = self._components['LIB_PAIR_INDEX_0']

    shared = ','.join(self._components['OMP_SHARED_SYMS'])
    pragma = cgen.Pragma(
        'omp parallel for schedule(static) // default(shared) shared('
        + shared + ')')
    if runtime.OMP_NUM_THREADS is None:
        # No thread count configured: keep the pragma visible but inert.
        pragma = cgen.Comment(pragma)

    self._components['LIB_OUTER_LOOP'] = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        pragma,
        cgen.For('int ' + index + '=0', index + '<_N_LOCAL', index + '++',
                 loop_body),
    ])
def _generate_kernel_scatter(self):
    """Build the post-kernel scatter code and store it as ``LIB_KERNEL_SCATTER``.

    For every ``host.Matrix`` in ``self._dat_dict`` that is accessed with a
    writing mode and has at most ``self._gather_size_limit`` components, a
    ``for`` loop over ``_tx`` is emitted that copies ``<symbol>i[_tx]`` into
    ``<symbol>[<ncomp>*<pair index>+_tx]``, where the pair index is the
    ``LIB_PAIR_INDEX_0`` component.  ``host._Array`` dats and the kernel's
    static arguments produce no code.
    """
    kernel_scatter = cgen.Module(
        [cgen.Comment('#### Post kernel scatter ####')])

    for symbol, access in self._dat_dict.items():
        obj = access[0]
        if isinstance(obj, host._Array):
            continue
        if not (isinstance(obj, host.Matrix)
                and access[1].write
                and obj.ncomp <= self._gather_size_limit):
            continue

        ncomp = str(obj.ncomp)
        pair_index = self._components['LIB_PAIR_INDEX_0']
        copy_back = cgen.Assign(
            symbol + '[' + ncomp + '*' + pair_index + '+_tx]',
            symbol + 'i' + '[_tx]')
        kernel_scatter.append(
            cgen.For('int _tx=0', '_tx<' + ncomp, '_tx++',
                     cgen.Block([copy_back])))

    self._components['LIB_KERNEL_SCATTER'] = kernel_scatter
def generate(self):
    """Return the module source as a single ``cgen.Module``.

    The module contains ``self.preamble``, one blank line and then
    ``self.body``.  Nothing is yielded; the caller receives the assembled
    ``cgen.Module`` object.
    """
    contents = list(self.preamble + [cgen.Line()] + self.body)
    return cgen.Module(contents)
def _generate_kernel_arg_decls(self):
    """Build the argument declarations for the kernel and the library function.

    Three components are stored:

    * ``KERNEL_ARG_DECLS``: one ``const`` value per static kernel argument,
      then one declaration per dat.  A ``host._Array`` becomes a restrict
      pointer (``const`` unless written); a ``host.Matrix`` is passed by
      value as a generated struct type ``_<symbol>_t``.
    * ``KERNEL_LIB_ARG_DECLS``: one restrict pointer per dat (``const``
      unless written), whatever its type.
    * ``KERNEL_STRUCT_TYPEDEFS``: the typedefs of the structs generated for
      ``host.Matrix`` dats; each struct holds two restrict pointers named
      ``i`` and ``j`` (``const`` unless written).

    Raises:
        AssertionError: if a ``self._dat_dict`` value is not a tuple.
    """
    def restrict_pointer(ctype, name):
        # Pointer to ``ctype`` qualified with the compiler's restrict keyword.
        return cgen.Pointer(
            cgen.Value(ctype, Restrict(self._cc.restrict_keyword, name)))

    kernel_args = []
    lib_args = []
    struct_typedefs = cgen.Module(
        [cgen.Comment('#### Structs generated per ParticleDat ####')])

    static_args = self._kernel.static_args
    if static_args is not None:
        for name, arg_type in static_args.items():
            kernel_args.append(
                cgen.Const(cgen.Value(host.ctypes_map[arg_type], name)))

    for symbol, access in self._dat_dict.items():
        assert type(access) is tuple, "Access descriptors not found"
        obj = access[0]

        lib_arg = restrict_pointer(host.ctypes_map[obj.dtype], symbol)
        mode = access[1]

        if issubclass(type(obj), host._Array):
            kernel_arg = restrict_pointer(host.ctypes_map[obj.dtype], symbol)
            if not mode.write:
                kernel_arg = cgen.Const(kernel_arg)
            kernel_args.append(kernel_arg)

        elif issubclass(type(obj), host.Matrix):
            # Struct type carrying the i and j pointers of this dat.
            dtype = obj.dtype
            members = [restrict_pointer(ctypes_map(dtype), 'i'),
                       restrict_pointer(ctypes_map(dtype), 'j')]
            if not mode.write:
                members = [cgen.Const(member) for member in members]
            typename = '_' + symbol + '_t'
            struct_typedefs.append(
                cgen.Typedef(cgen.Struct('', members, typename)))

            # The kernel receives the struct by value.
            kernel_args.append(cgen.Value(typename, symbol))

        if not mode.write:
            lib_arg = cgen.Const(lib_arg)
        lib_args.append(lib_arg)

    self._components['KERNEL_ARG_DECLS'] = kernel_args
    self._components['KERNEL_LIB_ARG_DECLS'] = lib_args
    self._components['KERNEL_STRUCT_TYPEDEFS'] = struct_typedefs
def _generate_lib_src(self):
    """Assemble the library source and store it as ``LIB_SRC``.

    The module holds the ``KERNEL_STRUCT_TYPEDEFS`` component, then the
    ``KERNEL_FUNC`` and ``LIB_FUNC`` components, each of the latter two
    introduced by a descriptive comment.
    """
    parts = self._components
    source = [
        parts['KERNEL_STRUCT_TYPEDEFS'],
        cgen.Comment('#### Kernel function ####'),
        parts['KERNEL_FUNC'],
        cgen.Comment('#### Library function ####'),
        parts['LIB_FUNC'],
    ]
    parts['LIB_SRC'] = cgen.Module(source)
def _generate_lib_inner_loop(self):
    """Build the inner loop and store it as ``LIB_INNER_LOOP``.

    Two consecutive ``for`` loops over the ``LIB_PAIR_INDEX_1`` index run the
    same ``LIB_INNER_LOOP_BLOCK`` component: the first from ``0`` up to
    (excluding) the ``LIB_PAIR_INDEX_0`` index, the second from one past that
    index up to (excluding) ``_N_LOCAL``.  The value of the outer index
    itself is therefore skipped.
    """
    outer = self._components['LIB_PAIR_INDEX_0']
    inner = self._components['LIB_PAIR_INDEX_1']
    loop_block = self._components['LIB_INNER_LOOP_BLOCK']

    below_outer = cgen.For('int ' + inner + '=0',
                           inner + '<' + outer,
                           inner + '++',
                           loop_block)
    above_outer = cgen.For('int ' + inner + '=1+' + outer,
                           inner + '< _N_LOCAL',
                           inner + '++',
                           loop_block)

    self._components['LIB_INNER_LOOP'] = cgen.Module([below_outer,
                                                      above_outer])
def _generate_kernel_headers(self):
    """Collect the kernel headers and store them as ``KERNEL_HEADERS``.

    The module holds the ``ast`` of every header in ``self._kernel.headers``
    followed by the headers required by ``self.loop_timer``.
    ``self._kernel.headers`` may be ``None``, an iterable of header objects
    or a single header object; the last form is detected by the absence of
    ``__iter__``.
    """
    header_nodes = []

    kernel_headers = self._kernel.headers
    if kernel_headers is not None:
        if hasattr(kernel_headers, "__iter__"):
            header_nodes.extend(header.ast for header in kernel_headers)
        else:
            # A lone header object rather than a collection of them.
            header_nodes.append(kernel_headers.ast)

    header_nodes.append(self.loop_timer.get_cpp_headers_ast())
    self._components['KERNEL_HEADERS'] = cgen.Module(header_nodes)
def generate(self):
    """Return a ``c.Module`` made of ``DGEMV_SRC`` and the per-position code.

    ``generate_optimmat_code`` is called once for every position in
    ``range(self._sites)``, in increasing order, and its results follow
    ``DGEMV_SRC`` in the module.

    :returns: the assembled ``c.Module``
    """
    pieces = [DGEMV_SRC]
    for pos in range(self._sites):
        pieces.append(self.generate_optimmat_code(pos))
    return c.Module(pieces)
def visit_Section(self, o):
    """Lower the Section node ``o`` to a ``c.Module`` of its visited children.

    Unless ``o.is_subsection`` is set, the children are framed by
    ``Begin <name>`` and ``End <name>`` comments.
    """
    body = flatten(self._visit(child) for child in o.children)

    header, footer = [], []
    if not o.is_subsection:
        header.append(c.Comment("Begin %s" % o.name))
        footer.append(c.Comment("End %s" % o.name))

    return c.Module(header + body + footer)
def get_cpp_post_loop_code_ast(self):
    """
    Return the code to place after the loop.

    The returned ``cgen.Module`` holds a single ``cgen.Line`` with three
    statements: take the ``_loop_timer_t1`` time point, compute the duration
    ``_loop_timer_res`` since ``_loop_timer_t0``, and add its count to
    ``*_loop_timer_return``.
    """
    statements = (
        'std::chrono::high_resolution_clock::time_point _loop_timer_t1 ='
        ' std::chrono::high_resolution_clock::now(); \n',
        ' std::chrono::duration<double> _loop_timer_res = _loop_timer_t1'
        ' - _loop_timer_t0; \n',
        '*_loop_timer_return += (double) _loop_timer_res.count(); \n',
    )
    return cgen.Module([cgen.Line(''.join(statements))])
def _generate_kernel_headers(self):
    """Collect the kernel headers and store them as ``KERNEL_HEADERS``.

    The module starts with a non-system include of ``cuda_generic.h``,
    continues with the ``ast`` of every header in ``self._kernel.headers``
    and ends with the headers required by ``self.loop_timer``.
    ``self._kernel.headers`` may be ``None``, an iterable of header objects
    or a single header object; the last form is detected by the absence of
    ``__iter__``.
    """
    header_nodes = [cgen.Include('cuda_generic.h', system=False)]

    kernel_headers = self._kernel.headers
    if kernel_headers is not None:
        if hasattr(kernel_headers, "__iter__"):
            header_nodes.extend(header.ast for header in kernel_headers)
        else:
            # A lone header object rather than a collection of them.
            header_nodes.append(kernel_headers.ast)

    header_nodes.append(self.loop_timer.get_cpp_headers_ast())
    self._components['KERNEL_HEADERS'] = cgen.Module(header_nodes)
def _generate_map_macros(self):
    """Build the per-dat access macros and store them as ``KERNEL_MAP_MACROS``.

    For a dat whose type is exactly ``cuda_data.GlobalArray`` or a subclass
    of ``cuda_base.Array``, ``<symbol>(x)`` is defined as ``(<symbol>[(x)])``.
    For a subclass of ``cuda_base.Matrix``, ``<symbol>(y)`` is defined as
    ``<symbol>.i[(y)]``.  The two checks are independent of each other.
    """
    macros = cgen.Module([cgen.Comment('#### KERNEL_MAP_MACROS ####')])

    for symbol, access in self._dat_dict.items():
        obj_type = type(access[0])

        is_array = (obj_type is cuda_data.GlobalArray
                    or issubclass(obj_type, cuda_base.Array))
        if is_array:
            macros.append(
                cgen.Define(symbol + '(x)', '(' + symbol + '[(x)])'))

        if issubclass(obj_type, cuda_base.Matrix):
            macros.append(
                cgen.Define(symbol + '(y)', symbol + '.i[(y)]'))

    self._components['KERNEL_MAP_MACROS'] = macros
def _generate_lib_inner_loop(self):
    """Build the inner loop code; store ``LIB_LOOP_J_PREPARE`` and ``LIB_INNER_LOOP``.

    ``LIB_LOOP_J_PREPARE`` declares ``_icell`` (read from ``_CRL`` at the
    ``LIB_PAIR_INDEX_0`` index), ``_JJSTORE`` (the ``_JSTORE`` entry at the
    ``OMP_THREAD_INDEX_SYM`` index) and a zeroed counter ``_nn``.

    ``LIB_INNER_LOOP`` consists of two loops: the first runs the
    ``LIB_INNER_LOOP_BLOCK`` component for ``_k`` in ``0..26``; the second
    runs ``_k2`` over ``0.._nn-1``, reads the ``LIB_PAIR_INDEX_1`` index from
    ``_JJSTORE[_k2]`` and executes the ``LIB_KERNEL_CALL`` component.
    """
    components = self._components
    i_index = components['LIB_PAIR_INDEX_0']
    j_index = components['LIB_PAIR_INDEX_1']

    prepare_lines = [
        cgen.Line('const int _icell = _CRL[' + i_index + '];'),
        cgen.Line('int * _JJSTORE = _JSTORE['
                  + components['OMP_THREAD_INDEX_SYM'] + '];'),
        cgen.Line('int _nn = 0;'),
    ]
    components['LIB_LOOP_J_PREPARE'] = cgen.Module(prepare_lines)

    offsets_loop = cgen.For('int _k=0', '_k<27', '_k++',
                            components['LIB_INNER_LOOP_BLOCK'])

    stored_body = cgen.Block([
        cgen.Line('const int ' + j_index + ' = _JJSTORE[_k2];'),
        components['LIB_KERNEL_CALL'],
    ])
    stored_loop = cgen.For('int _k2=0', '_k2<_nn', '_k2++', stored_body)

    components['LIB_INNER_LOOP'] = cgen.Module([offsets_loop, stored_loop])
def visit_Iteration(self, o):
    """Lower the Iteration node ``o`` to a ``c.For`` loop.

    The loop bounds are ``o.limits[0]`` and ``o.limits[1]`` (both inclusive),
    each shifted by the matching entry of ``o.offsets`` when that entry is
    non-zero; the step is ``o.limits[2]``.  A shifted bound is converted to a
    string and passed to ``eval``; if that raises ``NameError`` or
    ``TypeError`` the string itself is kept.  When ``o.direction`` is
    ``Backward`` the loop starts from the upper bound and counts down.

    Each entry of ``o.uindices`` adds an initialisation (from
    ``symbolic_start``) to the loop header and a plain assignment (from
    ``symbolic_incr``) to the loop increment.

    Returns the ``c.For``, or a ``c.Module`` of ``o.pragmas`` followed by the
    loop when pragmas are present.
    """
    body = flatten(self.visit(child) for child in o.children)

    # NOTE(review): eval() resolves names against this method's globals and
    # locals, so the local names in scope here (``self``, ``o``, ``body``,
    # ``start``, ``end``) are deliberately left as they were.  eval on a
    # stringified expression is fragile; confirm the inputs are always
    # internally generated before relying on it.

    # Start
    if o.offsets[0] != 0:
        start = str(o.limits[0] + o.offsets[0])
        try:
            start = eval(start)
        except (NameError, TypeError):
            pass
    else:
        start = o.limits[0]

    # Bound
    if o.offsets[1] != 0:
        end = str(o.limits[1] + o.offsets[1])
        try:
            end = eval(end)
        except (NameError, TypeError):
            pass
    else:
        end = o.limits[1]

    # A backward loop walks from the upper bound down to the lower one.
    if o.direction == Backward:
        first, last, compare, step = end, start, '>=', '-='
    else:
        first, last, compare, step = start, end, '<=', '+='
    loop_init = 'int %s = %s' % (o.index, ccode(first))
    loop_cond = '%s %s %s' % (o.index, compare, ccode(last))
    loop_inc = '%s %s %s' % (o.index, step, o.limits[2])

    # Unbounded indices share the loop header with the main index.
    if o.uindices:
        uinit = ['%s = %s' % (u.name, ccode(u.symbolic_start))
                 for u in o.uindices]
        loop_init = c.Line(', '.join([loop_init] + uinit))
        ustep = ['%s = %s' % (u.name, ccode(u.symbolic_incr))
                 for u in o.uindices]
        loop_inc = c.Line(', '.join([loop_inc] + ustep))

    loop = c.For(loop_init, loop_cond, loop_inc, c.Block(body))

    if not o.pragmas:
        return loop
    # Pragmas go right in front of the loop they decorate.
    return c.Module(o.pragmas + (loop, ))
def _generate_kernel_call(self):
    """Build the kernel invocation code and store it as ``LIB_KERNEL_CALL``.

    The module first declares the ``OMP_THREAD_INDEX_SYM`` symbol as a
    ``const int`` initialised from ``omp_get_thread_num()``.  The call
    arguments are the names of the kernel's static arguments (if any)
    followed by one argument per dat:

    * ``host._Array``: the symbol itself, indexed by the thread index symbol
      when the dat is a ``data.GlobalArrayClassic``.
    * ``host.Matrix``: a struct ``<symbol>_c`` of type ``_<symbol>_t``,
      declared in the module and initialised with
      ``<symbol>+<pair index>*<ncomp>``, where the pair index is the
      ``LIB_PAIR_INDEX_0`` component.

    Every handled dat symbol is also appended to the ``OMP_SHARED_SYMS``
    component.  A dat of any other type only prints an error message and is
    otherwise skipped.

    The emitted call has the form ``k_<kernel name>(<arg>,<arg>,...);``.
    """
    thread_index = self._components['OMP_THREAD_INDEX_SYM']
    kernel_call = cgen.Module([
        cgen.Comment('#### Kernel call arguments ####'),
        cgen.Initializer(
            cgen.Const(cgen.Value('int', thread_index)),
            'omp_get_thread_num()'),
    ])
    call_args = []
    shared_syms = self._components['OMP_SHARED_SYMS']

    static_args = self._kernel.static_args
    if static_args is not None:
        call_args.extend(name for name, _ in static_args.items())

    for symbol, access in self._dat_dict.items():
        obj = access[0]

        if isinstance(obj, host._Array):
            arg = symbol
            if isinstance(obj, data.GlobalArrayClassic):
                arg += '[' + self._components['OMP_THREAD_INDEX_SYM'] + ']'
            call_args.append(arg)
            shared_syms.append(symbol)

        elif isinstance(obj, host.Matrix):
            struct_sym = symbol + '_c'
            call_args.append(struct_sym)

            ncomp = str(obj.ncomp)
            shifted = (symbol + '+'
                       + self._components['LIB_PAIR_INDEX_0'] + '*' + ncomp)
            struct_decl = cgen.Value('_' + symbol + '_t', struct_sym)
            kernel_call.append(
                cgen.Initializer(struct_decl, '{ ' + shifted + '}'))
            shared_syms.append(symbol)

        else:
            # Best-effort: unknown types are reported and skipped.
            print("ERROR: Type not known")

    kernel_call.append(cgen.Comment('#### Kernel call ####'))
    kernel_call.append(
        cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
def includes(self):
    """Return a ``cgen.Module`` with the file preamble.

    The preamble is the output of ``includes.copyright()``, a definition of
    ``M_PI``, the output of ``includes.common_include()`` and, depending on
    the ``io``, ``pluto`` and ``profiling`` attributes, the matching
    ``includes.*_include()`` output, in that order.

    The list returned by ``includes.copyright()`` is extended in place.
    """
    statements = includes.copyright()
    statements += [cgen.Define('M_PI', '3.14159265358979323846')]
    statements += includes.common_include()

    # Optional groups, each enabled by its own attribute.
    if self.io:
        statements += includes.io_include()
    if self.pluto:
        statements += includes.pluto_include()
    if self.profiling:
        statements += includes.profiling_include()

    return cgen.Module(statements)
def _generate_kernel_headers(self):
    """Collect the kernel headers and store them as ``KERNEL_HEADERS``.

    Starting from the ``LIB_HEADERS`` component, the ``ast`` of every header
    in ``self._kernel.headers`` is appended (a single non-iterable header
    object is accepted too), followed by the headers required by
    ``self.loop_timer``.

    Note that the appends are made on the ``LIB_HEADERS`` list itself, so
    that component grows as a side effect of this call.
    """
    # Same list object as the LIB_HEADERS component, not a copy.
    header_nodes = self._components['LIB_HEADERS']

    kernel_headers = self._kernel.headers
    if kernel_headers is not None:
        if hasattr(kernel_headers, "__iter__"):
            header_nodes.extend(header.ast for header in kernel_headers)
        else:
            # A lone header object rather than a collection of them.
            header_nodes.append(kernel_headers.ast)

    header_nodes.append(self.loop_timer.get_cpp_headers_ast())
    self._components['KERNEL_HEADERS'] = cgen.Module(header_nodes)
def _generate_lib_inner_loop(self):
    """Build the inner loop and store it as ``LIB_INNER_LOOP``.

    The generated code loops ``_k`` over ``0..26``.  For each ``_k`` it
    computes the j cell index as the i cell index plus ``_OFFSET[_k]``, runs
    the ``J_GATHER`` component and then one of two loop nests:

    * i cell == j cell: for each i below ``CCC_0``, j runs over every value
      below ``CCC_1`` except i itself, and the execution counter grows by
      ``CCC_1 - 1``.
    * otherwise: for each i below ``CCC_0``, j runs over every value below
      ``CCC_1`` and the execution counter grows by ``CCC_1``.

    In both nests the body for a given i is only executed when the entry of
    the ``TMP_INDEX`` partition symbol at i is below ``N_LOCAL``.  That body
    is framed by the ``KERNEL_GATHER`` and ``KERNEL_SCATTER`` components and
    each j iteration runs the ``LIB_KERNEL_CALL`` component.
    """
    comp = self._components
    i = comp['LIB_PAIR_INDEX_0']
    j = comp['LIB_PAIR_INDEX_1']
    ccc_i = comp['CCC_0']
    ccc_j = comp['CCC_1']
    i_cell = comp['LIB_CELL_INDEX_0']
    j_cell = comp['LIB_CELL_INDEX_1']
    n_local = comp['N_LOCAL']
    exec_count = '_' + comp['EXEC_COUNT']
    tmp_index_sym = comp['PARTICLE_DAT_PARTITION'].idict[comp['TMP_INDEX']]
    kernel_gather = comp['KERNEL_GATHER']
    kernel_scatter = comp['KERNEL_SCATTER']

    def j_loop(start, stop):
        # Loop j over [start, stop) running the kernel call.
        return cgen.For('INT64 ' + j + '=' + start,
                        j + '<' + stop,
                        j + '++',
                        cgen.Block(comp['LIB_KERNEL_CALL']))

    def i_loop(j_loops, count):
        # Loop i over [0, ccc_i); the body runs only when the TMP_INDEX
        # entry at i is below N_LOCAL.
        body = cgen.Block(
            (cgen.Line(kernel_gather), )
            + tuple(j_loops)
            + (cgen.Line(kernel_scatter),
               cgen.Line(exec_count + '+=' + count + ';')))
        guarded = cgen.Block(
            (cgen.If(tmp_index_sym + '[' + i + ']<' + n_local, body), ))
        return cgen.Block(
            (cgen.For('INT64 ' + i + '=0', i + '<' + ccc_i, i + '++',
                      guarded), ))

    # Different cells: every (i, j) combination is visited.
    loop_other = i_loop([j_loop('0', ccc_j)], ccc_j)
    # Same cell: j == i is skipped.
    loop_same = i_loop([j_loop('0', i), j_loop('1+' + i, ccc_j)],
                       ccc_j + '-1')

    cell_cond = cgen.If(i_cell + '==' + j_cell, loop_same, loop_other)

    offset_body = cgen.Block((
        cgen.Line('const INT64 {jcell} = {icell} + _OFFSET[_k];'.format(
            jcell=j_cell, icell=i_cell)),
        comp['J_GATHER'],
        cell_cond,
    ))

    comp['LIB_INNER_LOOP'] = cgen.Module([
        cgen.For('int _k=0', '_k<27', '_k++', offset_body),
    ])
def _generate_kernel_gather(self):
    """Build the pre-kernel gather code and store it as ``LIB_KERNEL_GATHER``.

    The module first declares the ``OMP_THREAD_INDEX_SYM`` symbol as a
    ``const int`` initialised from ``omp_get_thread_num()``.  Every dat
    symbol is appended to the ``OMP_SHARED_SYMS`` component, then:

    * ``data.GlobalArrayClassic``: a pointer ``<symbol>_c`` (``const`` unless
      written) is initialised with ``<symbol>[<thread index>]``.
    * ``host.Matrix`` accessed with a writing mode and with at most
      ``self._gather_size_limit`` components: a local array
      ``<symbol>i[<ncomp>]`` is initialised with the values
      ``*(<symbol>+<pair index>*<ncomp>+<k>)`` for ``k`` in ``0..ncomp-1``,
      where the pair index is the ``LIB_PAIR_INDEX_0`` component.

    A matrix with zero components gets the empty initializer ``{}``.
    """
    thread_index = self._components['OMP_THREAD_INDEX_SYM']
    kernel_gather = cgen.Module([
        cgen.Comment('#### Pre kernel gather ####'),
        cgen.Initializer(
            cgen.Const(cgen.Value('int', thread_index)),
            'omp_get_thread_num()'),
    ])
    shared_syms = self._components['OMP_SHARED_SYMS']

    for symbol, access in self._dat_dict.items():
        obj = access[0]
        mode = access[1]
        shared_syms.append(symbol)

        if isinstance(obj, data.GlobalArrayClassic):
            per_thread = (symbol + '['
                          + self._components['OMP_THREAD_INDEX_SYM'] + ']')
            pointer = cgen.Pointer(
                cgen.Value(host.ctypes_map[obj.dtype], symbol + '_c'))
            if not mode.write:
                pointer = cgen.Const(pointer)
            kernel_gather.append(cgen.Initializer(pointer, per_thread))

        elif (isinstance(obj, host.Matrix)
              and mode.write
              and obj.ncomp <= self._gather_size_limit):
            ncomp = obj.ncomp
            ctype = host.ctypes_map[obj.dtype]
            pair_index = self._components['LIB_PAIR_INDEX_0']
            # One dereference per component of the row at the pair index.
            values = ['*(' + symbol + '+' + pair_index
                      + '*' + str(ncomp) + '+' + str(k) + ')'
                      for k in range(ncomp)]
            local_array = cgen.Value(
                ctype, symbol + 'i' + '[' + str(ncomp) + ']')
            kernel_gather.append(
                cgen.Initializer(local_array, '{' + ','.join(values) + '}'))

    self._components['LIB_KERNEL_GATHER'] = kernel_gather