def gen_local_macro(self): dim = len(self.output_grid.shape) index = SymbolRef("d%d" % (dim - 1)) for d in reversed(range(dim - 1)): base = Add(get_local_size(dim - 1), Constant(2 * self.ghost_depth[dim - 1])) for s in range(d + 1, dim - 1): base = Mul( base, Add(get_local_size(s), Constant(2 * self.ghost_depth[s])) ) index = Add( index, Mul(base, SymbolRef("d%d" % d)) ) index._force_parentheses = True index.right.right._force_parentheses = True return index
def local_array_macro(self, point): dim = len(self.output_grid.shape) index = get_local_id(dim) for d in reversed(range(dim)): index = Add( Mul( index, Add( get_local_size(d), Constant(2 * self.ghost_depth[d]) ), ), point[d] ) return FunctionCall(SymbolRef("local_array_macro"), point)
def gen_decls(dim, ghost_depth): thread_id = get_local_id(dim - 1) num_threads = get_local_size(dim - 1) block_size = Add( get_local_size(dim - 1), Constant(ghost_depth[dim - 1] * 2) ) for d in reversed(range(0, dim - 1)): base = get_local_size(dim - 1) for s in range(d, dim - 2): base = Mul(get_local_size(s + 1), base) thread_id = Add( Mul(get_local_id(d), base), thread_id ) num_threads = Mul(get_local_size(d), num_threads) block_size = Mul( Add(get_local_size(d), Constant(ghost_depth[d] * 2)), block_size ) return thread_id, num_threads, block_size
def load_shared_memory_block(self, target, ghost_depth): dim = len(self.output_grid.shape) body = [] thread_id, num_threads, block_size = gen_decls(dim, ghost_depth) body.extend([Assign(SymbolRef("thread_id", ct.c_int()), thread_id), Assign(SymbolRef("block_size", ct.c_int()), block_size), Assign(SymbolRef("num_threads", ct.c_int()), num_threads) ]) base = None for i in reversed(range(0, dim - 1)): if base is not None: base = Mul(Add(get_local_size(i + 1), Constant(self.ghost_depth[i + 1] * 2)), base) else: base = Add(get_local_size(i + 1), Constant(self.ghost_depth[i + 1] * 2)) if base is not None: local_indices = [ Assign( SymbolRef("local_id%d" % (dim - 1), ct.c_int()), Div(SymbolRef('tid'), base) ), Assign( SymbolRef("r_%d" % (dim - 1), ct.c_int()), Mod(SymbolRef('tid'), base) ) ] else: local_indices = [ Assign( SymbolRef("local_id%d" % (dim - 1), ct.c_int()), SymbolRef('tid') ), Assign( SymbolRef("r_%d" % (dim - 1), ct.c_int()), SymbolRef('tid') ) ] for d in reversed(range(0, dim - 1)): base = None for i in reversed(range(d + 1, dim)): if base is not None: base = Mul( Add(get_local_size(i), ghost_depth[i] * 2), base ) else: base = Add(get_local_size(i), Constant(ghost_depth[i] * 2)) if base is not None and d != 0: local_indices.append( Assign( SymbolRef("local_id%d" % d, ct.c_int()), Div(SymbolRef('r_%d' % (d + 1)), base) ) ) local_indices.append( Assign( SymbolRef("r_%d" % d, ct.c_int()), Mod(SymbolRef('r_%d' % (d + 1)), base) ) ) else: local_indices.append( Assign( SymbolRef("local_id%d" % d, ct.c_int()), SymbolRef('r_%d' % (d + 1)) ) ) body.append( For( Assign(SymbolRef('tid', ct.c_int()), SymbolRef('thread_id')), Lt(SymbolRef('tid'), SymbolRef('block_size')), AddAssign(SymbolRef('tid'), SymbolRef('num_threads')), local_indices + [Assign( ArrayRef( target, SymbolRef('tid') ), ArrayRef( SymbolRef(self.input_names[0]), self.global_array_macro( [FunctionCall( SymbolRef('clamp'), [Cast(ct.c_int(), Sub(Add( SymbolRef("local_id%d" % (dim - d - 1)), Mul(FunctionCall( SymbolRef('get_group_id'), [Constant(d)]), get_local_size(d)) ), Constant(self.kernel.ghost_depth[d]))), Constant(0), Constant( self.arg_cfg[0].shape[d]-1 ) ] ) for d in range(0, dim)] ) ) )] ) ) return body