def sync(tensor, offset): if tensor is None: return offset _logger.debug('index of tensor <%s>: %s', tensor.name, tensor.st_idx) stage_offset = soda_util.serialize( tensor.st_idx, self.tile_size) _logger.debug('offset of tensor <%s>: %d', tensor.name, stage_offset) loads = visitor.get_load_dict(tensor) for name in loads: loads[name] = tuple(ref.idx for ref in loads[name]) _logger.debug( 'loads: %s', ', '.join( '%s@%s' % (name, util.lst2str(map(util.idx2str, indices))) for name, indices in loads.items())) for n in loads: loads[n] = soda_util.serialize_iter( loads[n], self.tile_size) for l in loads.values(): l[0], l[-1] = (stage_offset - max(l), stage_offset - min(l)) del l[1:-1] if len(l) == 1: l.append(l[-1]) _logger.debug( 'load offset range in tensor %s: %s', tensor.name, '{%s}' % (', '.join('%s: [%d:%d]' % (n, *v) for n, v in loads.items()))) for parent in tensor.parents.values(): tensor_distance = next( reversed(tensor.ld_offsets[parent.name])) _logger.debug('tensor distance: %s', tensor_distance) _logger.debug( 'want to access tensor <%s> at offset [%d, %d] ' 'to generate tensor <%s> at offset %d', parent.name, offset + loads[parent.name][0], offset + loads[parent.name][-1], tensor.name, offset) tensor_offset = (parent.st_delay + tensor_distance - stage_offset) if offset < tensor_offset: _logger.debug( 'but tensor <%s> won\'t be available until offset %d', parent.name, tensor_offset) offset = tensor_offset _logger.debug( 'need to access tensor <%s> at offset [%d, %d] ' 'to generate tensor <%s> at offset %d', parent.name, offset + loads[parent.name][0], offset + loads[parent.name][-1], tensor.name, offset) return offset
def _get_window_for(self, node): loads = node.get_loads() # [Load, ...] load_names = {l.name for l in loads if l.name not in self.extra_params} windows = { name: sorted({l.idx for l in loads if l.name == name}, key=lambda x: soda_util.serialize(x, self.tile_size)) for name in load_names } _logger.debug( 'window for %s@(%s) is %s' % (node.name, ', '.join(map(str, node.expr[0].idx)), windows)) return windows
def test_deserialize(self): idx = (42, 23, 233) tile_size = (2333, 233, 0) self.assertTupleEqual( idx, tuple(util.deserialize(util.serialize(idx, tile_size), tile_size)))
def ld_offsets(self): return collections.OrderedDict( (name, collections.OrderedDict( (soda_util.serialize(ref.idx, self._tile_size), ref) for ref in refs)) for name, refs in self.ld_refs.items())
def st_offset(self): return soda_util.serialize(self.st_idx, self._tile_size) + self.st_delay
def get_stencil_distance(stencil_window, tile_size): return (max(soda_util.serialize_iter(stencil_window, tile_size)) + soda_util.serialize(get_stencil_window_offset(stencil_window), tile_size))
def chronological_tensors(self): """Computes the offsets of tensors. Returns: A list of Tensor, in chronological order. """ _logger.info('calculate tensor offsets') processing_queue = collections.deque(list(self.input_names)) processed_tensors = set(self.input_names) chronological_tensors = list(map(self.tensors.get, self.input_names)) for tensor in chronological_tensors: _logger.debug('tensor <%s> is at offset %d' % (tensor.name, tensor.st_offset)) _logger.debug('processing queue: %s', processing_queue) _logger.debug('processed_tensors: %s', processed_tensors) while processing_queue: tensor = self.tensors[processing_queue.popleft()] _logger.debug('inspecting tensor %s\'s children' % tensor.name) for child in tensor.children.values(): if ({x.name for x in child.parents.values()} <= processed_tensors and child.name not in processed_tensors): # good, all inputs are processed # can determine offset of current tensor _logger.debug( 'input%s for tensor <%s> (i.e. %s) %s processed', '' if len(child.parents) == 1 else 's', child.name, ', '.join([x.name for x in child.parents.values()]), 'is' if len(child.parents) == 1 else 'are') stage_offset = soda_util.serialize(child.st_idx, self.tile_size) # synchronization check def sync(tensor, offset): if tensor is None: return offset _logger.debug('index of tensor <%s>: %s', tensor.name, tensor.st_idx) stage_offset = soda_util.serialize( tensor.st_idx, self.tile_size) _logger.debug('offset of tensor <%s>: %d', tensor.name, stage_offset) loads = visitor.get_load_dict(tensor) for name in loads: loads[name] = tuple(ref.idx for ref in loads[name]) _logger.debug( 'loads: %s', ', '.join( '%s@%s' % (name, util.lst2str(map(util.idx2str, indices))) for name, indices in loads.items())) for n in loads: loads[n] = soda_util.serialize_iter( loads[n], self.tile_size) for l in loads.values(): l[0], l[-1] = (stage_offset - max(l), stage_offset - min(l)) del l[1:-1] if len(l) == 1: l.append(l[-1]) _logger.debug( 'load offset range in tensor %s: %s', tensor.name, '{%s}' % (', '.join('%s: [%d:%d]' % (n, *v) for n, v in loads.items()))) for parent in tensor.parents.values(): tensor_distance = next( reversed(tensor.ld_offsets[parent.name])) _logger.debug('tensor distance: %s', tensor_distance) _logger.debug( 'want to access tensor <%s> at offset [%d, %d] ' 'to generate tensor <%s> at offset %d', parent.name, offset + loads[parent.name][0], offset + loads[parent.name][-1], tensor.name, offset) tensor_offset = (parent.st_delay + tensor_distance - stage_offset) if offset < tensor_offset: _logger.debug( 'but tensor <%s> won\'t be available until offset %d', parent.name, tensor_offset) offset = tensor_offset _logger.debug( 'need to access tensor <%s> at offset [%d, %d] ' 'to generate tensor <%s> at offset %d', parent.name, offset + loads[parent.name][0], offset + loads[parent.name][-1], tensor.name, offset) return offset _logger.debug( 'intend to generate tensor <%s> at offset %d', child.name, child.st_delay) synced_offset = sync(child, child.st_delay) _logger.debug('synced offset: %s', synced_offset) child.st_delay = synced_offset _logger.debug( 'decide to generate tensor <%s> at offset %d', child.name, child.st_delay) # add delay for sibling in child.parents.values(): delay = child.st_delay - (sibling.st_delay + list( child.ld_offsets[sibling.name].keys())[-1] - stage_offset) if delay > 0: _logger.debug( 'tensor %s arrives at tensor <%s> at offset %d < %d; ' 'add %d delay', sibling.name, child.name, sibling.st_delay + next( reversed(child.ld_offsets[sibling.name])) - stage_offset, child.st_delay, delay) else: _logger.debug( 'tensor %s arrives at tensor <%s> at offset %d = %d; good', sibling.name, child.name, sibling.st_delay + next( reversed(child.ld_offsets[sibling.name])) - stage_offset, child.st_delay) child.ld_delays[sibling.name] = max(delay, 0) _logger.debug('set delay of |%s <- %s| to %d' % (child.name, sibling.name, child.ld_delays[sibling.name])) processing_queue.append(child.name) processed_tensors.add(child.name) chronological_tensors.append(child) else: for parent in tensor.parents.values(): if parent.name not in processed_tensors: _logger.debug( 'tensor %s requires tensor <%s> as an input', tensor.name, parent.name) _logger.debug( 'but tensor <%s> isn\'t processed yet', parent.name) _logger.debug('add %s to scheduling queue', parent.name) processing_queue.append(parent.name) _logger.debug('tensors in insertion order: [%s]', ', '.join(map(str, self.tensors))) _logger.debug('tensors in chronological order: [%s]', ', '.join(t.name for t in chronological_tensors)) for tensor in self.tensors.values(): for name, indices in tensor.ld_indices.items(): _logger.debug( 'stage index: %s@%s <- %s@%s', tensor.name, util.idx2str(tensor.st_idx), name, util.lst2str(util.idx2str(idx) for idx in indices)) for tensor in self.tensors.values(): if tensor.is_input(): continue _logger.debug('stage expr: %s = %s', tensor.st_ref, tensor.expr) for tensor in self.tensors.values(): for name, offsets in tensor.ld_offsets.items(): _logger.debug( 'stage offset: %s@%d <- %s@%s', tensor.name, soda_util.serialize(tensor.st_idx, self.tile_size), name, util.lst2str(offsets)) for tensor in self.tensors.values(): for name, delay in tensor.ld_delays.items(): _logger.debug('stage delay: %s <- %s delayed %d' % (tensor.name, name, delay)) return chronological_tensors
def tensors(self): """Constructs high-level DAG and creates the tensors. Returns: An collections.OrderedDict mapping a tensor's name to the tensor. """ # TODO: check for name conflicts tensor_map = collections.OrderedDict() for stmt in self.input_stmts: tensor = Tensor(stmt, self.tile_size) tensor_map[stmt.name] = tensor def name_in_iter(name, iteration): if name in self.input_names: if iteration > 0: return name + '_iter%d' % iteration return name if name in self.output_names: if iteration < self.iterate - 1: return (self.input_names[self.output_names.index(name)] + '_iter%d' % (iteration + 1)) return name if name in self.local_names: if iteration > 0: return name + '_iter%d' % iteration return name if name in self.param_names: return name raise util.InternalError('unknown name: %s' % name) for iteration in range(self.iterate): _logger.debug('iterate %s', iteration) _logger.debug('map: %s', self.symbol_table) def mutate_name_callback(obj, mutated): if isinstance(obj, ir.Ref): obj.haoda_type = self.symbol_table[obj.name] # pylint: disable=cell-var-from-loop obj.name = name_in_iter(obj.name, iteration) return obj tensors = [] for stmt in itertools.chain(self.local_stmts, self.output_stmts): tensor = Tensor(stmt.visit(mutate_name_callback), self.tile_size) loads = visitor.get_load_tuple(tensor) norm_idx = tuple( min(load.idx[d] for load in loads if load.name not in self.param_names) for d in range(self.dim)) if any(norm_idx): _logger.debug('normalize index of %s: (%s)', tensor.name, ', '.join(map(str, norm_idx))) mutator.shift(tensor, norm_idx, excluded=self.param_names) tensor_map[tensor.name] = tensor tensors.append(tensor) for tensor in tensors: _logger.debug('%s', tensor) for tensor in tensors: tensor.propagate_type() loads = visitor.get_load_dict(tensor) for parent_name, ld_refs in loads.items(): ld_refs = sorted(ld_refs, key=lambda ref: soda_util.serialize( ref.idx, self.tile_size)) parent_tensor = tensor_map[parent_name] parent_tensor.children[tensor.name] = tensor tensor.parents[parent_name] = parent_tensor tensor.ld_refs[parent_name] = ld_refs # high-level DAG construction finished for tensor in tensor_map.values(): if tensor.name in self.input_names: _logger.debug('<input tensor>: %s', tensor) elif tensor.name in self.output_names: _logger.debug('<output tensor>: %s', tensor) else: _logger.debug('<local tensor>: %s', tensor) return tensor_map
def print_func(printer: util.CppPrinter, stencil: soda.core.Stencil): stmts = stencil.input_stmts + stencil.output_stmts # factories for meta variables data_fmt = util.MetaFmt('var_%s_ptr') extent_fmt = util.MetaFmt('var_%s_extent') stride_fmt = util.MetaFmt('var_%s_stride') min_fmt = util.MetaFmt('var_%s_min') # print function signature params: List[str] = [] for stmt in stmts + stencil.param_stmts: prefix = 'const ' if isinstance(stmt, grammar.InputStmt) else '' params.extend((f'{prefix}{TYPE_FMT[stmt.name]}* {data_fmt[stmt.name]}', f'const int32_t {extent_fmt[stmt.name]}[{stencil.dim}]', f'const int32_t {stride_fmt[stmt.name]}[{stencil.dim}]', f'const int32_t {min_fmt[stmt.name]}[{stencil.dim}]')) tile_size_fmt = util.MetaFmt('tile_size_%s') params.extend(( 'const char* bitstream', f'const int burst_width = {stencil.burst_width}', *(f'const int {tile_size_fmt[d]} = {stencil.tile_size[d]}' for d in range(stencil.dim - 1)), f'const int unroll_factor = {stencil.unroll_factor}', )) printer.print_func(name=f'int {stencil.app_name}', params=params, align=0) printer.do_scope() printer.printlns( '// load bitstream', 'auto instance = fpga::Instance(bitstream);', 'auto args_info = instance.GetArgsInfo();' '', ) bank_count_fmt = util.MetaFmt('bank_count_%s') regex_fmt = util.MetaFmt('regex_%s') elem_count_per_cycle_fmt = util.MetaFmt('elem_count_per_cycle_%s') tile_count_fmt = util.MetaFmt('tile_count_dim_%d') printer.printlns( '// find out how many banks are used for each tensor', *(f'int {bank_count_fmt[x.name]} = 0;' for x in stmts), *(f'const regex {regex_fmt[x.name]}' f'(R"(^bank_\\d+_{x.name}$)");' for x in stmts), ) with printer.for_('const auto& arg', 'args_info'): printer.printlns(f'if (regex_match(arg.name, {regex_fmt[x.name]})) ' f'++{bank_count_fmt[x.name]};' for x in stmts) printer.printlns( '', ('auto round_up = [](int64_t a, int64_t b) -> int64_t ' '{ return ((a - 1) / b + 1) * b; };'), '', '// some run-time constants', *(f'const int {elem_count_per_cycle_fmt[x.name]} = ' f'burst_width / {WIDTH_FMT[x.name]} * {bank_count_fmt[x.name]};' for x in stmts), ) for d in range(stencil.dim - 1): printer.println( f'int32_t {tile_count_fmt[d]} = ' f'({extent_fmt[stencil.input_names[0]]}[{d}] - ' f'{STENCIL_DIM_FMT[d]} + 1 - 1) / ({tile_size_fmt[d]} - ' f'{STENCIL_DIM_FMT[d]} + 1) + 1;') printer.printlns( ('int64_t tile_count = %s;' % ' * '.join(f'{tile_count_fmt[d]}' for d in range(stencil.dim - 1))), '', ) printer.printlns( '// align each linearized tile to multiples of burst_width', ('int64_t elem_count_per_tile = %s * ' f'{extent_fmt[stencil.input_names[0]]}[{stencil.dim - 1}];' % ' * '.join(f'{tile_size_fmt[d]}' for d in range(stencil.dim - 1))), ('int64_t cycle_count_per_tile = (elem_count_per_tile - 1) / ' f'{elem_count_per_cycle_fmt[stencil.input_names[0]]} + 1;'), ('int64_t elem_count_aligned_per_tile_i = cycle_count_per_tile * ' f'{elem_count_per_cycle_fmt[stencil.input_stmts[0].name]};'), ('int64_t elem_count_aligned_per_tile_o = cycle_count_per_tile * ' f'{elem_count_per_cycle_fmt[stencil.output_stmts[0].name]};'), '', ) printer.println('// calculate size of each buffer') buf_size_fmt = util.MetaFmt('buf_size_%s') for stmt in stencil.input_stmts: printer.println( f'int64_t {buf_size_fmt[stmt.name]} = ' f'(tile_count * elem_count_aligned_per_tile_i + ' f'round_up(kStencilDistance, {elem_count_per_cycle_fmt[stmt.name]}))' f' / {bank_count_fmt[stmt.name]} * sizeof({TYPE_FMT[stmt.name]});') for stmt in stencil.output_stmts: printer.println( f'int64_t {buf_size_fmt[stmt.name]} = ' f'(tile_count * elem_count_aligned_per_tile_o + ' f'round_up(kStencilDistance, {elem_count_per_cycle_fmt[stmt.name]}))' f' / {bank_count_fmt[stmt.name]} * sizeof({TYPE_FMT[stmt.name]});') printer.println() printer.println('// allocate memory for each buffer') buf_fmt = util.MetaFmt('buf_%s') for stmt in stmts: printer.printlns( (f'vector<unique_ptr<{TYPE_FMT[stmt.name]}, decltype(&free)>> ' f'{buf_fmt[stmt.name]};'), f'{buf_fmt[stmt.name]}.reserve({bank_count_fmt[stmt.name]});', ) with printer.for_('int bank = 0', f'bank < {bank_count_fmt[stmt.name]}', '++bank'): printer.println( f'{buf_fmt[stmt.name]}.emplace_back(' f'static_cast<{TYPE_FMT[stmt.name]}*>(aligned_alloc(' f'4096, round_up({buf_size_fmt[stmt.name]}, 4096))), &free);') printer.println() printer.println('// tiling') for dim in range(stencil.dim - 2, -1, -1): printer.println(f'for(int32_t tile_index_dim_{dim} = 0; ' f'tile_index_dim_{dim} < {tile_count_fmt[dim]}; ' f'++tile_index_dim_{dim})') printer.do_scope() printer.println( f'int32_t actual_tile_size_dim_{dim} = ' f'(tile_index_dim_{dim}=={tile_count_fmt[dim]}-1) ? ' f'{extent_fmt[stencil.input_names[0]]}[{dim}] - ' f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]} + 1) * ' f'tile_index_dim_{dim} : {tile_size_fmt[dim]};') printer.println('#pragma omp parallel for', 0) var = soda_util.COORDS_IN_TILE[stencil.dim - 1] printer.println( f'for(int32_t {var} = 0; ' f'{var} < {extent_fmt[stencil.input_names[0]]}[{stencil.dim - 1}]; ' f'++{var})') printer.do_scope() for dim in range(stencil.dim - 2, -1, -1): printer.println('for(int32_t {0} = 0; {0} < actual_tile_size_dim_{1}; ' '++{0})'.format(soda_util.COORDS_IN_TILE[dim], dim)) printer.do_scope() printer.printlns( ('// (%s) is coordinates in tiled image' % ', '.join(soda_util.COORDS_TILED)), ('// (%s) is coordinates in original image' % ', '.join(soda_util.COORDS_IN_ORIG)), '// (%s) is coordinates in a tile' % ', '.join(soda_util.COORDS_IN_TILE), ) offset_in_tile = ' + '.join( '%s%s' % (soda_util.COORDS_IN_TILE[x], ''.join(f' * {tile_size_fmt[d]}' for d in range(x))) for x in range(stencil.dim)) stmt = stencil.input_stmts[0] printer.printlns( (f'int32_t burst_index = ({offset_in_tile}) / ' f'{elem_count_per_cycle_fmt[stmt.name]};'), (f'int32_t burst_residue = ({offset_in_tile}) % ' f'{elem_count_per_cycle_fmt[stmt.name]};'), ) for dim in range(stencil.dim - 1): printer.println( f'int32_t {soda_util.COORDS_IN_ORIG[dim]} = tile_index_dim_{dim} * ' f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]}) + ' f'{soda_util.COORDS_IN_TILE[dim]};') printer.printlns( ('int32_t %s = %s;' % (soda_util.COORDS_IN_ORIG[stencil.dim - 1], soda_util.COORDS_IN_TILE[stencil.dim - 1])), (f'int64_t tiled_offset = (%s) * elem_count_aligned_per_tile_i + ' f'burst_index * {elem_count_per_cycle_fmt[stmt.name]} + burst_residue;' % ' + '.join('%stile_index_dim_%d' % (''.join(f'{tile_count_fmt[d]} * ' for d in range(x)), x) for x in range(stencil.dim - 1))), ('int64_t original_offset = %s;' % ' + '.join(f'%s * {stride_fmt[stencil.input_names[0]]}[%d]' % (soda_util.COORDS_IN_ORIG[x], x) for x in range(stencil.dim))), ) printer.printlns(f'{buf_fmt[x]}' f'[tiled_offset % {bank_count_fmt[x]}].get()' f'[tiled_offset / {bank_count_fmt[x]}] = ' f'{data_fmt[x]}[std::max(int64_t(0), original_offset - ' f'{stencil.tensors[x].produce_offset})];' for x in stencil.input_names) for dim in range(stencil.dim * 2 - 1): printer.un_scope() printer.println() for d in range(stencil.dim - 1): printer.println( f'clog << "INFO: tile_count[{d}] = " << {tile_count_fmt[d]} ' f'<< ", tile_size[{d}] = " << {tile_size_fmt[d]} << endl;') for name in stencil.input_names + stencil.output_names: for item in 'extent', 'stride', 'min': fmt = locals()[item + '_fmt'] printer.println( 'clog << "INFO: %s" << endl;' % ', '.join(f'{name}.{item}[{d}] = " << {fmt[name]}[{d}] << "' for d in range(stencil.dim))) printer.println() stmt = stencil.input_stmts[0] printer.printlns( ('int64_t tile_data_count = ' f'((int64_t({extent_fmt[stmt.name]}[{stencil.dim - 1}])%s - 1) / ' f'{elem_count_per_cycle_fmt[stmt.name]} + 1) * ' f'{elem_count_per_cycle_fmt[stmt.name]} / ' 'unroll_factor;' % (''.join(f' * {tile_size_fmt[d]}' for d in range(stencil.dim - 1)))), ('int64_t cycle_count = ' f'((int64_t({extent_fmt[stmt.name]}[{stencil.dim - 1}])%s * %s + ' f'kStencilDistance - 1) / {elem_count_per_cycle_fmt[stmt.name]} + 1);' % (''.join(f' * {tile_size_fmt[d]}' for d in range(stencil.dim - 1)), ' * '.join( tile_count_fmt[d] for d in range(stencil.dim - 1)))), ('clog << "INFO: tile_data_count = " << tile_data_count ' '<< ", cycle_count = " << cycle_count << endl;'), '', ) printer.println('int arg_idx = 0;') iter_fmt = util.MetaFmt('iter_%s') for stmt in stmts: printer.println( f'auto {iter_fmt[stmt.name]} = {buf_fmt[stmt.name]}.begin();') with printer.for_('const auto& arg', 'args_info'): with printer.if_('arg.name == "coalesced_data_num"'): printer.printlns( 'instance.SetArg(arg_idx, cycle_count);', '++arg_idx;', ) for stmt in stmts: direction = 'Write' if isinstance( stmt, grammar.InputStmt) else 'Read' with printer.elif_( f'regex_match(arg.name, {regex_fmt[stmt.name]})'): printer.printlns( (f'auto buf = fpga::{direction}Only(' f'{iter_fmt[stmt.name]}->get(), ' f'{buf_size_fmt[stmt.name]} / sizeof({TYPE_FMT[stmt.name]}));' ), 'instance.AllocBuf(arg_idx, buf);', 'instance.SetArg(arg_idx, buf);', f'++{iter_fmt[stmt.name]};', '++arg_idx;', ) printer.printlns( '', 'instance.WriteToDevice();', 'instance.Exec();', 'instance.ReadFromDevice();', 'instance.Finish();', '', ('clog << "Load throughput: " << std::setprecision(3) ' '<< instance.LoadThroughputGbps() << " GB/s" << endl;'), ('clog << "Compute latency: " << std::setprecision(3) ' '<< instance.ComputeTimeSeconds() << " s" << endl;'), ('clog << "Store throughput: " << std::setprecision(3) ' '<< instance.StoreThroughputGbps() <<" GB/s" << endl;'), '', ) for dim in range(stencil.dim - 2, -1, -1): printer.println( f'for(int32_t tile_index_dim_{dim} = 0; tile_index_dim_{dim} < ' f'{tile_count_fmt[dim]}; ++tile_index_dim_{dim})') printer.do_scope() printer.println(f'int32_t actual_tile_size_dim_{dim} = ' f'(tile_index_dim_{dim} == {tile_count_fmt[dim]}-1) ? ' f'{extent_fmt[stencil.input_names[0]]}[{dim}] - ' f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]} + 1)' f' * tile_index_dim_{dim} : {tile_size_fmt[dim]};') overall_stencil_window = core.get_overall_stencil_window( stencil.tensors[stencil.input_names[0]], stencil.tensors[stencil.output_names[0]]) overall_stencil_offset = core.get_stencil_window_offset( overall_stencil_window) overall_stencil_dim = core.get_stencil_dim(overall_stencil_window) printer.println('#pragma omp parallel for', 0) printer.println('for(int32_t {var} = {}; {var} < ' f'{extent_fmt[stencil.output_names[0]]}[{stencil.dim - 1}]' ' - {}; ++{var})'.format( max(0, overall_stencil_offset[stencil.dim - 1]), max(0, (overall_stencil_dim[stencil.dim - 1] - 1 - overall_stencil_offset[stencil.dim - 1])), var=soda_util.COORDS_IN_TILE[stencil.dim - 1])) printer.do_scope() for dim in range(stencil.dim - 2, -1, -1): printer.println( 'for(int32_t {var} = {}; {var} < actual_tile_size_dim_{} - {}; ' '++{var})'.format(max(0, overall_stencil_offset[dim]), dim, max( 0, overall_stencil_dim[dim] - 1 - overall_stencil_offset[dim]), var=soda_util.COORDS_IN_TILE[dim])) printer.do_scope() printer.printlns( ('// (%s) is coordinates in tiled image' % ', '.join(soda_util.COORDS_TILED)), ('// (%s) is coordinates in original image' % ', '.join(soda_util.COORDS_IN_ORIG)), '// (%s) is coordinates in a tile' % ', '.join(soda_util.COORDS_IN_TILE), ) offset_in_tile = ' + '.join( '%s%s' % (soda_util.COORDS_IN_TILE[x], ''.join(f' * {tile_size_fmt[d]}' for d in range(x))) for x in range(stencil.dim)) for dim in range(stencil.dim - 1): printer.println( f'int32_t {soda_util.COORDS_IN_ORIG[dim]} = tile_index_dim_{dim} ' f'* ({tile_size_fmt[dim]}-{STENCIL_DIM_FMT[dim]} + 1) + ' f'{soda_util.COORDS_IN_TILE[dim]};') printer.printlns( ('int32_t %s = %s;' % (soda_util.COORDS_IN_ORIG[stencil.dim - 1], soda_util.COORDS_IN_TILE[stencil.dim - 1])), ('int64_t original_offset = %s;' % ' + '.join(f'%s * {stride_fmt[stencil.output_names[0]]}[%d]' % (soda_util.COORDS_IN_ORIG[x], x) for x in range(stencil.dim))), ) for stmt in stencil.output_stmts: overall_stencil_window = core.get_overall_stencil_window( map(stencil.tensors.get, stencil.input_names), stencil.tensors[stmt.name]) overall_stencil_distance = core.get_stencil_distance( overall_stencil_window, stencil.tile_size) stencil_offset = overall_stencil_distance - soda_util.serialize( core.get_stencil_window_offset(overall_stencil_window), stencil.tile_size) printer.printlns( (f'int32_t burst_index_{stmt.name} = ' f'({offset_in_tile} + {stencil_offset}) / ' f'{elem_count_per_cycle_fmt[stmt.name]};'), (f'int32_t burst_residue_{stmt.name} = ' f'({offset_in_tile} + {stencil_offset}) % ' f'{elem_count_per_cycle_fmt[stmt.name]};'), (f'int64_t tiled_offset_{stmt.name} = ' f'(%s) * elem_count_aligned_per_tile_o + burst_index_{stmt.name} * ' f'{elem_count_per_cycle_fmt[stmt.name]} + burst_residue_{stmt.name};' % ('+'.join('%stile_index_dim_%d' % (''.join(f'{tile_count_fmt[d]} * ' for d in range(x)), x) for x in range(stencil.dim - 1)))), (f'{data_fmt[stmt.name]}[original_offset] = {buf_fmt[stmt.name]}' f'[tiled_offset_{stmt.name} % {bank_count_fmt[stmt.name]}].get()' f'[tiled_offset_{stmt.name} / {bank_count_fmt[stmt.name]}];'), ) for dim in range(stencil.dim * 2 - 1): printer.un_scope() printer.println('return 0;') printer.un_scope() printer.println()