Пример #1
0
 def sync(tensor, offset):
     if tensor is None:
         return offset
     _logger.debug('index of tensor <%s>: %s', tensor.name,
                   tensor.st_idx)
     stage_offset = soda_util.serialize(
         tensor.st_idx, self.tile_size)
     _logger.debug('offset of tensor <%s>: %d', tensor.name,
                   stage_offset)
     loads = visitor.get_load_dict(tensor)
     for name in loads:
         loads[name] = tuple(ref.idx for ref in loads[name])
     _logger.debug(
         'loads: %s', ', '.join(
             '%s@%s' %
             (name,
              util.lst2str(map(util.idx2str, indices)))
             for name, indices in loads.items()))
     for n in loads:
         loads[n] = soda_util.serialize_iter(
             loads[n], self.tile_size)
     for l in loads.values():
         l[0], l[-1] = (stage_offset - max(l),
                        stage_offset - min(l))
         del l[1:-1]
         if len(l) == 1:
             l.append(l[-1])
     _logger.debug(
         'load offset range in tensor %s: %s', tensor.name,
         '{%s}' % (', '.join('%s: [%d:%d]' % (n, *v)
                             for n, v in loads.items())))
     for parent in tensor.parents.values():
         tensor_distance = next(
             reversed(tensor.ld_offsets[parent.name]))
         _logger.debug('tensor distance: %s',
                       tensor_distance)
         _logger.debug(
             'want to access tensor <%s> at offset [%d, %d] '
             'to generate tensor <%s> at offset %d',
             parent.name, offset + loads[parent.name][0],
             offset + loads[parent.name][-1], tensor.name,
             offset)
         tensor_offset = (parent.st_delay +
                          tensor_distance - stage_offset)
         if offset < tensor_offset:
             _logger.debug(
                 'but tensor <%s> won\'t be available until offset %d',
                 parent.name, tensor_offset)
             offset = tensor_offset
             _logger.debug(
                 'need to access tensor <%s> at offset [%d, %d] '
                 'to generate tensor <%s> at offset %d',
                 parent.name,
                 offset + loads[parent.name][0],
                 offset + loads[parent.name][-1],
                 tensor.name, offset)
     return offset
Пример #2
0
 def _get_window_for(self, node):
     loads = node.get_loads()  # [Load, ...]
     load_names = {l.name for l in loads if l.name not in self.extra_params}
     windows = {
         name: sorted({l.idx
                       for l in loads if l.name == name},
                      key=lambda x: soda_util.serialize(x, self.tile_size))
         for name in load_names
     }
     _logger.debug(
         'window for %s@(%s) is %s' %
         (node.name, ', '.join(map(str, node.expr[0].idx)), windows))
     return windows
Пример #3
0
 def test_deserialize(self):
     idx = (42, 23, 233)
     tile_size = (2333, 233, 0)
     self.assertTupleEqual(
         idx,
         tuple(util.deserialize(util.serialize(idx, tile_size), tile_size)))
Пример #4
0
 def ld_offsets(self):
     return collections.OrderedDict(
         (name,
          collections.OrderedDict(
              (soda_util.serialize(ref.idx, self._tile_size), ref)
              for ref in refs)) for name, refs in self.ld_refs.items())
Пример #5
0
 def st_offset(self):
     return soda_util.serialize(self.st_idx,
                                self._tile_size) + self.st_delay
Пример #6
0
def get_stencil_distance(stencil_window, tile_size):
    return (max(soda_util.serialize_iter(stencil_window, tile_size)) +
            soda_util.serialize(get_stencil_window_offset(stencil_window),
                                tile_size))
Пример #7
0
    def chronological_tensors(self):
        """Computes the offsets of tensors.

    Returns:
      A list of Tensor, in chronological order.
    """
        _logger.info('calculate tensor offsets')
        processing_queue = collections.deque(list(self.input_names))
        processed_tensors = set(self.input_names)
        chronological_tensors = list(map(self.tensors.get, self.input_names))
        for tensor in chronological_tensors:
            _logger.debug('tensor <%s> is at offset %d' %
                          (tensor.name, tensor.st_offset))
        _logger.debug('processing queue: %s', processing_queue)
        _logger.debug('processed_tensors: %s', processed_tensors)
        while processing_queue:
            tensor = self.tensors[processing_queue.popleft()]
            _logger.debug('inspecting tensor %s\'s children' % tensor.name)
            for child in tensor.children.values():
                if ({x.name
                     for x in child.parents.values()} <= processed_tensors
                        and child.name not in processed_tensors):
                    # good, all inputs are processed
                    # can determine offset of current tensor
                    _logger.debug(
                        'input%s for tensor <%s> (i.e. %s) %s processed',
                        '' if len(child.parents) == 1 else 's', child.name,
                        ', '.join([x.name for x in child.parents.values()]),
                        'is' if len(child.parents) == 1 else 'are')
                    stage_offset = soda_util.serialize(child.st_idx,
                                                       self.tile_size)

                    # synchronization check
                    def sync(tensor, offset):
                        if tensor is None:
                            return offset
                        _logger.debug('index of tensor <%s>: %s', tensor.name,
                                      tensor.st_idx)
                        stage_offset = soda_util.serialize(
                            tensor.st_idx, self.tile_size)
                        _logger.debug('offset of tensor <%s>: %d', tensor.name,
                                      stage_offset)
                        loads = visitor.get_load_dict(tensor)
                        for name in loads:
                            loads[name] = tuple(ref.idx for ref in loads[name])
                        _logger.debug(
                            'loads: %s', ', '.join(
                                '%s@%s' %
                                (name,
                                 util.lst2str(map(util.idx2str, indices)))
                                for name, indices in loads.items()))
                        for n in loads:
                            loads[n] = soda_util.serialize_iter(
                                loads[n], self.tile_size)
                        for l in loads.values():
                            l[0], l[-1] = (stage_offset - max(l),
                                           stage_offset - min(l))
                            del l[1:-1]
                            if len(l) == 1:
                                l.append(l[-1])
                        _logger.debug(
                            'load offset range in tensor %s: %s', tensor.name,
                            '{%s}' % (', '.join('%s: [%d:%d]' % (n, *v)
                                                for n, v in loads.items())))
                        for parent in tensor.parents.values():
                            tensor_distance = next(
                                reversed(tensor.ld_offsets[parent.name]))
                            _logger.debug('tensor distance: %s',
                                          tensor_distance)
                            _logger.debug(
                                'want to access tensor <%s> at offset [%d, %d] '
                                'to generate tensor <%s> at offset %d',
                                parent.name, offset + loads[parent.name][0],
                                offset + loads[parent.name][-1], tensor.name,
                                offset)
                            tensor_offset = (parent.st_delay +
                                             tensor_distance - stage_offset)
                            if offset < tensor_offset:
                                _logger.debug(
                                    'but tensor <%s> won\'t be available until offset %d',
                                    parent.name, tensor_offset)
                                offset = tensor_offset
                                _logger.debug(
                                    'need to access tensor <%s> at offset [%d, %d] '
                                    'to generate tensor <%s> at offset %d',
                                    parent.name,
                                    offset + loads[parent.name][0],
                                    offset + loads[parent.name][-1],
                                    tensor.name, offset)
                        return offset

                    _logger.debug(
                        'intend to generate tensor <%s> at offset %d',
                        child.name, child.st_delay)
                    synced_offset = sync(child, child.st_delay)
                    _logger.debug('synced offset: %s', synced_offset)
                    child.st_delay = synced_offset
                    _logger.debug(
                        'decide to generate tensor <%s> at offset %d',
                        child.name, child.st_delay)

                    # add delay
                    for sibling in child.parents.values():
                        delay = child.st_delay - (sibling.st_delay + list(
                            child.ld_offsets[sibling.name].keys())[-1] -
                                                  stage_offset)
                        if delay > 0:
                            _logger.debug(
                                'tensor %s arrives at tensor <%s> at offset %d < %d; '
                                'add %d delay', sibling.name, child.name,
                                sibling.st_delay + next(
                                    reversed(child.ld_offsets[sibling.name])) -
                                stage_offset, child.st_delay, delay)
                        else:
                            _logger.debug(
                                'tensor %s arrives at tensor <%s> at offset %d = %d; good',
                                sibling.name, child.name,
                                sibling.st_delay + next(
                                    reversed(child.ld_offsets[sibling.name])) -
                                stage_offset, child.st_delay)
                        child.ld_delays[sibling.name] = max(delay, 0)
                        _logger.debug('set delay of |%s <- %s| to %d' %
                                      (child.name, sibling.name,
                                       child.ld_delays[sibling.name]))

                    processing_queue.append(child.name)
                    processed_tensors.add(child.name)
                    chronological_tensors.append(child)
                else:
                    for parent in tensor.parents.values():
                        if parent.name not in processed_tensors:
                            _logger.debug(
                                'tensor %s requires tensor <%s> as an input',
                                tensor.name, parent.name)
                            _logger.debug(
                                'but tensor <%s> isn\'t processed yet',
                                parent.name)
                            _logger.debug('add %s to scheduling queue',
                                          parent.name)
                            processing_queue.append(parent.name)

        _logger.debug('tensors in insertion order: [%s]',
                      ', '.join(map(str, self.tensors)))
        _logger.debug('tensors in chronological order: [%s]',
                      ', '.join(t.name for t in chronological_tensors))

        for tensor in self.tensors.values():
            for name, indices in tensor.ld_indices.items():
                _logger.debug(
                    'stage index: %s@%s <- %s@%s', tensor.name,
                    util.idx2str(tensor.st_idx), name,
                    util.lst2str(util.idx2str(idx) for idx in indices))
        for tensor in self.tensors.values():
            if tensor.is_input():
                continue
            _logger.debug('stage expr: %s = %s', tensor.st_ref, tensor.expr)
        for tensor in self.tensors.values():
            for name, offsets in tensor.ld_offsets.items():
                _logger.debug(
                    'stage offset: %s@%d <- %s@%s', tensor.name,
                    soda_util.serialize(tensor.st_idx, self.tile_size), name,
                    util.lst2str(offsets))
        for tensor in self.tensors.values():
            for name, delay in tensor.ld_delays.items():
                _logger.debug('stage delay: %s <- %s delayed %d' %
                              (tensor.name, name, delay))

        return chronological_tensors
Пример #8
0
    def tensors(self):
        """Constructs high-level DAG and creates the tensors.

    Returns:
      An collections.OrderedDict mapping a tensor's name to the tensor.
    """
        # TODO: check for name conflicts
        tensor_map = collections.OrderedDict()
        for stmt in self.input_stmts:
            tensor = Tensor(stmt, self.tile_size)
            tensor_map[stmt.name] = tensor

        def name_in_iter(name, iteration):
            if name in self.input_names:
                if iteration > 0:
                    return name + '_iter%d' % iteration
                return name
            if name in self.output_names:
                if iteration < self.iterate - 1:
                    return (self.input_names[self.output_names.index(name)] +
                            '_iter%d' % (iteration + 1))
                return name
            if name in self.local_names:
                if iteration > 0:
                    return name + '_iter%d' % iteration
                return name
            if name in self.param_names:
                return name
            raise util.InternalError('unknown name: %s' % name)

        for iteration in range(self.iterate):
            _logger.debug('iterate %s', iteration)
            _logger.debug('map: %s', self.symbol_table)

            def mutate_name_callback(obj, mutated):
                if isinstance(obj, ir.Ref):
                    obj.haoda_type = self.symbol_table[obj.name]
                    # pylint: disable=cell-var-from-loop
                    obj.name = name_in_iter(obj.name, iteration)
                return obj

            tensors = []
            for stmt in itertools.chain(self.local_stmts, self.output_stmts):
                tensor = Tensor(stmt.visit(mutate_name_callback),
                                self.tile_size)
                loads = visitor.get_load_tuple(tensor)
                norm_idx = tuple(
                    min(load.idx[d] for load in loads
                        if load.name not in self.param_names)
                    for d in range(self.dim))
                if any(norm_idx):
                    _logger.debug('normalize index of %s: (%s)', tensor.name,
                                  ', '.join(map(str, norm_idx)))
                    mutator.shift(tensor, norm_idx, excluded=self.param_names)
                tensor_map[tensor.name] = tensor
                tensors.append(tensor)

            for tensor in tensors:
                _logger.debug('%s', tensor)

            for tensor in tensors:
                tensor.propagate_type()
                loads = visitor.get_load_dict(tensor)
                for parent_name, ld_refs in loads.items():
                    ld_refs = sorted(ld_refs,
                                     key=lambda ref: soda_util.serialize(
                                         ref.idx, self.tile_size))
                    parent_tensor = tensor_map[parent_name]
                    parent_tensor.children[tensor.name] = tensor
                    tensor.parents[parent_name] = parent_tensor
                    tensor.ld_refs[parent_name] = ld_refs

        # high-level DAG construction finished
        for tensor in tensor_map.values():
            if tensor.name in self.input_names:
                _logger.debug('<input tensor>: %s', tensor)
            elif tensor.name in self.output_names:
                _logger.debug('<output tensor>: %s', tensor)
            else:
                _logger.debug('<local tensor>: %s', tensor)
        return tensor_map
Пример #9
0
def print_func(printer: util.CppPrinter, stencil: soda.core.Stencil):
    stmts = stencil.input_stmts + stencil.output_stmts

    # factories for meta variables
    data_fmt = util.MetaFmt('var_%s_ptr')
    extent_fmt = util.MetaFmt('var_%s_extent')
    stride_fmt = util.MetaFmt('var_%s_stride')
    min_fmt = util.MetaFmt('var_%s_min')

    # print function signature
    params: List[str] = []
    for stmt in stmts + stencil.param_stmts:
        prefix = 'const ' if isinstance(stmt, grammar.InputStmt) else ''
        params.extend((f'{prefix}{TYPE_FMT[stmt.name]}* {data_fmt[stmt.name]}',
                       f'const int32_t {extent_fmt[stmt.name]}[{stencil.dim}]',
                       f'const int32_t {stride_fmt[stmt.name]}[{stencil.dim}]',
                       f'const int32_t {min_fmt[stmt.name]}[{stencil.dim}]'))

    tile_size_fmt = util.MetaFmt('tile_size_%s')
    params.extend((
        'const char* bitstream',
        f'const int burst_width = {stencil.burst_width}',
        *(f'const int {tile_size_fmt[d]} = {stencil.tile_size[d]}'
          for d in range(stencil.dim - 1)),
        f'const int unroll_factor = {stencil.unroll_factor}',
    ))
    printer.print_func(name=f'int {stencil.app_name}', params=params, align=0)
    printer.do_scope()

    printer.printlns(
        '// load bitstream',
        'auto instance = fpga::Instance(bitstream);',
        'auto args_info = instance.GetArgsInfo();'
        '',
    )

    bank_count_fmt = util.MetaFmt('bank_count_%s')
    regex_fmt = util.MetaFmt('regex_%s')
    elem_count_per_cycle_fmt = util.MetaFmt('elem_count_per_cycle_%s')
    tile_count_fmt = util.MetaFmt('tile_count_dim_%d')
    printer.printlns(
        '// find out how many banks are used for each tensor',
        *(f'int {bank_count_fmt[x.name]} = 0;' for x in stmts),
        *(f'const regex {regex_fmt[x.name]}'
          f'(R"(^bank_\\d+_{x.name}$)");' for x in stmts),
    )
    with printer.for_('const auto& arg', 'args_info'):
        printer.printlns(f'if (regex_match(arg.name, {regex_fmt[x.name]})) '
                         f'++{bank_count_fmt[x.name]};' for x in stmts)
    printer.printlns(
        '',
        ('auto round_up = [](int64_t a, int64_t b) -> int64_t '
         '{ return ((a - 1) / b + 1) * b; };'),
        '',
        '// some run-time constants',
        *(f'const int {elem_count_per_cycle_fmt[x.name]} = '
          f'burst_width / {WIDTH_FMT[x.name]} * {bank_count_fmt[x.name]};'
          for x in stmts),
    )
    for d in range(stencil.dim - 1):
        printer.println(
            f'int32_t {tile_count_fmt[d]} = '
            f'({extent_fmt[stencil.input_names[0]]}[{d}] - '
            f'{STENCIL_DIM_FMT[d]} + 1 - 1) / ({tile_size_fmt[d]} - '
            f'{STENCIL_DIM_FMT[d]} + 1) + 1;')
    printer.printlns(
        ('int64_t tile_count = %s;' %
         ' * '.join(f'{tile_count_fmt[d]}' for d in range(stencil.dim - 1))),
        '',
    )

    printer.printlns(
        '// align each linearized tile to multiples of burst_width',
        ('int64_t elem_count_per_tile = %s * '
         f'{extent_fmt[stencil.input_names[0]]}[{stencil.dim - 1}];' %
         ' * '.join(f'{tile_size_fmt[d]}' for d in range(stencil.dim - 1))),
        ('int64_t cycle_count_per_tile = (elem_count_per_tile - 1) / '
         f'{elem_count_per_cycle_fmt[stencil.input_names[0]]} + 1;'),
        ('int64_t elem_count_aligned_per_tile_i = cycle_count_per_tile * '
         f'{elem_count_per_cycle_fmt[stencil.input_stmts[0].name]};'),
        ('int64_t elem_count_aligned_per_tile_o = cycle_count_per_tile * '
         f'{elem_count_per_cycle_fmt[stencil.output_stmts[0].name]};'),
        '',
    )

    printer.println('// calculate size of each buffer')
    buf_size_fmt = util.MetaFmt('buf_size_%s')
    for stmt in stencil.input_stmts:
        printer.println(
            f'int64_t {buf_size_fmt[stmt.name]} = '
            f'(tile_count * elem_count_aligned_per_tile_i + '
            f'round_up(kStencilDistance, {elem_count_per_cycle_fmt[stmt.name]}))'
            f' / {bank_count_fmt[stmt.name]} * sizeof({TYPE_FMT[stmt.name]});')
    for stmt in stencil.output_stmts:
        printer.println(
            f'int64_t {buf_size_fmt[stmt.name]} = '
            f'(tile_count * elem_count_aligned_per_tile_o + '
            f'round_up(kStencilDistance, {elem_count_per_cycle_fmt[stmt.name]}))'
            f' / {bank_count_fmt[stmt.name]} * sizeof({TYPE_FMT[stmt.name]});')
    printer.println()

    printer.println('// allocate memory for each buffer')
    buf_fmt = util.MetaFmt('buf_%s')
    for stmt in stmts:
        printer.printlns(
            (f'vector<unique_ptr<{TYPE_FMT[stmt.name]}, decltype(&free)>> '
             f'{buf_fmt[stmt.name]};'),
            f'{buf_fmt[stmt.name]}.reserve({bank_count_fmt[stmt.name]});',
        )
        with printer.for_('int bank = 0',
                          f'bank < {bank_count_fmt[stmt.name]}', '++bank'):
            printer.println(
                f'{buf_fmt[stmt.name]}.emplace_back('
                f'static_cast<{TYPE_FMT[stmt.name]}*>(aligned_alloc('
                f'4096, round_up({buf_size_fmt[stmt.name]}, 4096))), &free);')
    printer.println()

    printer.println('// tiling')
    for dim in range(stencil.dim - 2, -1, -1):
        printer.println(f'for(int32_t tile_index_dim_{dim} = 0; '
                        f'tile_index_dim_{dim} < {tile_count_fmt[dim]}; '
                        f'++tile_index_dim_{dim})')
        printer.do_scope()
        printer.println(
            f'int32_t actual_tile_size_dim_{dim} = '
            f'(tile_index_dim_{dim}=={tile_count_fmt[dim]}-1) ? '
            f'{extent_fmt[stencil.input_names[0]]}[{dim}] - '
            f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]} + 1) * '
            f'tile_index_dim_{dim} : {tile_size_fmt[dim]};')

    printer.println('#pragma omp parallel for', 0)
    var = soda_util.COORDS_IN_TILE[stencil.dim - 1]
    printer.println(
        f'for(int32_t {var} = 0; '
        f'{var} < {extent_fmt[stencil.input_names[0]]}[{stencil.dim - 1}]; '
        f'++{var})')
    printer.do_scope()
    for dim in range(stencil.dim - 2, -1, -1):
        printer.println('for(int32_t {0} = 0; {0} < actual_tile_size_dim_{1}; '
                        '++{0})'.format(soda_util.COORDS_IN_TILE[dim], dim))
        printer.do_scope()

    printer.printlns(
        ('// (%s) is coordinates in tiled image' %
         ', '.join(soda_util.COORDS_TILED)),
        ('// (%s) is coordinates in original image' %
         ', '.join(soda_util.COORDS_IN_ORIG)),
        '// (%s) is coordinates in a tile' %
        ', '.join(soda_util.COORDS_IN_TILE),
    )
    offset_in_tile = ' + '.join(
        '%s%s' % (soda_util.COORDS_IN_TILE[x], ''.join(f' * {tile_size_fmt[d]}'
                                                       for d in range(x)))
        for x in range(stencil.dim))
    stmt = stencil.input_stmts[0]
    printer.printlns(
        (f'int32_t burst_index = ({offset_in_tile}) / '
         f'{elem_count_per_cycle_fmt[stmt.name]};'),
        (f'int32_t burst_residue = ({offset_in_tile}) % '
         f'{elem_count_per_cycle_fmt[stmt.name]};'),
    )
    for dim in range(stencil.dim - 1):
        printer.println(
            f'int32_t {soda_util.COORDS_IN_ORIG[dim]} = tile_index_dim_{dim} * '
            f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]}) + '
            f'{soda_util.COORDS_IN_TILE[dim]};')
    printer.printlns(
        ('int32_t %s = %s;' % (soda_util.COORDS_IN_ORIG[stencil.dim - 1],
                               soda_util.COORDS_IN_TILE[stencil.dim - 1])),
        (f'int64_t tiled_offset = (%s) * elem_count_aligned_per_tile_i + '
         f'burst_index * {elem_count_per_cycle_fmt[stmt.name]} + burst_residue;'
         % ' + '.join('%stile_index_dim_%d' %
                      (''.join(f'{tile_count_fmt[d]} * ' for d in range(x)), x)
                      for x in range(stencil.dim - 1))),
        ('int64_t original_offset = %s;' %
         ' + '.join(f'%s * {stride_fmt[stencil.input_names[0]]}[%d]' %
                    (soda_util.COORDS_IN_ORIG[x], x)
                    for x in range(stencil.dim))),
    )
    printer.printlns(f'{buf_fmt[x]}'
                     f'[tiled_offset % {bank_count_fmt[x]}].get()'
                     f'[tiled_offset / {bank_count_fmt[x]}] = '
                     f'{data_fmt[x]}[std::max(int64_t(0), original_offset - '
                     f'{stencil.tensors[x].produce_offset})];'
                     for x in stencil.input_names)
    for dim in range(stencil.dim * 2 - 1):
        printer.un_scope()
    printer.println()

    for d in range(stencil.dim - 1):
        printer.println(
            f'clog << "INFO: tile_count[{d}] = " << {tile_count_fmt[d]} '
            f'<< ", tile_size[{d}] = " << {tile_size_fmt[d]} << endl;')
    for name in stencil.input_names + stencil.output_names:
        for item in 'extent', 'stride', 'min':
            fmt = locals()[item + '_fmt']
            printer.println(
                'clog << "INFO: %s" << endl;' %
                ', '.join(f'{name}.{item}[{d}] = " << {fmt[name]}[{d}] << "'
                          for d in range(stencil.dim)))
    printer.println()

    stmt = stencil.input_stmts[0]
    printer.printlns(
        ('int64_t tile_data_count = '
         f'((int64_t({extent_fmt[stmt.name]}[{stencil.dim - 1}])%s - 1) / '
         f'{elem_count_per_cycle_fmt[stmt.name]} + 1) * '
         f'{elem_count_per_cycle_fmt[stmt.name]} / '
         'unroll_factor;' % (''.join(f' * {tile_size_fmt[d]}'
                                     for d in range(stencil.dim - 1)))),
        ('int64_t cycle_count = '
         f'((int64_t({extent_fmt[stmt.name]}[{stencil.dim - 1}])%s * %s + '
         f'kStencilDistance - 1) / {elem_count_per_cycle_fmt[stmt.name]} + 1);'
         % (''.join(f' * {tile_size_fmt[d]}'
                    for d in range(stencil.dim - 1)), ' * '.join(
                        tile_count_fmt[d] for d in range(stencil.dim - 1)))),
        ('clog << "INFO: tile_data_count = " << tile_data_count '
         '<< ", cycle_count = " << cycle_count << endl;'),
        '',
    )

    printer.println('int arg_idx = 0;')
    iter_fmt = util.MetaFmt('iter_%s')
    for stmt in stmts:
        printer.println(
            f'auto {iter_fmt[stmt.name]} = {buf_fmt[stmt.name]}.begin();')
    with printer.for_('const auto& arg', 'args_info'):
        with printer.if_('arg.name == "coalesced_data_num"'):
            printer.printlns(
                'instance.SetArg(arg_idx, cycle_count);',
                '++arg_idx;',
            )
            for stmt in stmts:
                direction = 'Write' if isinstance(
                    stmt, grammar.InputStmt) else 'Read'
                with printer.elif_(
                        f'regex_match(arg.name, {regex_fmt[stmt.name]})'):
                    printer.printlns(
                        (f'auto buf = fpga::{direction}Only('
                         f'{iter_fmt[stmt.name]}->get(), '
                         f'{buf_size_fmt[stmt.name]} / sizeof({TYPE_FMT[stmt.name]}));'
                         ),
                        'instance.AllocBuf(arg_idx, buf);',
                        'instance.SetArg(arg_idx, buf);',
                        f'++{iter_fmt[stmt.name]};',
                        '++arg_idx;',
                    )

    printer.printlns(
        '',
        'instance.WriteToDevice();',
        'instance.Exec();',
        'instance.ReadFromDevice();',
        'instance.Finish();',
        '',
        ('clog << "Load throughput: " << std::setprecision(3) '
         '<< instance.LoadThroughputGbps() << " GB/s" << endl;'),
        ('clog << "Compute latency: " << std::setprecision(3) '
         '<< instance.ComputeTimeSeconds() << " s" << endl;'),
        ('clog << "Store throughput: " << std::setprecision(3) '
         '<< instance.StoreThroughputGbps() <<" GB/s" << endl;'),
        '',
    )

    for dim in range(stencil.dim - 2, -1, -1):
        printer.println(
            f'for(int32_t tile_index_dim_{dim} = 0; tile_index_dim_{dim} < '
            f'{tile_count_fmt[dim]}; ++tile_index_dim_{dim})')
        printer.do_scope()
        printer.println(f'int32_t actual_tile_size_dim_{dim} = '
                        f'(tile_index_dim_{dim} == {tile_count_fmt[dim]}-1) ? '
                        f'{extent_fmt[stencil.input_names[0]]}[{dim}] - '
                        f'({tile_size_fmt[dim]} - {STENCIL_DIM_FMT[dim]} + 1)'
                        f' * tile_index_dim_{dim} : {tile_size_fmt[dim]};')

    overall_stencil_window = core.get_overall_stencil_window(
        stencil.tensors[stencil.input_names[0]],
        stencil.tensors[stencil.output_names[0]])
    overall_stencil_offset = core.get_stencil_window_offset(
        overall_stencil_window)
    overall_stencil_dim = core.get_stencil_dim(overall_stencil_window)
    printer.println('#pragma omp parallel for', 0)
    printer.println('for(int32_t {var} = {}; {var} < '
                    f'{extent_fmt[stencil.output_names[0]]}[{stencil.dim - 1}]'
                    ' - {}; ++{var})'.format(
                        max(0, overall_stencil_offset[stencil.dim - 1]),
                        max(0, (overall_stencil_dim[stencil.dim - 1] - 1 -
                                overall_stencil_offset[stencil.dim - 1])),
                        var=soda_util.COORDS_IN_TILE[stencil.dim - 1]))
    printer.do_scope()
    for dim in range(stencil.dim - 2, -1, -1):
        printer.println(
            'for(int32_t {var} = {}; {var} < actual_tile_size_dim_{} - {}; '
            '++{var})'.format(max(0, overall_stencil_offset[dim]),
                              dim,
                              max(
                                  0, overall_stencil_dim[dim] - 1 -
                                  overall_stencil_offset[dim]),
                              var=soda_util.COORDS_IN_TILE[dim]))
        printer.do_scope()

    printer.printlns(
        ('// (%s) is coordinates in tiled image' %
         ', '.join(soda_util.COORDS_TILED)),
        ('// (%s) is coordinates in original image' %
         ', '.join(soda_util.COORDS_IN_ORIG)),
        '// (%s) is coordinates in a tile' %
        ', '.join(soda_util.COORDS_IN_TILE),
    )
    offset_in_tile = ' + '.join(
        '%s%s' % (soda_util.COORDS_IN_TILE[x], ''.join(f' * {tile_size_fmt[d]}'
                                                       for d in range(x)))
        for x in range(stencil.dim))
    for dim in range(stencil.dim - 1):
        printer.println(
            f'int32_t {soda_util.COORDS_IN_ORIG[dim]} = tile_index_dim_{dim} '
            f'* ({tile_size_fmt[dim]}-{STENCIL_DIM_FMT[dim]} + 1) + '
            f'{soda_util.COORDS_IN_TILE[dim]};')
    printer.printlns(
        ('int32_t %s = %s;' % (soda_util.COORDS_IN_ORIG[stencil.dim - 1],
                               soda_util.COORDS_IN_TILE[stencil.dim - 1])),
        ('int64_t original_offset = %s;' %
         ' + '.join(f'%s * {stride_fmt[stencil.output_names[0]]}[%d]' %
                    (soda_util.COORDS_IN_ORIG[x], x)
                    for x in range(stencil.dim))),
    )
    for stmt in stencil.output_stmts:
        overall_stencil_window = core.get_overall_stencil_window(
            map(stencil.tensors.get, stencil.input_names),
            stencil.tensors[stmt.name])
        overall_stencil_distance = core.get_stencil_distance(
            overall_stencil_window, stencil.tile_size)
        stencil_offset = overall_stencil_distance - soda_util.serialize(
            core.get_stencil_window_offset(overall_stencil_window),
            stencil.tile_size)
        printer.printlns(
            (f'int32_t burst_index_{stmt.name} = '
             f'({offset_in_tile} + {stencil_offset}) / '
             f'{elem_count_per_cycle_fmt[stmt.name]};'),
            (f'int32_t burst_residue_{stmt.name} = '
             f'({offset_in_tile} + {stencil_offset}) % '
             f'{elem_count_per_cycle_fmt[stmt.name]};'),
            (f'int64_t tiled_offset_{stmt.name} = '
             f'(%s) * elem_count_aligned_per_tile_o + burst_index_{stmt.name} * '
             f'{elem_count_per_cycle_fmt[stmt.name]} + burst_residue_{stmt.name};'
             % ('+'.join('%stile_index_dim_%d' %
                         (''.join(f'{tile_count_fmt[d]} * '
                                  for d in range(x)), x)
                         for x in range(stencil.dim - 1)))),
            (f'{data_fmt[stmt.name]}[original_offset] = {buf_fmt[stmt.name]}'
             f'[tiled_offset_{stmt.name} % {bank_count_fmt[stmt.name]}].get()'
             f'[tiled_offset_{stmt.name} / {bank_count_fmt[stmt.name]}];'),
        )
    for dim in range(stencil.dim * 2 - 1):
        printer.un_scope()

    printer.println('return 0;')
    printer.un_scope()
    printer.println()