def __str__(self):
  return '''Tensor
  {haoda_type}: {name} = {expr}
  store: {st_ref}
  parents: {parents}
  children: {children}'''.format(name=self.name,
                                 haoda_type=self.haoda_type,
                                 expr=self.expr,
                                 parents=util.idx2str(self.parents),
                                 children=util.idx2str(self.children),
                                 st_ref=str(self.st_ref))
def _get_points(tile_size, tensor, unroll_factor):
  """Generates offset-to-point mapping for a Tensor.

  Generates a mapping which can be used to determine the accessed point index
  from the offset for a Tensor, under the given tile size and unroll factor.

  Args:
    tile_size: An iterable representing the tile size in each dimension.
    tensor: A Tensor to which the mapping belongs.
    unroll_factor: An int representing the unroll factor.

  Returns:
    A dict of name str to a dict of offset to a dict of unroll index to point
    index.
  """
  all_points = {}  # {name: {offset: {unroll_idx: point_idx}}}
  for child in tensor.children.values():
    all_points[child.name] = {}
    offsets = child.ld_offsets[tensor.name]
    for unroll_idx in range(unroll_factor):
      for idx, offset in enumerate(offsets):
        all_points[child.name].setdefault(
            max(offsets) - offset + child.ld_delays[tensor.name] + unroll_idx,
            {})[unroll_factor - 1 - unroll_idx] = idx
  for child in tensor.children.values():
    for offset, points in all_points[child.name].items():
      for unroll_idx, point in points.items():
        _logger.debug(
            '%s <- %s @ offset=%d <=> %s @ unroll_idx=%d', child.name,
            tensor.name, offset,
            util.idx2str(
                list(child.ld_indices[tensor.name].values())[point].idx),
            unroll_idx)
  return all_points
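# Illustrative sketch, not part of the original module: for a hypothetical 1-D
# child `b` that reads tensor `a` at serialized offsets {0, 1, 2} (i.e.
# ld_offsets keys 0, 1, 2), with ld_delays == 0 and unroll_factor == 2,
# _get_points would produce
#   {'b': {0: {1: 2}, 1: {1: 1, 0: 2}, 2: {1: 0, 0: 1}, 3: {0: 0}}}
# i.e. for each serialized offset of `a`, which point of the stencil window
# each unrolled PE reads at that offset.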
def get_overall_stencil_window(input_tensor, output_tensor):
  if isinstance(input_tensor, collections.abc.Iterable):
    all_points = tuple(
        sorted(
            set.union(*(set(get_overall_stencil_window(_, output_tensor))
                        for _ in input_tensor))))
    _logger.debug(
        'overall stencil window of %s (%s) <- {%s} is %s (%d points)',
        output_tensor.name, ', '.join(['0'] * len(output_tensor.st_idx)),
        ', '.join(_.name for _ in input_tensor), all_points, len(all_points))
    return all_points
  # normalize store index to 0
  idx = (id(input_tensor), id(output_tensor))
  if idx in _overall_stencil_window_cache:
    return _overall_stencil_window_cache[idx]
  _logger.debug('get overall stencil window of %s <- %s', output_tensor.name,
                input_tensor.name)
  all_points = set()
  for name, points in output_tensor.ld_indices.items():
    _logger.debug('%s@%s <- %s', output_tensor.name,
                  util.idx2str(output_tensor.st_idx),
                  util.idx2str(points.values()))
    if name != input_tensor.name:
      recursive_points = get_overall_stencil_window(input_tensor,
                                                    output_tensor.parents[name])
      _logger.debug('recursive points: %s', util.idx2str(recursive_points))
      all_points |= set.union(*[{
          tuple(map(lambda a, b, c: a + b - c, _, point, output_tensor.st_idx))
          for _ in recursive_points
      } for point in points])
    else:
      all_points |= {
          tuple(map(operator.sub, point, output_tensor.st_idx))
          for point in points
      }
  all_points = tuple(sorted(all_points))
  _logger.debug('overall stencil window of %s (%s) <- %s is %s (%d points)',
                output_tensor.name,
                ', '.join(['0'] * len(output_tensor.st_idx)),
                input_tensor.name, all_points, len(all_points))
  _overall_stencil_window_cache[idx] = all_points
  return all_points
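# Illustrative sketch with hypothetical tensors, not part of the original
# module: for a 1-D two-stage pipeline where b(0) reads a(-1), a(0), a(1) and
# c(0) reads b(-1), b(0), b(1), get_overall_stencil_window(a, c) composes the
# two 3-point windows and, with the store index normalized to 0, returns the
# 5-point window ((-2,), (-1,), (0,), (1,), (2,)).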
def __repr__(self):
  return '%s(loads: %s, lets: %s, exprs: %s)' % (
      type(self).__name__, util.idx2str(self.loads), util.idx2str(self.lets),
      util.idx2str(self.exprs))
def create_dataflow_graph(stencil):
  chronological_tensors = stencil.chronological_tensors

  # the graph is rooted at a virtual super source and drains into a virtual
  # super sink; load/store nodes attach directly to them
  super_source = SuperSourceNode(
      fwd_nodes={},
      cpt_nodes={},
      super_sink=SuperSinkNode(),
  )

  load_nodes = {
      stmt.name:
      tuple(LoadNode(var=stmt.name, bank=bank) for bank in stmt.dram)
      for stmt in stencil.input_stmts
  }
  store_nodes = {
      stmt.name:
      tuple(StoreNode(var=stmt.name, bank=bank) for bank in stmt.dram)
      for stmt in stencil.output_stmts
  }

  for mem_node in itertools.chain(*load_nodes.values()):
    super_source.add_child(mem_node)
  for mem_node in itertools.chain(*store_nodes.values()):
    mem_node.add_child(super_source.super_sink)

  def color_id(node):
    if isinstance(node, LoadNode):
      return f'\033[33mload {node.var}[bank{node.bank}]\033[0m'
    if isinstance(node, StoreNode):
      return f'\033[36mstore {node.var}[bank{node.bank}]\033[0m'
    if isinstance(node, ForwardNode):
      return f'\033[32mforward {node.tensor.name} @{node.offset}\033[0m'
    if isinstance(node, ComputeNode):
      return f'\033[31mcompute {node.tensor.name} #{node.pe_id}\033[0m'
    return 'unknown node'

  def color_attr(node):
    result = []
    for k, v in node.__dict__.items():
      if (node.__class__, k) in ((SuperSourceNode, 'parents'),
                                 (SuperSinkNode, 'children')):
        continue
      if k in ('parents', 'children'):
        result.append('%s: [%s]' % (k, ', '.join(map(color_id, v))))
      else:
        result.append('%s: %s' % (k, repr(v)))
    return '{%s}' % ', '.join(result)

  def color_print(node):
    return '%s: %s' % (color_id(node), color_attr(node))

  # use the terse form by default; switch to color_print for full attributes
  print_node = color_id

  if stencil.replication_factor > 1:
    replicated_next_fifo = stencil.get_replicated_next_fifo()
    replicated_all_points = stencil.get_replicated_all_points()
    replicated_reuse_buffers = stencil.get_replicated_reuse_buffers()

    def add_fwd_nodes(src_name):
      dsts = replicated_all_points[src_name]
      reuse_buffer = replicated_reuse_buffers[src_name][1:]
      nodes_to_add = []
      for dst_point_dicts in dsts.values():
        for offset in dst_point_dicts:
          if (src_name, offset) in super_source.fwd_nodes:
            continue
          fwd_node = ForwardNode(
              tensor=stencil.tensors[src_name],
              offset=offset,
              depth=stencil.get_replicated_reuse_buffer_length(
                  src_name, offset))
          _logger.debug('create %s', print_node(fwd_node))
          init_offsets = [start for start, end in reuse_buffer if start == end]
          if offset in init_offsets:
            if src_name in [stencil.input.name]:
              load_node_count = len(load_nodes[src_name])
              load_nodes[src_name][load_node_count - 1 -
                                   offset % load_node_count].add_child(fwd_node)
            else:
              super_source.cpt_nodes[(src_name, 0)].add_child(fwd_node)
          super_source.fwd_nodes[(src_name, offset)] = fwd_node
          if offset in replicated_next_fifo[src_name]:
            nodes_to_add.append(
                (fwd_node, (src_name, replicated_next_fifo[src_name][offset])))
      for src_node, key in nodes_to_add:
        src_node.add_child(super_source.fwd_nodes[key])

    add_fwd_nodes(stencil.input.name)

    for stage in stencil.get_stages_chronologically():
      cpt_node = ComputeNode(stage=stage, pe_id=0)
      _logger.debug('create %s', print_node(cpt_node))
      super_source.cpt_nodes[(stage.name, 0)] = cpt_node
      for input_name, input_window in stage.window.items():
        for i in range(len(input_window)):
          offset = next(
              offset for offset, points in
              replicated_all_points[input_name][stage.name].items()
              if points == i)
          fwd_node = super_source.fwd_nodes[(input_name, offset)]
          _logger.debug(' access %s', print_node(fwd_node))
          fwd_node.add_child(cpt_node)
      if stage.is_output():
        super_source.cpt_nodes[stage.name,
                               0].add_child(store_nodes[stage.name][0])
      else:
        add_fwd_nodes(stage.name)

  else:
    next_fifo = stencil.next_fifo
    all_points = stencil.all_points
    reuse_buffers = stencil.reuse_buffers

    def add_fwd_nodes(src_name):
      dsts = all_points[src_name]
      reuse_buffer = reuse_buffers[src_name][1:]
      nodes_to_add = []
      for dst_point_dicts in dsts.values():
        for offset in dst_point_dicts:
          if (src_name, offset) in super_source.fwd_nodes:
            continue
          fwd_node = ForwardNode(tensor=stencil.tensors[src_name],
                                 offset=offset)
          # depth=stencil.get_reuse_buffer_length(src_name, offset)
          _logger.debug('create %s', print_node(fwd_node))
          # init_offsets is the start of each reuse chain
          init_offsets = [
              next(end for start, end in reuse_buffer if start == unroll_idx)
              for unroll_idx in reversed(range(stencil.unroll_factor))
          ]
          _logger.debug('reuse buffer: %s', reuse_buffer)
          _logger.debug('init offsets: %s', init_offsets)
          if offset in init_offsets:
            if src_name in stencil.input_names:
              # fwd from external input
              load_node_count = len(load_nodes[src_name])
              load_nodes[src_name][load_node_count - 1 -
                                   offset % load_node_count].add_child(fwd_node)
            else:
              # fwd from output of last stage
              # tensor name and offset are used to find the cpt node
              cpt_offset = next(
                  unroll_idx for unroll_idx in range(stencil.unroll_factor)
                  if init_offsets[unroll_idx] == offset)
              cpt_node = super_source.cpt_nodes[(src_name, cpt_offset)]
              cpt_node.add_child(fwd_node)
          super_source.fwd_nodes[(src_name, offset)] = fwd_node
          if offset in next_fifo[src_name]:
            nodes_to_add.append(
                (fwd_node, (src_name, next_fifo[src_name][offset])))
      for src_node, key in nodes_to_add:
        # fwd from another fwd node
        src_node.add_child(super_source.fwd_nodes[key])

    for input_name in stencil.input_names:
      add_fwd_nodes(input_name)

    for tensor in chronological_tensors:
      if tensor.is_input():
        continue
      for unroll_index in range(stencil.unroll_factor):
        pe_id = stencil.unroll_factor - 1 - unroll_index
        cpt_node = ComputeNode(tensor=tensor, pe_id=pe_id)
        _logger.debug('create %s', print_node(cpt_node))
        super_source.cpt_nodes[(tensor.name, pe_id)] = cpt_node
        for input_name, input_window in tensor.ld_indices.items():
          for i in range(len(input_window)):
            offset = next(
                offset for offset, points in
                all_points[input_name][tensor.name].items()
                if pe_id in points and points[pe_id] == i)
            fwd_node = super_source.fwd_nodes[(input_name, offset)]
            _logger.debug(' access %s', print_node(fwd_node))
            fwd_node.add_child(cpt_node)
      if tensor.is_output():
        for pe_id in range(stencil.unroll_factor):
          super_source.cpt_nodes[tensor.name, pe_id].add_child(
              store_nodes[tensor.name][pe_id % len(store_nodes[tensor.name])])
      else:
        add_fwd_nodes(tensor.name)

  # pylint: disable=too-many-nested-blocks
  for src_node in super_source.tpo_valid_node_gen():
    for dst_node in filter(is_valid_node, src_node.children):
      # 5 possible edge types:
      # 1. load => fwd
      # 2. fwd => fwd
      # 3. fwd => cpt
      # 4. cpt => fwd
      # 5. cpt => store
      if isinstance(src_node, LoadNode):
        write_lat = 0
      elif isinstance(src_node, ForwardNode):
        write_lat = 2
      elif isinstance(src_node, ComputeNode):
        write_lat = src_node.tensor.st_ref.lat
      else:
        raise util.InternalError('unexpected source node: %s' % repr(src_node))

      fifo = ir.FIFO(src_node, dst_node, depth=0, write_lat=write_lat)
      lets: List[ir.Let] = []
      if isinstance(src_node, LoadNode):
        expr = ir.DRAMRef(
            haoda_type=dst_node.tensor.haoda_type,
            dram=(src_node.bank,),
            var=dst_node.tensor.name,
            offset=(stencil.unroll_factor - 1 - dst_node.offset) //
            len(stencil.stmt_table[dst_node.tensor.name].dram),
        )
      elif isinstance(src_node, ForwardNode):
        if isinstance(dst_node, ComputeNode):
          dst = src_node.tensor.children[dst_node.tensor.name]
          src_name = src_node.tensor.name
          unroll_idx = dst_node.pe_id
          point = all_points[src_name][dst.name][src_node.offset][unroll_idx]
          idx = list(dst.ld_indices[src_name].values())[point].idx
          _logger.debug('%s%s referenced by <%s> @ unroll_idx=%d is %s',
                        src_name, util.idx2str(idx), dst.name, unroll_idx,
                        print_node(src_node))
          dst_node.fifo_map[src_name][idx] = fifo
        delay = stencil.reuse_buffer_lengths[src_node.tensor.name][
            src_node.offset]
        offset = src_node.offset - delay
        for parent in src_node.parents:  # fwd node has only 1 parent
          for fifo_r in parent.fifos:
            if fifo_r.edge == (parent, src_node):
              break
        if delay > 0:
          # TODO: build an index somewhere
          for let in src_node.lets:
            # pylint: disable=undefined-loop-variable
            if isinstance(let.expr, ir.DelayedRef) and let.expr.ref == fifo_r:
              var_name = let.name
              var_type = let.haoda_type
              break
          else:
            var_name = 'let_%d' % len(src_node.lets)
            # pylint: disable=undefined-loop-variable
            var_type = fifo_r.haoda_type
            lets.append(
                ir.Let(haoda_type=var_type,
                       name=var_name,
                       expr=ir.DelayedRef(delay=delay, ref=fifo_r)))
          expr = ir.Var(name=var_name, idx=[])
          expr.haoda_type = var_type
        else:
          expr = fifo_r  # pylint: disable=undefined-loop-variable
      elif isinstance(src_node, ComputeNode):

        def replace_refs_callback(obj, args):
          if isinstance(obj, ir.Ref):
            _logger.debug(
                'replace %s with %s',
                obj,
                # pylint: disable=cell-var-from-loop,undefined-loop-variable
                src_node.fifo_map[obj.name][obj.idx])
            # pylint: disable=cell-var-from-loop,undefined-loop-variable
            return src_node.fifo_map[obj.name][obj.idx]
          return obj

        _logger.debug('lets: %s', src_node.tensor.lets)
        lets = [_.visit(replace_refs_callback) for _ in src_node.tensor.lets]
        _logger.debug('replaced lets: %s', lets)
        _logger.debug('expr: %s', src_node.tensor.expr)
        expr = src_node.tensor.expr.visit(replace_refs_callback)
        _logger.debug('replaced expr: %s', expr)
        if isinstance(dst_node, StoreNode):
          dram_ref = ir.DRAMRef(
              haoda_type=src_node.tensor.haoda_type,
              dram=(dst_node.bank,),
              var=src_node.tensor.name,
              offset=src_node.pe_id //
              len(stencil.stmt_table[src_node.tensor.name].dram),
          )
          dst_node.lets.append(
              ir.Let(haoda_type=None, name=dram_ref, expr=fifo))
      else:
        raise util.InternalError('unexpected node of type %s' % type(src_node))

      src_node.exprs[fifo] = expr
      src_node.lets.extend(_ for _ in lets if _ not in src_node.lets)
      _logger.debug('fifo [%d]: %s%s => %s', fifo.depth, color_id(src_node),
                    '' if fifo.write_lat is None else ' ~%d' % fifo.write_lat,
                    color_id(dst_node))

  super_source.update_module_depths({})

  return super_source
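# Illustrative sketch with a hypothetical kernel, not part of the original
# module: for a single-stage, unroll_factor == 1 kernel such as
#   output b(0) = (a(-1) + a(0) + a(1)) / 3
# create_dataflow_graph produces roughly
#   super_source -> load a[bank0] -> forward a @<init offset> -> forward a @...
#   each forward a -> compute b #0 -> store b[bank0] -> super_sink
# with one ir.FIFO created per edge in the final wiring loop; edges out of
# LoadNodes get write_lat == 0, out of ForwardNodes write_lat == 2, and out of
# ComputeNodes the latency of the tensor's store reference.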
def chronological_tensors(self):
  """Computes the offsets and delays of tensors.

  Returns:
    A list of Tensor, in chronological order.
  """
  _logger.info('calculate tensor offsets')
  processing_queue = collections.deque(list(self.input_names))
  processed_tensors = set(self.input_names)
  chronological_tensors = list(map(self.tensors.get, self.input_names))
  for tensor in chronological_tensors:
    _logger.debug('tensor <%s> is at offset %d' %
                  (tensor.name, tensor.st_offset))
  _logger.debug('processing queue: %s', processing_queue)
  _logger.debug('processed_tensors: %s', processed_tensors)
  # breadth-first traversal from the inputs; a tensor is scheduled only after
  # all of its parents have been processed
  while processing_queue:
    tensor = self.tensors[processing_queue.popleft()]
    _logger.debug('inspecting tensor %s\'s children' % tensor.name)
    for child in tensor.children.values():
      if ({x.name for x in child.parents.values()} <= processed_tensors and
          child.name not in processed_tensors):
        # good, all inputs are processed
        # can determine offset of current tensor
        _logger.debug('input%s for tensor <%s> (i.e. %s) %s processed',
                      '' if len(child.parents) == 1 else 's', child.name,
                      ', '.join([x.name for x in child.parents.values()]),
                      'is' if len(child.parents) == 1 else 'are')
        stage_offset = soda_util.serialize(child.st_idx, self.tile_size)

        # synchronization check
        def sync(tensor, offset):
          if tensor is None:
            return offset
          _logger.debug('index of tensor <%s>: %s', tensor.name, tensor.st_idx)
          stage_offset = soda_util.serialize(tensor.st_idx, self.tile_size)
          _logger.debug('offset of tensor <%s>: %d', tensor.name, stage_offset)
          loads = visitor.get_load_dict(tensor)
          for name in loads:
            loads[name] = tuple(ref.idx for ref in loads[name])
          _logger.debug(
              'loads: %s', ', '.join(
                  '%s@%s' % (name, util.lst2str(map(util.idx2str, indices)))
                  for name, indices in loads.items()))
          for n in loads:
            loads[n] = soda_util.serialize_iter(loads[n], self.tile_size)
          for l in loads.values():
            l[0], l[-1] = (stage_offset - max(l), stage_offset - min(l))
            del l[1:-1]
            if len(l) == 1:
              l.append(l[-1])
          _logger.debug(
              'load offset range in tensor %s: %s', tensor.name, '{%s}' %
              (', '.join('%s: [%d:%d]' % (n, *v) for n, v in loads.items())))
          for parent in tensor.parents.values():
            tensor_distance = next(reversed(tensor.ld_offsets[parent.name]))
            _logger.debug('tensor distance: %s', tensor_distance)
            _logger.debug(
                'want to access tensor <%s> at offset [%d, %d] '
                'to generate tensor <%s> at offset %d', parent.name,
                offset + loads[parent.name][0],
                offset + loads[parent.name][-1], tensor.name, offset)
            tensor_offset = parent.st_delay + tensor_distance - stage_offset
            if offset < tensor_offset:
              _logger.debug(
                  'but tensor <%s> won\'t be available until offset %d',
                  parent.name, tensor_offset)
              offset = tensor_offset
              _logger.debug(
                  'need to access tensor <%s> at offset [%d, %d] '
                  'to generate tensor <%s> at offset %d', parent.name,
                  offset + loads[parent.name][0],
                  offset + loads[parent.name][-1], tensor.name, offset)
          return offset

        _logger.debug('intend to generate tensor <%s> at offset %d',
                      child.name, child.st_delay)
        synced_offset = sync(child, child.st_delay)
        _logger.debug('synced offset: %s', synced_offset)
        child.st_delay = synced_offset
        _logger.debug('decide to generate tensor <%s> at offset %d',
                      child.name, child.st_delay)

        # add delay
        for sibling in child.parents.values():
          delay = child.st_delay - (sibling.st_delay + list(
              child.ld_offsets[sibling.name].keys())[-1] - stage_offset)
          if delay > 0:
            _logger.debug(
                'tensor %s arrives at tensor <%s> at offset %d < %d; '
                'add %d delay', sibling.name, child.name,
                sibling.st_delay + next(reversed(
                    child.ld_offsets[sibling.name])) - stage_offset,
                child.st_delay, delay)
          else:
            _logger.debug(
                'tensor %s arrives at tensor <%s> at offset %d = %d; good',
                sibling.name, child.name,
                sibling.st_delay + next(reversed(
                    child.ld_offsets[sibling.name])) - stage_offset,
                child.st_delay)
          child.ld_delays[sibling.name] = max(delay, 0)
          _logger.debug('set delay of |%s <- %s| to %d' %
                        (child.name, sibling.name,
                         child.ld_delays[sibling.name]))

        processing_queue.append(child.name)
        processed_tensors.add(child.name)
        chronological_tensors.append(child)
      else:
        for parent in tensor.parents.values():
          if parent.name not in processed_tensors:
            _logger.debug('tensor %s requires tensor <%s> as an input',
                          tensor.name, parent.name)
            _logger.debug('but tensor <%s> isn\'t processed yet', parent.name)
            _logger.debug('add %s to scheduling queue', parent.name)
            processing_queue.append(parent.name)

  _logger.debug('tensors in insertion order: [%s]',
                ', '.join(map(str, self.tensors)))
  _logger.debug('tensors in chronological order: [%s]',
                ', '.join(t.name for t in chronological_tensors))

  for tensor in self.tensors.values():
    for name, indices in tensor.ld_indices.items():
      _logger.debug('stage index: %s@%s <- %s@%s', tensor.name,
                    util.idx2str(tensor.st_idx), name,
                    util.lst2str(util.idx2str(idx) for idx in indices))

  for tensor in self.tensors.values():
    if tensor.is_input():
      continue
    _logger.debug('stage expr: %s = %s', tensor.st_ref, tensor.expr)

  for tensor in self.tensors.values():
    for name, offsets in tensor.ld_offsets.items():
      _logger.debug('stage offset: %s@%d <- %s@%s', tensor.name,
                    soda_util.serialize(tensor.st_idx, self.tile_size), name,
                    util.lst2str(offsets))

  for tensor in self.tensors.values():
    for name, delay in tensor.ld_delays.items():
      _logger.debug('stage delay: %s <- %s delayed %d' %
                    (tensor.name, name, delay))

  return chronological_tensors
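# Illustrative sketch with a hypothetical pipeline, not part of the original
# module: for input `a` and stages `b = f(a)` and `c = g(a, b)`, the traversal
# above yields chronological_tensors == [a, b, c].  `c` is only scheduled once
# both of its parents (`a` and `b`) are in processed_tensors; sync() then bumps
# c.st_delay if either parent is not yet available at the intended offset, and
# ld_delays records the extra buffering needed on the earlier-arriving parent.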