예제 #1
0
파일: tensor.py 프로젝트: UCLA-VAST/soda
   def __str__(self):
       return '''Tensor
 {haoda_type}: {name} = {expr}
 store: {st_ref}
 parents: {parents}
 children: {children}'''.format(name=self.name,
                                haoda_type=self.haoda_type,
                                expr=self.expr,
                                parents=util.idx2str(self.parents),
                                children=util.idx2str(self.children),
                                st_ref=str(self.st_ref))
예제 #2
0
def _get_points(tile_size, tensor, unroll_factor):
    """Generates offset-to-point mapping for a Tensor.

  Generates a mapping which can be used to determine the accessed point index
  from the offset for a Tensor, under the given tile size and unroll factor.

  Args:
    tile_size: An iterable representing the tile size in each dimension.
    tensor: A Tensor to which the mapping belongs.
    unroll_factor: An int representing the unroll factor.

  Returns:
    A dict of name str to a dict of offset to a dict of unroll index to
    point index.
  """

    all_points = {}  # {name: {offset: {unroll_idx: point_idx}}}
    for child in tensor.children.values():
        all_points[child.name] = {}
        offsets = child.ld_offsets[tensor.name]
        for unroll_idx in range(unroll_factor):
            for idx, offset in enumerate(offsets):
                all_points[child.name].setdefault(
                    max(offsets) - offset + child.ld_delays[tensor.name] +
                    unroll_idx, {})[unroll_factor - 1 - unroll_idx] = idx
    for child in tensor.children.values():
        for offset, points in all_points[child.name].items():
            for unroll_idx, point in points.items():
                _logger.debug(
                    '%s <- %s @ offset=%d <=> %s @ unroll_idx=%d', child.name,
                    tensor.name, offset,
                    util.idx2str(
                        list(child.ld_indices[tensor.name].values())
                        [point].idx), unroll_idx)
    return all_points
예제 #3
0
def get_overall_stencil_window(input_tensor, output_tensor):
    """Computes the overall stencil window of an output tensor over its inputs.

    The window is the set of points of `input_tensor` that `output_tensor`
    reads, either directly or through intermediate parent tensors, expressed
    relative to the store index of `output_tensor` (i.e. the store index is
    normalized to 0).

    Results for a single input tensor are memoized in
    `_overall_stencil_window_cache`, keyed by the `id()` of both tensors.
    Results for an iterable of input tensors are not cached themselves.

    Args:
      input_tensor: A tensor, or an iterable of tensors. For an iterable, the
        result is the union of the windows of all of its members; an empty
        iterable gives an empty window.
      output_tensor: The tensor whose loads are traced back to `input_tensor`.

    Returns:
      A sorted tuple of points, each point being a tuple.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc. Imported locally so that this function does not depend
    # on how the enclosing module imports collections.
    from collections.abc import Iterable

    if isinstance(input_tensor, Iterable):
        # set().union(...) rather than set.union(...): the latter raises
        # TypeError when there is nothing to unpack.
        all_points = tuple(
            sorted(set().union(
                *(get_overall_stencil_window(tensor, output_tensor)
                  for tensor in input_tensor))))
        _logger.debug(
            'overall stencil window of %s (%s) <- {%s} is %s (%d points)',
            output_tensor.name, ', '.join(['0'] * len(output_tensor.st_idx)),
            ', '.join(tensor.name for tensor in input_tensor), all_points,
            len(all_points))
        return all_points

    cache_key = (id(input_tensor), id(output_tensor))
    if cache_key in _overall_stencil_window_cache:
        return _overall_stencil_window_cache[cache_key]
    _logger.debug('get overall stencil window of %s <- %s', output_tensor.name,
                  input_tensor.name)

    # normalize store index to 0
    st_idx = output_tensor.st_idx
    all_points = set()
    for name, points in output_tensor.ld_indices.items():
        _logger.debug('%s@%s <- %s', output_tensor.name,
                      util.idx2str(output_tensor.st_idx),
                      util.idx2str(points.values()))
        if name != input_tensor.name:
            # Indirect access: compose every load of the intermediate tensor
            # with that tensor's own window over input_tensor.
            recursive_points = get_overall_stencil_window(
                input_tensor, output_tensor.parents[name])
            _logger.debug('recursive points: %s',
                          util.idx2str(recursive_points))
            all_points.update(
                tuple(a + b - c
                      for a, b, c in zip(recursive_point, point, st_idx))
                for point in points
                for recursive_point in recursive_points)
        else:
            # Direct access: offset of each load relative to the store index.
            all_points.update(
                tuple(map(operator.sub, point, st_idx)) for point in points)

    all_points = tuple(sorted(all_points))
    _logger.debug('overall stencil window of %s (%s) <- %s is %s (%d points)',
                  output_tensor.name,
                  ', '.join(['0'] * len(output_tensor.st_idx)),
                  input_tensor.name, all_points, len(all_points))
    _overall_stencil_window_cache[cache_key] = all_points
    return all_points
예제 #4
0
파일: core.py 프로젝트: Ruola/haoda
 def __repr__(self):
     """Returns the class name followed by the loads, lets and exprs.

     Each of the three collections is rendered with util.idx2str.
     """
     rendered = (
         type(self).__name__,
         util.idx2str(self.loads),
         util.idx2str(self.lets),
         util.idx2str(self.exprs),
     )
     return '%s(loads: %s, lets: %s, exprs: %s)' % rendered
예제 #5
0
파일: dataflow.py 프로젝트: UCLA-VAST/soda
def create_dataflow_graph(stencil):
    """Builds the dataflow graph of a stencil and returns its super source.

    The graph is made of load, forward, compute and store nodes hanging off a
    SuperSourceNode. After the nodes are wired together, one ir.FIFO is created
    for every edge between valid nodes, and the expression (plus any let
    bindings) that the source node writes into that FIFO is recorded on the
    source node.

    Args:
      stencil: The stencil description. Among others, its input_stmts,
        output_stmts, tensors, unroll_factor and replication_factor attributes
        are read here.

    Returns:
      The SuperSourceNode of the graph, after update_module_depths({}) has
      been called on it.

    Raises:
      util.InternalError: If an edge starts at a node that is not a load,
        forward or compute node.
    """
    chronological_tensors = stencil.chronological_tensors
    super_source = SuperSourceNode(
        fwd_nodes={},
        cpt_nodes={},
        super_sink=SuperSinkNode(),
    )

    # One load (store) node per DRAM bank of every input (output) statement.
    load_nodes = {
        stmt.name:
        tuple(LoadNode(var=stmt.name, bank=bank) for bank in stmt.dram)
        for stmt in stencil.input_stmts
    }
    store_nodes = {
        stmt.name:
        tuple(StoreNode(var=stmt.name, bank=bank) for bank in stmt.dram)
        for stmt in stencil.output_stmts
    }

    # Every load node hangs off the super source; every store node feeds the
    # super sink.
    for mem_node in itertools.chain(*load_nodes.values()):
        super_source.add_child(mem_node)
    for mem_node in itertools.chain(*store_nodes.values()):
        mem_node.add_child(super_source.super_sink)

    def color_id(node):
        # Short label for a node, wrapped in ANSI escape sequences that differ
        # per node type; used in the debug messages below.
        if isinstance(node, LoadNode):
            return f'\033[33mload {node.var}[bank{node.bank}]\033[0m'
        if isinstance(node, StoreNode):
            return f'\033[36mstore {node.var}[bank{node.bank}]\033[0m'
        if isinstance(node, ForwardNode):
            return f'\033[32mforward {node.tensor.name} @{node.offset}\033[0m'
        if isinstance(node, ComputeNode):
            return f'\033[31mcompute {node.tensor.name} #{node.pe_id}\033[0m'
        return 'unknown node'

    def color_attr(node):
        # Renders the attributes of a node; parents/children are shown via
        # color_id, and the super source's parents / super sink's children are
        # skipped.
        result = []
        for k, v in node.__dict__.items():
            if (node.__class__, k) in ((SuperSourceNode, 'parents'),
                                       (SuperSinkNode, 'children')):
                continue
            if k in ('parents', 'children'):
                result.append('%s: [%s]' % (k, ', '.join(map(color_id, v))))
            else:
                result.append('%s: %s' % (k, repr(v)))
        return '{%s}' % ', '.join(result)

    def color_print(node):
        # Verbose label (id plus attributes); not called within this function.
        return '%s: %s' % (color_id(node), color_attr(node))

    print_node = color_id

    if stencil.replication_factor > 1:
        # Replicated design: node wiring is driven by the replicated variants
        # of next_fifo / all_points / reuse_buffers, and only PE 0 is used.
        # NOTE(review): the FIFO-creation loop at the end of this function
        # reads `all_points`, which is only assigned in the `else` branch
        # below -- confirm whether fwd => cpt edges can occur on this path.
        replicated_next_fifo = stencil.get_replicated_next_fifo()
        replicated_all_points = stencil.get_replicated_all_points()
        replicated_reuse_buffers = stencil.get_replicated_reuse_buffers()

        def add_fwd_nodes(src_name):
            # Creates the forward nodes of tensor `src_name` (one per distinct
            # offset) and connects them to their producers.
            dsts = replicated_all_points[src_name]
            reuse_buffer = replicated_reuse_buffers[src_name][1:]
            nodes_to_add = []
            for dst_point_dicts in dsts.values():
                for offset in dst_point_dicts:
                    if (src_name, offset) in super_source.fwd_nodes:
                        continue
                    fwd_node = ForwardNode(
                        tensor=stencil.tensors[src_name],
                        offset=offset,
                        depth=stencil.get_replicated_reuse_buffer_length(
                            src_name, offset))
                    _logger.debug('create %s', print_node(fwd_node))
                    # Offsets whose reuse-buffer entry has start == end are
                    # fed by a load node or compute node rather than by
                    # another forward node.
                    init_offsets = [
                        start for start, end in reuse_buffer if start == end
                    ]
                    if offset in init_offsets:
                        if src_name in [stencil.input.name]:
                            load_node_count = len(load_nodes[src_name])
                            load_nodes[src_name][
                                load_node_count - 1 -
                                offset % load_node_count].add_child(fwd_node)
                        else:
                            (super_source.cpt_nodes[(src_name,
                                                     0)].add_child(fwd_node))
                    super_source.fwd_nodes[(src_name, offset)] = fwd_node
                    if offset in replicated_next_fifo[src_name]:
                        # The successor forward node may not exist yet, so the
                        # fwd => fwd edge is added after all nodes are created.
                        nodes_to_add.append(
                            (fwd_node,
                             (src_name,
                              replicated_next_fifo[src_name][offset])))
            for src_node, key in nodes_to_add:
                src_node.add_child(super_source.fwd_nodes[key])

        add_fwd_nodes(stencil.input.name)

        for stage in stencil.get_stages_chronologically():
            # NOTE(review): ComputeNode is built with `stage=` here but with
            # `tensor=` in the non-replicated branch, and later code reads
            # `.tensor` on compute nodes -- confirm ComputeNode accepts both.
            cpt_node = ComputeNode(stage=stage, pe_id=0)
            _logger.debug('create %s', print_node(cpt_node))
            super_source.cpt_nodes[(stage.name, 0)] = cpt_node
            for input_name, input_window in stage.window.items():
                for i in range(len(input_window)):
                    # Find the offset whose point index is i.
                    offset = next(offset for offset, points in (
                        replicated_all_points[input_name][stage.name].items())
                                  if points == i)
                    fwd_node = super_source.fwd_nodes[(input_name, offset)]
                    _logger.debug('  access %s', print_node(fwd_node))
                    fwd_node.add_child(cpt_node)
            if stage.is_output():
                super_source.cpt_nodes[stage.name,
                                       0].add_child(store_nodes[stage.name][0])
            else:
                add_fwd_nodes(stage.name)

    else:
        next_fifo = stencil.next_fifo
        all_points = stencil.all_points
        reuse_buffers = stencil.reuse_buffers

        def add_fwd_nodes(src_name):
            # Creates the forward nodes of tensor `src_name` (one per distinct
            # offset) and connects them to their producers.
            dsts = all_points[src_name]
            reuse_buffer = reuse_buffers[src_name][1:]
            nodes_to_add = []
            for dst_point_dicts in dsts.values():
                for offset in dst_point_dicts:
                    if (src_name, offset) in super_source.fwd_nodes:
                        continue
                    fwd_node = ForwardNode(tensor=stencil.tensors[src_name],
                                           offset=offset)
                    #depth=stencil.get_reuse_buffer_length(src_name, offset))
                    _logger.debug('create %s', print_node(fwd_node))
                    # init_offsets is the start of each reuse chain
                    init_offsets = [
                        next(end for start, end in reuse_buffer
                             if start == unroll_idx) for unroll_idx in
                        reversed(range(stencil.unroll_factor))
                    ]
                    _logger.debug('reuse buffer: %s', reuse_buffer)
                    _logger.debug('init offsets: %s', init_offsets)
                    if offset in init_offsets:
                        if src_name in stencil.input_names:
                            # fwd from external input
                            load_node_count = len(load_nodes[src_name])
                            load_nodes[src_name][
                                load_node_count - 1 -
                                offset % load_node_count].add_child(fwd_node)
                        else:
                            # fwd from output of last stage
                            # tensor name and offset are used to find the cpt node
                            cpt_offset = next(
                                unroll_idx
                                for unroll_idx in range(stencil.unroll_factor)
                                if init_offsets[unroll_idx] == offset)
                            cpt_node = super_source.cpt_nodes[(src_name,
                                                               cpt_offset)]
                            cpt_node.add_child(fwd_node)
                    super_source.fwd_nodes[(src_name, offset)] = fwd_node
                    if offset in next_fifo[src_name]:
                        # The successor forward node may not exist yet, so the
                        # fwd => fwd edge is added after all nodes are created.
                        nodes_to_add.append(
                            (fwd_node, (src_name,
                                        next_fifo[src_name][offset])))
            for src_node, key in nodes_to_add:
                # fwd from another fwd node
                src_node.add_child(super_source.fwd_nodes[key])

        for input_name in stencil.input_names:
            add_fwd_nodes(input_name)

        for tensor in chronological_tensors:
            if tensor.is_input():
                continue
            # One compute node per unroll index; pe_id counts down from
            # unroll_factor - 1 to 0.
            for unroll_index in range(stencil.unroll_factor):
                pe_id = stencil.unroll_factor - 1 - unroll_index
                cpt_node = ComputeNode(tensor=tensor, pe_id=pe_id)
                _logger.debug('create %s', print_node(cpt_node))
                super_source.cpt_nodes[(tensor.name, pe_id)] = cpt_node
                for input_name, input_window in tensor.ld_indices.items():
                    for i in range(len(input_window)):
                        # Find the offset at which this PE reads point i.
                        offset = next(
                            offset
                            for offset, points in all_points[input_name][
                                tensor.name].items()
                            if pe_id in points and points[pe_id] == i)
                        fwd_node = super_source.fwd_nodes[(input_name, offset)]
                        _logger.debug('  access %s', print_node(fwd_node))
                        fwd_node.add_child(cpt_node)
            if tensor.is_output():
                # Compute nodes are spread over the store nodes (banks) by
                # pe_id modulo the number of store nodes.
                for pe_id in range(stencil.unroll_factor):
                    super_source.cpt_nodes[tensor.name, pe_id].add_child(
                        store_nodes[tensor.name][pe_id % len(
                            store_nodes[tensor.name])])
            else:
                add_fwd_nodes(tensor.name)

    # Create one FIFO per edge and record what the source node writes to it.
    # pylint: disable=too-many-nested-blocks
    for src_node in super_source.tpo_valid_node_gen():
        for dst_node in filter(is_valid_node, src_node.children):
            # 5 possible edge types:
            # 1. load => fwd
            # 2. fwd => fwd
            # 3. fwd => cpt
            # 4. cpt => fwd
            # 5. cpt => store
            if isinstance(src_node, LoadNode):
                write_lat = 0
            elif isinstance(src_node, ForwardNode):
                write_lat = 2
            elif isinstance(src_node, ComputeNode):
                write_lat = src_node.tensor.st_ref.lat
            else:
                raise util.InternalError('unexpected source node: %s' %
                                         repr(src_node))

            fifo = ir.FIFO(src_node, dst_node, depth=0, write_lat=write_lat)
            lets: List[ir.Let] = []
            if isinstance(src_node, LoadNode):
                # load => fwd: the value written is a DRAM reference.
                expr = ir.DRAMRef(
                    haoda_type=dst_node.tensor.haoda_type,
                    dram=(src_node.bank, ),
                    var=dst_node.tensor.name,
                    offset=(stencil.unroll_factor - 1 - dst_node.offset) //
                    len(stencil.stmt_table[dst_node.tensor.name].dram),
                )
            elif isinstance(src_node, ForwardNode):
                if isinstance(dst_node, ComputeNode):
                    # fwd => cpt: register this FIFO in the compute node's
                    # fifo_map under the tensor name and index it stands for.
                    dst = src_node.tensor.children[dst_node.tensor.name]
                    src_name = src_node.tensor.name
                    unroll_idx = dst_node.pe_id
                    point = all_points[src_name][dst.name][
                        src_node.offset][unroll_idx]
                    idx = list(dst.ld_indices[src_name].values())[point].idx
                    _logger.debug(
                        '%s%s referenced by <%s> @ unroll_idx=%d is %s',
                        src_name, util.idx2str(idx), dst.name, unroll_idx,
                        print_node(src_node))
                    dst_node.fifo_map[src_name][idx] = fifo
                delay = stencil.reuse_buffer_lengths[src_node.tensor.name]\
                                                    [src_node.offset]
                offset = src_node.offset - delay
                # Locate the FIFO on the edge parent => src_node. `fifo_r`
                # deliberately keeps its value after these loops end.
                for parent in src_node.parents:  # fwd node has only 1 parent
                    for fifo_r in parent.fifos:
                        if fifo_r.edge == (parent, src_node):
                            break
                if delay > 0:
                    # Reuse an existing let that already delays the incoming
                    # FIFO; otherwise (for-else) introduce a new one.
                    # TODO: build an index somewhere
                    for let in src_node.lets:
                        # pylint: disable=undefined-loop-variable
                        if isinstance(
                                let.expr,
                                ir.DelayedRef) and let.expr.ref == fifo_r:
                            var_name = let.name
                            var_type = let.haoda_type
                            break
                    else:
                        var_name = 'let_%d' % len(src_node.lets)
                        # pylint: disable=undefined-loop-variable
                        var_type = fifo_r.haoda_type
                        lets.append(
                            ir.Let(haoda_type=var_type,
                                   name=var_name,
                                   expr=ir.DelayedRef(delay=delay,
                                                      ref=fifo_r)))
                    expr = ir.Var(name=var_name, idx=[])
                    expr.haoda_type = var_type
                else:
                    # No delay: forward the incoming FIFO as is.
                    expr = fifo_r  # pylint: disable=undefined-loop-variable
            elif isinstance(src_node, ComputeNode):

                def replace_refs_callback(obj, args):
                    # Substitutes every ir.Ref with the FIFO registered for it
                    # in the compute node's fifo_map; other objects pass
                    # through unchanged.
                    if isinstance(obj, ir.Ref):
                        _logger.debug(
                            'replace %s with %s',
                            obj,
                            # pylint: disable=cell-var-from-loop,undefined-loop-variable
                            src_node.fifo_map[obj.name][obj.idx])
                        # pylint: disable=cell-var-from-loop,undefined-loop-variable
                        return src_node.fifo_map[obj.name][obj.idx]
                    return obj

                _logger.debug('lets: %s', src_node.tensor.lets)
                lets = [
                    _.visit(replace_refs_callback)
                    for _ in src_node.tensor.lets
                ]
                _logger.debug('replaced lets: %s', lets)
                _logger.debug('expr: %s', src_node.tensor.expr)
                expr = src_node.tensor.expr.visit(replace_refs_callback)
                _logger.debug('replaced expr: %s', expr)
                if isinstance(dst_node, StoreNode):
                    # cpt => store: the store node gets a let whose name is
                    # the DRAM reference and whose expression is this FIFO.
                    dram_ref = ir.DRAMRef(
                        haoda_type=src_node.tensor.haoda_type,
                        dram=(dst_node.bank, ),
                        var=src_node.tensor.name,
                        offset=(src_node.pe_id) //
                        len(stencil.stmt_table[src_node.tensor.name].dram),
                    )
                    dst_node.lets.append(
                        ir.Let(haoda_type=None, name=dram_ref, expr=fifo))
            else:
                raise util.InternalError('unexpected node of type %s' %
                                         type(src_node))

            src_node.exprs[fifo] = expr
            # Only add lets the source node does not already have.
            src_node.lets.extend(_ for _ in lets if _ not in src_node.lets)
            _logger.debug(
                'fifo [%d]: %s%s => %s', fifo.depth, color_id(src_node),
                '' if fifo.write_lat is None else ' ~%d' % fifo.write_lat,
                color_id(dst_node))

    super_source.update_module_depths({})

    return super_source
예제 #6
0
    def chronological_tensors(self):
        """Computes the offsets of tensors.

        Starting from the input tensors, tensors are visited breadth-first
        through their children. A child is scheduled once all of its parents
        have been processed. Scheduling a child has side effects: its
        `st_delay` is raised to the synced offset, and `ld_delays` is filled
        in with the non-negative delay for each of its parents.

        Returns:
          A list of Tensor, in chronological order.
        """
        _logger.info('calculate tensor offsets')
        # Inputs need no scheduling: they start out queued and processed.
        processing_queue = collections.deque(list(self.input_names))
        processed_tensors = set(self.input_names)
        chronological_tensors = list(map(self.tensors.get, self.input_names))
        for tensor in chronological_tensors:
            _logger.debug('tensor <%s> is at offset %d' %
                          (tensor.name, tensor.st_offset))
        _logger.debug('processing queue: %s', processing_queue)
        _logger.debug('processed_tensors: %s', processed_tensors)
        while processing_queue:
            tensor = self.tensors[processing_queue.popleft()]
            _logger.debug('inspecting tensor %s\'s children' % tensor.name)
            for child in tensor.children.values():
                # Schedule the child only if every parent is processed and the
                # child itself is not.
                if ({x.name
                     for x in child.parents.values()} <= processed_tensors
                        and child.name not in processed_tensors):
                    # good, all inputs are processed
                    # can determine offset of current tensor
                    _logger.debug(
                        'input%s for tensor <%s> (i.e. %s) %s processed',
                        '' if len(child.parents) == 1 else 's', child.name,
                        ', '.join([x.name for x in child.parents.values()]),
                        'is' if len(child.parents) == 1 else 'are')
                    # presumably the store index flattened into a linear
                    # offset under the tile size -- confirm against soda_util
                    stage_offset = soda_util.serialize(child.st_idx,
                                                       self.tile_size)

                    # synchronization check
                    def sync(tensor, offset):
                        # Returns `offset`, raised as needed so that it is not
                        # smaller than parent.st_delay + (last load offset) -
                        # stage_offset for any parent of `tensor`.
                        if tensor is None:
                            return offset
                        _logger.debug('index of tensor <%s>: %s', tensor.name,
                                      tensor.st_idx)
                        stage_offset = soda_util.serialize(
                            tensor.st_idx, self.tile_size)
                        _logger.debug('offset of tensor <%s>: %d', tensor.name,
                                      stage_offset)
                        # `loads` is only used to build the debug messages
                        # below; it does not influence the returned offset.
                        loads = visitor.get_load_dict(tensor)
                        for name in loads:
                            loads[name] = tuple(ref.idx for ref in loads[name])
                        _logger.debug(
                            'loads: %s', ', '.join(
                                '%s@%s' %
                                (name,
                                 util.lst2str(map(util.idx2str, indices)))
                                for name, indices in loads.items()))
                        for n in loads:
                            loads[n] = soda_util.serialize_iter(
                                loads[n], self.tile_size)
                        # Collapse each list of load offsets in place into a
                        # two-element [low, high] range relative to the stage
                        # offset.
                        for l in loads.values():
                            l[0], l[-1] = (stage_offset - max(l),
                                           stage_offset - min(l))
                            del l[1:-1]
                            if len(l) == 1:
                                l.append(l[-1])
                        _logger.debug(
                            'load offset range in tensor %s: %s', tensor.name,
                            '{%s}' % (', '.join('%s: [%d:%d]' % (n, *v)
                                                for n, v in loads.items())))
                        for parent in tensor.parents.values():
                            # Last key of the load offsets of this parent.
                            tensor_distance = next(
                                reversed(tensor.ld_offsets[parent.name]))
                            _logger.debug('tensor distance: %s',
                                          tensor_distance)
                            _logger.debug(
                                'want to access tensor <%s> at offset [%d, %d] '
                                'to generate tensor <%s> at offset %d',
                                parent.name, offset + loads[parent.name][0],
                                offset + loads[parent.name][-1], tensor.name,
                                offset)
                            tensor_offset = (parent.st_delay +
                                             tensor_distance - stage_offset)
                            if offset < tensor_offset:
                                _logger.debug(
                                    'but tensor <%s> won\'t be available until offset %d',
                                    parent.name, tensor_offset)
                                offset = tensor_offset
                                _logger.debug(
                                    'need to access tensor <%s> at offset [%d, %d] '
                                    'to generate tensor <%s> at offset %d',
                                    parent.name,
                                    offset + loads[parent.name][0],
                                    offset + loads[parent.name][-1],
                                    tensor.name, offset)
                        return offset

                    _logger.debug(
                        'intend to generate tensor <%s> at offset %d',
                        child.name, child.st_delay)
                    synced_offset = sync(child, child.st_delay)
                    _logger.debug('synced offset: %s', synced_offset)
                    child.st_delay = synced_offset
                    _logger.debug(
                        'decide to generate tensor <%s> at offset %d',
                        child.name, child.st_delay)

                    # add delay
                    # A parent whose data arrives before the child's synced
                    # offset gets a positive load delay; otherwise the delay
                    # is 0.
                    for sibling in child.parents.values():
                        delay = child.st_delay - (sibling.st_delay + list(
                            child.ld_offsets[sibling.name].keys())[-1] -
                                                  stage_offset)
                        if delay > 0:
                            _logger.debug(
                                'tensor %s arrives at tensor <%s> at offset %d < %d; '
                                'add %d delay', sibling.name, child.name,
                                sibling.st_delay + next(
                                    reversed(child.ld_offsets[sibling.name])) -
                                stage_offset, child.st_delay, delay)
                        else:
                            _logger.debug(
                                'tensor %s arrives at tensor <%s> at offset %d = %d; good',
                                sibling.name, child.name,
                                sibling.st_delay + next(
                                    reversed(child.ld_offsets[sibling.name])) -
                                stage_offset, child.st_delay)
                        child.ld_delays[sibling.name] = max(delay, 0)
                        _logger.debug('set delay of |%s <- %s| to %d' %
                                      (child.name, sibling.name,
                                       child.ld_delays[sibling.name]))

                    processing_queue.append(child.name)
                    processed_tensors.add(child.name)
                    chronological_tensors.append(child)
                else:
                    # NOTE(review): this iterates the parents of `tensor` (the
                    # tensor just popped from the queue), not of `child` --
                    # confirm that this is intended.
                    for parent in tensor.parents.values():
                        if parent.name not in processed_tensors:
                            _logger.debug(
                                'tensor %s requires tensor <%s> as an input',
                                tensor.name, parent.name)
                            _logger.debug(
                                'but tensor <%s> isn\'t processed yet',
                                parent.name)
                            _logger.debug('add %s to scheduling queue',
                                          parent.name)
                            processing_queue.append(parent.name)

        # Everything below only logs the resulting schedule.
        _logger.debug('tensors in insertion order: [%s]',
                      ', '.join(map(str, self.tensors)))
        _logger.debug('tensors in chronological order: [%s]',
                      ', '.join(t.name for t in chronological_tensors))

        for tensor in self.tensors.values():
            for name, indices in tensor.ld_indices.items():
                _logger.debug(
                    'stage index: %s@%s <- %s@%s', tensor.name,
                    util.idx2str(tensor.st_idx), name,
                    util.lst2str(util.idx2str(idx) for idx in indices))
        for tensor in self.tensors.values():
            if tensor.is_input():
                continue
            _logger.debug('stage expr: %s = %s', tensor.st_ref, tensor.expr)
        for tensor in self.tensors.values():
            for name, offsets in tensor.ld_offsets.items():
                _logger.debug(
                    'stage offset: %s@%d <- %s@%s', tensor.name,
                    soda_util.serialize(tensor.st_idx, self.tile_size), name,
                    util.lst2str(offsets))
        for tensor in self.tensors.values():
            for name, delay in tensor.ld_delays.items():
                _logger.debug('stage delay: %s <- %s delayed %d' %
                              (tensor.name, name, delay))

        return chronological_tensors