def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int,
                   function_stream: CodeIOStream, callsite_stream: CodeIOStream):
    """
    Emits the C++ code for a map scope whose innermost dimension is an SVE loop.

    Every map parameter except the last one is written as a plain C++
    ``for`` loop. The last parameter is written as a ``do { ... } while``
    loop driven by a predicate named ``__pg_<param>``. The body of the scope
    is generated by the dispatcher in between the loop headers and footers.

    As a side effect, ``self.counter_type`` is set to the integer type used
    for the SVE loop counter.

    :param sdfg: SDFG that contains the scope.
    :param scope: Subgraph view of the scope; its first source node is
                  taken as the map entry.
    :param state_id: Index of the state within ``sdfg``.
    :param function_stream: Stream forwarded to the subgraph dispatcher.
    :param callsite_stream: Stream that receives the generated loop code.
    :raise KeyError: If the selected data type is not 1, 2, 4 or 8 bytes wide.
    """
    entry_node = scope.source_nodes()[0]
    map_obj = entry_node.map

    # The element width of the loop is taken from the data types of the
    # SDFG's arrays.
    # NOTE(review): with several distinct dtypes the pick depends on set
    # iteration order -- confirm this is intended.
    array_dtypes = list(set([sdfg.arrays[a].dtype for a in sdfg.arrays]))
    # Edge case: without any arrays there is nothing to pick from, so fall
    # back to a 64-bit counter instead of failing with an IndexError.
    loop_type = array_dtypes[0] if array_dtypes else dace.int64
    ltype_size = loop_type.bytes

    # Work on a copy so that the shared dace.int64 object is not modified
    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    self.counter_type = {
        1: dace.int8,
        2: dace.int16,
        4: dace.int32,
        8: long_type,
    }[ltype_size]

    callsite_stream.write('{')

    # Define all input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node):
        if e.data.data != e.dst_conn:
            callsite_stream.write(
                self.cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn,
                                                   e.dst.in_connectors[e.dst_conn]),
                sdfg, state_id, entry_node)

    # We only create an SVE do-while in the innermost loop
    for param, rng in zip(map_obj.params, map_obj.range):
        begin, end, stride = (sym2cpp(r) for r in rng)

        # One variable scope per loop level; closed again in the footer loop
        self.dispatcher.defined_vars.enter_scope(sdfg)

        # Check whether we are in the innermost loop
        if param != map_obj.params[-1]:
            # Default C++ for-loop
            callsite_stream.write(
                f'for(auto {param} = {begin}; {param} <= {end}; {param} += {stride}) {{')
        else:
            # Generate the SVE loop header

            # The name of our loop predicate is always __pg_{param}
            self.dispatcher.defined_vars.add('__pg_' + param, DefinedType.Scalar, 'svbool_t')

            # Declare our counting variable (e.g. i) and precompute the loop
            # predicate for our range
            callsite_stream.write(
                f'{self.counter_type} {param} = {begin}; '
                f'svbool_t __pg_{param} = svwhilele_b{ltype_size * 8}({param}, ({self.counter_type}) {end}); '
                f'do {{',
                sdfg, state_id, entry_node)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Close the loops from above (in reverse)
    for param, rng in zip(reversed(map_obj.params), reversed(map_obj.range)):
        # The innermost loop is SVE and needs a special while-footer,
        # otherwise we just add the closing bracket
        if param != map_obj.params[-1]:
            # Close the default C++ for-loop
            callsite_stream.write('}')
        else:
            # Generate the SVE loop footer
            _, end, stride = (sym2cpp(r) for r in rng)

            # Increase the counting variable (according to the number of
            # processed elements), then recompute the loop predicate and
            # test for it
            callsite_stream.write(
                f'{param} += svcntp_b{ltype_size * 8}(__pg_{param}, __pg_{param}) * {stride}; '
                f'__pg_{param} = svwhilele_b{ltype_size * 8}({param}, ({self.counter_type}) {end}); '
                f'}} while(svptest_any(svptrue_b{ltype_size * 8}(), __pg_{param}));',
                sdfg, state_id, entry_node)

        # Matches the enter_scope issued for this loop level in the header loop
        self.dispatcher.defined_vars.exit_scope(sdfg)

    callsite_stream.write('}')
def concurrent_subgraphs(graph):
    """
    Splits a state or scope subgraph into node groups that can run concurrently.

    Starting from the source nodes of ``graph``, every start node is expanded
    to the set of nodes reachable from it. Sets that share at least one node
    are fused, so the returned groups are pairwise disjoint.

    :param graph: The SDFGState or ScopeSubgraphView to partition.
    :return: A list of ScopeSubgraphView objects, one per group. The nodes of
             each view follow the order of ``graph.nodes()``.
    :raise TypeError: If ``graph`` is neither an SDFGState nor a
                      ScopeSubgraphView.
    """
    from dace.sdfg.scope import ScopeSubgraphView

    if not isinstance(graph, (SDFGState, ScopeSubgraphView)):
        raise TypeError(
            "Expected SDFGState or ScopeSubgraphView, got: {}".format(
                type(graph).__name__))

    # Maps each start node to the nodes known to belong to its component
    seeds = collections.OrderedDict()
    for source in graph.source_nodes():
        if not isinstance(source, nd.AccessNode):
            # The source node itself is the first control or compute node
            seeds[source] = {source}
            continue

        # An access node may feed several concurrent components, so every
        # consumer of it becomes (or extends) a start node. A start node may
        # in turn read from several source arrays.
        for edge in graph.out_edges(source):
            consumer = edge.dst
            seeds.setdefault(consumer, {source, consumer}).add(source)

    groups = []  # Each entry is the node set of one concurrent subgraph
    for start, seed_nodes in seeds.items():
        # Collect everything reachable from this start node
        reached = set()
        pending = [start]
        while pending:
            current = pending.pop()
            if current in reached:
                continue
            reached.add(current)
            pending.extend(e.dst for e in graph.out_edges(current)
                           if e.dst not in reached)

        # Indices of previously found groups that share nodes with this one
        overlapping = [
            idx for idx, group in enumerate(groups)
            if not group.isdisjoint(reached)
        ]

        # Fuse this component with all overlapping groups. Popping from the
        # back keeps the remaining indices valid. Without any overlap this
        # simply appends a new group.
        merged = reached | seed_nodes
        for idx in reversed(overlapping):
            merged |= groups.pop(idx)
        groups.append(merged)

    # Wrap every group in a view, keeping the graph's original node order
    ordered_nodes = graph.nodes()
    views = []
    for group in groups:
        members = [node for node in ordered_nodes if node in group]
        views.append(ScopeSubgraphView(graph, members, None))
    return views
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int,
                   function_stream: CodeIOStream, callsite_stream: CodeIOStream):
    """
    Emits the C++ code for a one-dimensional SVE map scope.

    The map is written as a ``while`` loop that runs as long as the predicate
    ``__pg_<param>`` has any active element. Before the scope body is
    dispatched, output registers are generated for tasklet outputs that feed
    other tasklets.

    As a side effect, ``self.current_map`` is set to the map of the scope and
    ``self.counter_type`` to the integer type of the loop counter.

    :param sdfg: SDFG that contains the scope.
    :param scope: Subgraph view of the scope; its first source node is
                  taken as the map entry.
    :param state_id: Index of the state within ``sdfg``.
    :param function_stream: Stream forwarded to the subgraph dispatcher.
    :param callsite_stream: Stream that receives the generated loop code.
    :raise util.NotSupportedError: If the map has more than one parameter.
    """
    map_entry = scope.source_nodes()[0]
    sve_map = map_entry.map
    self.current_map = sve_map

    if len(sve_map.params) > 1:
        raise util.NotSupportedError('SVE map must be one dimensional')

    # The element width of the loop follows the base types of the SDFG's arrays
    base_types = [util.get_base_type(sdfg.arrays[name].dtype) for name in sdfg.arrays]
    loop_types = list(set(base_types))
    if loop_types:
        loop_type = loop_types[0]
    else:
        # Edge case if no arrays are used
        loop_type = dace.int64
    ltype_size = loop_type.bytes

    # Copy before changing the C type so dace.int64 itself stays untouched
    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    counter_by_size = {1: dace.int8, 2: dace.int16, 4: dace.int32, 8: long_type}
    self.counter_type = counter_by_size[ltype_size]

    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(scope)

    # Define all dynamic input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for edge in dace.sdfg.dynamic_map_inputs(state_dfg, map_entry):
        if edge.data.data == edge.dst_conn:
            continue
        definition = self.cpu_codegen.memlet_definition(sdfg, edge.data, False, edge.dst_conn,
                                                        edge.dst.in_connectors[edge.dst_conn])
        callsite_stream.write(definition, sdfg, state_id, map_entry)

    param = sve_map.params[0]
    begin, end, stride = (sym2cpp(r) for r in sve_map.range[0])

    # Loop header: the loop predicate is always called __pg_{param}
    self.dispatcher.defined_vars.add('__pg_' + param, DefinedType.Scalar, 'svbool_t')

    # Declare the counting variable (e.g. i), store the upper bound and
    # precompute the loop predicate for the range
    end_param = f'__{param}_to'
    callsite_stream.write(f'{self.counter_type} {param} = {begin};')
    callsite_stream.write(f'{self.counter_type} {end_param} = {end};')
    callsite_stream.write(f'svbool_t __pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});')

    # Loop while the predicate has any active element
    callsite_stream.write(f'while(svptest_any(svptrue_b{ltype_size * 8}(), __pg_{param})) {{')

    # Allocate scope related memory: empty shared registers for tasklet
    # outputs that are consumed by other tasklets
    for node, _ in scope.all_nodes_recursive():
        if not isinstance(node, nodes.Tasklet):
            continue
        for out_edge in state_dfg.out_edges(node):
            if isinstance(out_edge.dst, dace.nodes.Tasklet):
                self.generate_out_register(sdfg, state_dfg, out_edge, callsite_stream, True)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Loop footer: advance the counter by the number of processed elements
    size_letter = {1: 'b', 2: 'h', 4: 'w', 8: 'd'}[ltype_size]
    callsite_stream.write(f'{param} += svcnt{size_letter}() * {stride};')

    # Then recompute the loop predicate
    callsite_stream.write(f'__pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});')
    callsite_stream.write('}')

    self.dispatcher.defined_vars.exit_scope(scope)
    callsite_stream.write('}')