def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
    first_state: SDFGState = self.first_state
    second_state: SDFGState = self.second_state

    out_edges = graph.out_edges(first_state)
    in_edges = graph.in_edges(first_state)

    # First state must have only one output edge (with dst the second
    # state).
    if len(out_edges) != 1:
        return False
    # If both states have more than one incoming edge, some control flow
    # may become ambiguous
    if len(in_edges) > 1 and graph.in_degree(second_state) > 1:
        return False
    # The interstate edge must not have a condition.
    if not out_edges[0].data.is_unconditional():
        return False
    # The interstate edge may have assignments, as long as there are input
    # edges to the first state that can absorb them.
    if out_edges[0].data.assignments:
        if not in_edges:
            return False
        # Fail if symbol is set before the state to fuse
        new_assignments = set(out_edges[0].data.assignments.keys())
        if any((new_assignments & set(e.data.assignments.keys())) for e in in_edges):
            return False
        # Fail if symbol is used in the dataflow of that state
        if len(new_assignments & first_state.free_symbols) > 0:
            return False
        # Fail if assignments have free symbols that are updated in the
        # first state
        freesyms = out_edges[0].data.free_symbols
        if freesyms and any(n.data in freesyms for n in first_state.nodes()
                            if isinstance(n, nodes.AccessNode) and first_state.in_degree(n) > 0):
            return False
        # Fail if symbols assigned on the first edge are free symbols on the
        # second edge
        symbols_used = set(out_edges[0].data.free_symbols)
        for e in in_edges:
            if e.data.assignments.keys() & symbols_used:
                return False

    # There can be no state that has output edges pointing to both the
    # first and the second state. Such a case will produce a multi-graph.
    for src, _, _ in in_edges:
        for _, dst, _ in graph.out_edges(src):
            if dst == second_state:
                return False

    if not permissive:
        # Strict mode that inhibits state fusion if Python callbacks are involved
        if Config.get_bool('frontend', 'dont_fuse_callbacks'):
            for node in (first_state.data_nodes() + second_state.data_nodes()):
                if node.data == '__pystate':
                    return False

        # NOTE: This is a quick fix for MPI Waitall (probably also needed for
        # Wait), until we have a better SDFG representation of the buffer
        # dependencies.
        try:
            from dace.libraries.mpi import Waitall
            next(node for node in first_state.nodes() if isinstance(node, Waitall) or node.label == '_Waitall_')
            return False
        except StopIteration:
            pass
        try:
            from dace.libraries.mpi import Waitall
            next(node for node in second_state.nodes() if isinstance(node, Waitall) or node.label == '_Waitall_')
            return False
        except StopIteration:
            pass

        # If second state has other input edges, there might be issues.
        # Exceptions are when none of the states contain dataflow, unless
        # the first state is an initial state (in which case the new initial
        # state would be ambiguous).
        first_in_edges = graph.in_edges(first_state)
        second_in_edges = graph.in_edges(second_state)
        if ((not second_state.is_empty() or not first_state.is_empty() or len(first_in_edges) == 0)
                and len(second_in_edges) != 1):
            return False

        # Get connected components.
        first_cc = [cc_nodes for cc_nodes in nx.weakly_connected_components(first_state._nx)]
        second_cc = [cc_nodes for cc_nodes in nx.weakly_connected_components(second_state._nx)]

        # Find source/sink (data) nodes
        first_input = {node for node in sdutil.find_source_nodes(first_state) if isinstance(node, nodes.AccessNode)}
        first_output = {
            node
            for node in first_state.scope_children()[None]
            if isinstance(node, nodes.AccessNode) and node not in first_input
        }
        second_input = {node for node in sdutil.find_source_nodes(second_state) if isinstance(node, nodes.AccessNode)}
        second_output = {
            node
            for node in second_state.scope_children()[None]
            if isinstance(node, nodes.AccessNode) and node not in second_input
        }

        # Find source/sink (data) nodes by connected component
        first_cc_input = [cc.intersection(first_input) for cc in first_cc]
        first_cc_output = [cc.intersection(first_output) for cc in first_cc]
        second_cc_input = [cc.intersection(second_input) for cc in second_cc]
        second_cc_output = [cc.intersection(second_output) for cc in second_cc]

        # Apply transformation in case all paths to the second state's
        # nodes go through the same access node, which implies sequential
        # behavior in SDFG semantics.
        first_output_names = {node.data for node in first_output}
        second_input_names = {node.data for node in second_input}

        # If any second input appears more than once, fail
        if len(second_input) > len(second_input_names):
            return False

        # If any first output that is an input to the second state
        # appears in more than one CC, fail
        matches = first_output_names & second_input_names
        for match in matches:
            cc_appearances = 0
            for cc in first_cc_output:
                if len([n for n in cc if n.data == match]) > 0:
                    cc_appearances += 1
            if cc_appearances > 1:
                return False

        # Recreate fused connected component correspondences, and then
        # check for hazards
        resulting_ccs: List[CCDesc] = StateFusion.find_fused_components(first_cc_input, first_cc_output,
                                                                        second_cc_input, second_cc_output)

        # Check for data races
        for fused_cc in resulting_ccs:
            # Write-Write hazard - data is output of both first and second
            # states, without a read in between
            write_write_candidates = ((fused_cc.first_outputs & fused_cc.second_outputs) - fused_cc.second_inputs)

            # Find the leaf (topological) instances of the matches
            order = [
                x for x in reversed(list(nx.topological_sort(first_state._nx)))
                if isinstance(x, nodes.AccessNode) and x.data in fused_cc.first_outputs
            ]
            # Those nodes will be the connection points upon fusion
            match_nodes = {
                next(n for n in order if n.data == match)
                for match in (fused_cc.first_outputs & fused_cc.second_inputs)
            }

            # If we have potential candidates, check if there is a
            # path from the first write to the second write (in that
            # case, there is no hazard):
            for cand in write_write_candidates:
                nodes_first = [n for n in first_output if n.data == cand]
                nodes_second = [n for n in second_output if n.data == cand]

                # If there is a path for the candidate that goes through
                # the match nodes in both states, there is no conflict
                fail = False
                path_found = False
                for match in match_nodes:
                    for node in nodes_first:
                        path_to = nx.has_path(first_state._nx, node, match)
                        if not path_to:
                            continue
                        path_found = True
                        node2 = next(n for n in second_input if n.data == match.data)
                        if not all(nx.has_path(second_state._nx, node2, n) for n in nodes_second):
                            fail = True
                            break
                    if fail or path_found:
                        break

                # Check for intersection (if None, fusion is ok)
                if fail or not path_found:
                    if StateFusion.memlets_intersect(first_state, nodes_first, False, second_state, nodes_second,
                                                     False):
                        return False
            # End of write-write hazard check

            first_inout = fused_cc.first_inputs | fused_cc.first_outputs
            for other_cc in resulting_ccs:
                # NOTE: Special handling for `other_cc is fused_cc`
                if other_cc is fused_cc:
                    # Checking for potential Read-Write data races
                    for d in first_inout:
                        if d in other_cc.second_outputs:
                            nodes_second = [n for n in second_output if n.data == d]
                            # Read-Write race
                            if d in fused_cc.first_inputs:
                                nodes_first = [n for n in first_input if n.data == d]
                            else:
                                nodes_first = []
                            for n2 in nodes_second:
                                for e in second_state.in_edges(n2):
                                    path = second_state.memlet_path(e)
                                    src = path[0].src
                                    if src in second_input and src.data in fused_cc.first_outputs:
                                        for n1 in fused_cc.first_output_nodes:
                                            if n1.data == src.data:
                                                for n0 in nodes_first:
                                                    if not nx.has_path(first_state._nx, n0, n1):
                                                        return False
                    continue
                # If an input/output of a connected component in the first
                # state is an output of another connected component in the
                # second state, we have a potential data race (Read-Write
                # or Write-Write)
                for d in first_inout:
                    if d in other_cc.second_outputs:
                        # Check for intersection (if None, fusion is ok)
                        nodes_second = [n for n in second_output if n.data == d]
                        # Read-Write race
                        if d in fused_cc.first_inputs:
                            nodes_first = [n for n in first_input if n.data == d]
                            if StateFusion.memlets_intersect(first_state, nodes_first, True, second_state,
                                                             nodes_second, False):
                                return False
                        # Write-Write race
                        if d in fused_cc.first_outputs:
                            nodes_first = [n for n in first_output if n.data == d]
                            if StateFusion.memlets_intersect(first_state, nodes_first, False, second_state,
                                                             nodes_second, False):
                                return False
            # End of data race check

            # Read-after-write dependencies: if there is an output of the
            # second state that is an input of the first, ensure all paths
            # from the input of the first state lead to the output.
            # Otherwise, there may be a RAW due to topological sort or
            # concurrency.
            second_inout = ((fused_cc.first_inputs | fused_cc.first_outputs) & fused_cc.second_outputs)
            for inout in second_inout:
                nodes_first = [n for n in match_nodes if n.data == inout]
                if any(first_state.out_degree(n) > 0 for n in nodes_first):
                    return False

                # If we have potential candidates, check if there is a
                # path from the first read to the second write (in that
                # case, there is no hazard):
                nodes_first = {
                    n
                    for n in fused_cc.first_input_nodes | fused_cc.first_output_nodes if n.data == inout
                }
                nodes_second = {n for n in fused_cc.second_output_nodes if n.data == inout}

                # If there is a path for the candidate that goes through
                # the match nodes in both states, there is no conflict
                fail = False
                path_found = False
                for match in match_nodes:
                    for node in nodes_first:
                        path_to = nx.has_path(first_state._nx, node, match)
                        if not path_to:
                            continue
                        path_found = True
                        node2 = next(n for n in second_input if n.data == match.data)
                        if not all(nx.has_path(second_state._nx, node2, n) for n in nodes_second):
                            fail = True
                            break
                    if fail or path_found:
                        break

                # Check for intersection (if None, fusion is ok)
                if fail or not path_found:
                    if StateFusion.memlets_intersect(first_state, nodes_first, True, second_state, nodes_second,
                                                     False):
                        return False
            # End of read-write hazard check

            # Read-after-write dependencies: if there is more than one first
            # output with the same data, make sure it can be unambiguously
            # connected to the second state
            if (len(fused_cc.first_output_nodes) > len(fused_cc.first_outputs)):
                for inpnode in fused_cc.second_input_nodes:
                    found = None
                    for outnode in fused_cc.first_output_nodes:
                        if outnode.data != inpnode.data:
                            continue
                        if StateFusion.memlets_intersect(first_state, [outnode], False, second_state, [inpnode],
                                                         True):
                            # If found more than once, either there is a
                            # path from one to another or it is ambiguous
                            if found is not None:
                                if nx.has_path(first_state._nx, outnode, found):
                                    # Found is a descendant, continue
                                    continue
                                elif nx.has_path(first_state._nx, found, outnode):
                                    # New node is a descendant, set as found
                                    found = outnode
                                else:
                                    # No path: ambiguous match
                                    return False
                            found = outnode

    return True
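
# Illustrative usage sketch (not part of the original source): the check above
# is typically reached through the public transformation API, as below. The
# two-state program is hypothetical; `simplify=False` prevents automatic fusion
# during parsing (older DaCe releases use `strict=False` instead).
def _example_state_fusion():
    import dace
    from dace.transformation.interstate import StateFusion

    @dace.program
    def two_states(A: dace.float64[20]):
        A += 1  # Produces one state
        A *= 2  # Produces a second state

    sdfg = two_states.to_sdfg(simplify=False)
    # Repeatedly applies StateFusion wherever `can_be_applied` returns True
    applied = sdfg.apply_transformations_repeated(StateFusion)
    print('StateFusion applied %d time(s)' % applied)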
def generate_code(sdfg) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use
        :return: List of code objects that correspond to files to compile.
    """
    # Before compiling, validate SDFG correctness
    sdfg.validate()

    if Config.get_bool('experimental', 'test_serialization'):
        from dace.sdfg import SDFG
        import filecmp
        sdfg.save('test.sdfg')
        sdfg2 = SDFG.from_file('test.sdfg')
        sdfg2.save('test2.sdfg')
        print('Testing SDFG serialization...')
        if not filecmp.cmp('test.sdfg', 'test2.sdfg'):
            raise RuntimeError('SDFG serialization failed - files do not match')
        os.remove('test.sdfg')
        os.remove('test2.sdfg')

        # Run with the deserialized version
        sdfg = sdfg2

    frame = framecode.DaCeCodeGenerator()
    # Instantiate all targets (who register themselves with framecodegen)
    targets = {name: STRING_TO_TARGET[name](frame, sdfg) for name in _TARGET_REGISTER_ORDER}

    # Instantiate all instrumentation providers in SDFG
    frame._dispatcher.instrumentation[dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                INSTRUMENTATION_PROVIDERS[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                INSTRUMENTATION_PROVIDERS[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                INSTRUMENTATION_PROVIDERS[node.map.instrument]
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    global_code, frame_code, used_targets = frame.generate_code(sdfg, None)
    target_objects = [CodeObject(sdfg.name, global_code + frame_code, 'cpp', cpu.CPUCodeGen, 'Frame')]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    return target_objects
def timethis(sdfg, title, flop_count, f, *args, **kwargs):
    """ Runs a function multiple (`DACE_treps`) times, logs the running times
        to a file, and prints the median time (with FLOPs if given).

        :param sdfg: The SDFG belonging to the measurement.
        :param title: A title of the measurement.
        :param flop_count: Number of floating point operations in `f`.
                           If greater than zero, produces a median FLOPS report.
        :param f: The function to measure.
        :param args: Arguments to invoke the function with.
        :param kwargs: Keyword arguments to invoke the function with.
        :return: Latest return value of the function.
    """
    start = timer()
    REPS = int(Config.get('treps'))

    times = [start] * (REPS + 1)
    ret = None
    print('\nProfiling...')
    iterator = range(REPS)
    if Config.get_bool('profiling_status'):
        try:
            from tqdm import tqdm
            iterator = tqdm(iterator, desc="Profiling", file=sys.stdout)
        except ImportError:
            print('WARNING: Cannot show profiling progress, missing optional '
                  'dependency tqdm...\n\tTo see a live progress bar please install '
                  'tqdm (`pip install tqdm`)\n\tTo disable this feature (and '
                  'this warning) set `profiling_status` to false in the dace '
                  'config (~/.dace.conf).')
    for i in iterator:
        # Call function
        ret = f(*args, **kwargs)
        times[i + 1] = timer()

    diffs = np.array([(times[i] - times[i - 1]) for i in range(1, REPS + 1)])
    problem_size = sys.argv[1] if len(sys.argv) >= 2 else 0

    profiling_dir = os.path.join(sdfg.build_folder, 'profiling')
    os.makedirs(profiling_dir, exist_ok=True)
    timestamp_string = str(int(time.time() * 1000))
    outfile_path = os.path.join(profiling_dir, 'results-' + timestamp_string + '.csv')

    # Note: use a distinct name for the file handle so it does not shadow the
    # measured function parameter `f`
    with open(outfile_path, 'w') as fp:
        fp.write('Program,Optimization,Problem_Size,Runtime_sec\n')
        for d in diffs:
            fp.write('%s,%s,%s,%.8f\n' % (sdfg.name, title, problem_size, d))

    if flop_count > 0:
        gflops_arr = (flop_count / diffs) * 1e-9
        time_secs = np.median(diffs)
        GFLOPs = (flop_count / time_secs) * 1e-9
        print(title, GFLOPs, 'GFLOP/s (', time_secs * 1000, 'ms)')
    else:
        time_secs = np.median(diffs)
        print(title, time_secs * 1000, 'ms')

    return ret
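
# Illustrative usage sketch (not part of the original source): measuring a
# compiled DaCe program with `timethis`. The program and FLOP count below are
# hypothetical; repetitions come from the `treps` configuration entry.
def _example_timethis():
    import dace
    import numpy as np

    @dace.program
    def saxpy(a: dace.float64, x: dace.float64[1000], y: dace.float64[1000]):
        y[:] = a * x + y

    sdfg = saxpy.to_sdfg()
    csdfg = sdfg.compile()
    x = np.random.rand(1000)
    y = np.random.rand(1000)
    # 2 FLOPs per element (multiply + add)
    timethis(sdfg, 'saxpy', 2 * 1000, csdfg, a=2.0, x=x, y=y)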
def validate_state(state: 'dace.sdfg.SDFGState',
                   state_id: int = None,
                   sdfg: 'dace.sdfg.SDFG' = None,
                   symbols: Dict[str, dtypes.typeclass] = None):
    """ Verifies the correctness of an SDFG state by applying multiple
        tests. Raises an InvalidSDFGError with the erroneous node on
        failure.
    """
    # Avoid import loops
    from dace.sdfg import SDFG
    from dace.config import Config
    from dace.sdfg import nodes as nd
    from dace.sdfg.scope import scope_contains_scope
    from dace import data as dt
    from dace import subsets as sbs

    sdfg = sdfg or state.parent
    state_id = state_id or sdfg.node_id(state)
    symbols = symbols or {}

    if not dtypes.validate_name(state._label):
        raise InvalidSDFGError("Invalid state name", sdfg, state_id)

    if state._parent != sdfg:
        raise InvalidSDFGError("State does not point to the correct "
                               "parent", sdfg, state_id)

    # Unreachable
    ########################################
    if (sdfg.number_of_nodes() > 1 and sdfg.in_degree(state) == 0 and sdfg.out_degree(state) == 0):
        raise InvalidSDFGError("Unreachable state", sdfg, state_id)

    for nid, node in enumerate(state.nodes()):
        # Node validation
        try:
            node.validate(sdfg, state)
        except InvalidSDFGError:
            raise
        except Exception as ex:
            raise InvalidSDFGNodeError("Node validation failed: " + str(ex), sdfg, state_id, nid) from ex

        # Isolated nodes
        ########################################
        if state.in_degree(node) + state.out_degree(node) == 0:
            # One corner case: OK if this is a code node
            if isinstance(node, nd.CodeNode):
                pass
            else:
                raise InvalidSDFGNodeError("Isolated node", sdfg, state_id, nid)

        # Scope tests
        ########################################
        if isinstance(node, nd.EntryNode):
            try:
                state.exit_node(node)
            except StopIteration:
                raise InvalidSDFGNodeError(
                    "Entry node does not have matching "
                    "exit node",
                    sdfg,
                    state_id,
                    nid,
                )

        if isinstance(node, (nd.EntryNode, nd.ExitNode)):
            for iconn in node.in_connectors:
                if (iconn is not None and iconn.startswith("IN_")
                        and ("OUT_" + iconn[3:]) not in node.out_connectors):
                    raise InvalidSDFGNodeError(
                        "No match for input connector %s in output "
                        "connectors" % iconn,
                        sdfg,
                        state_id,
                        nid,
                    )
            for oconn in node.out_connectors:
                if (oconn is not None and oconn.startswith("OUT_")
                        and ("IN_" + oconn[4:]) not in node.in_connectors):
                    raise InvalidSDFGNodeError(
                        "No match for output connector %s in input "
                        "connectors" % oconn,
                        sdfg,
                        state_id,
                        nid,
                    )

        # Node-specific tests
        ########################################
        if isinstance(node, nd.AccessNode):
            if node.data not in sdfg.arrays:
                raise InvalidSDFGNodeError(
                    "Access node must point to a valid array name in the SDFG",
                    sdfg,
                    state_id,
                    nid,
                )
            arr = sdfg.arrays[node.data]

            # Verify View references
            if isinstance(arr, dt.View):
                from dace.sdfg import utils as sdutil  # Avoid import loops
                if sdutil.get_view_edge(state, node) is None:
                    raise InvalidSDFGNodeError("Ambiguous or invalid edge to/from a View access node", sdfg,
                                               state_id, nid)

            # Find uninitialized transients
            if (arr.transient and state.in_degree(node) == 0 and state.out_degree(node) > 0
                    # Streams do not need to be initialized
                    and not isinstance(arr, dt.Stream)):
                # Find other instances of node in predecessor states
                states = sdfg.predecessor_states(state)
                input_found = False
                for s in states:
                    for onode in s.nodes():
                        if (isinstance(onode, nd.AccessNode) and onode.data == node.data):
                            if s.in_degree(onode) > 0:
                                input_found = True
                                break
                    if input_found:
                        break
                if not input_found and node.setzero == False:
                    warnings.warn('WARNING: Use of uninitialized transient "%s" in state %s' %
                                  (node.data, state.label))

            # Find writes to input-only arrays
            only_empty_inputs = all(e.data.is_empty() for e in state.in_edges(node))
            if (not arr.transient) and (not only_empty_inputs):
                nsdfg_node = sdfg.parent_nsdfg_node
                if nsdfg_node is not None:
                    if node.data not in nsdfg_node.out_connectors:
                        raise InvalidSDFGNodeError(
                            'Data descriptor %s is '
                            'written to, but only given to nested SDFG as an '
                            'input connector' % node.data, sdfg, state_id, nid)

        if (isinstance(node, nd.ConsumeEntry) and "IN_stream" not in node.in_connectors):
            raise InvalidSDFGNodeError("Consume entry node must have an input stream", sdfg, state_id, nid)
        if (isinstance(node, nd.ConsumeEntry) and "OUT_stream" not in node.out_connectors):
            raise InvalidSDFGNodeError(
                "Consume entry node must have an internal stream",
                sdfg,
                state_id,
                nid,
            )

        # Connector tests
        ########################################
        # Check for duplicate connector names (unless it's a nested SDFG)
        if (len(node.in_connectors.keys() & node.out_connectors.keys()) > 0
                and not isinstance(node, (nd.NestedSDFG, nd.LibraryNode))):
            dups = node.in_connectors.keys() & node.out_connectors.keys()
            raise InvalidSDFGNodeError("Duplicate connectors: " + str(dups), sdfg, state_id, nid)

        # Check for connectors that are also array/symbol names
        if isinstance(node, nd.Tasklet):
            for conn in node.in_connectors.keys():
                if conn in sdfg.arrays or conn in symbols:
                    raise InvalidSDFGNodeError(f"Input connector {conn} already "
                                               "defined as array or symbol", sdfg, state_id, nid)
            for conn in node.out_connectors.keys():
                if conn in sdfg.arrays or conn in symbols:
                    raise InvalidSDFGNodeError(f"Output connector {conn} already "
                                               "defined as array or symbol", sdfg, state_id, nid)

        # Check for dangling connectors (incoming)
        for conn in node.in_connectors:
            incoming_edges = 0
            for e in state.in_edges(node):
                # Connector found
                if e.dst_conn == conn:
                    incoming_edges += 1

            if incoming_edges == 0:
                raise InvalidSDFGNodeError("Dangling in-connector %s" % conn, sdfg, state_id, nid)

            # Connectors may have only one incoming edge.
            # Due to input connectors of scope exit, this is only correct
            # in some cases:
            if incoming_edges > 1 and not isinstance(node, nd.ExitNode):
                raise InvalidSDFGNodeError(
                    "Connector '%s' cannot have more "
                    "than one incoming edge, found %d" % (conn, incoming_edges),
                    sdfg,
                    state_id,
                    nid,
                )

        # Check for dangling connectors (outgoing)
        for conn in node.out_connectors:
            outgoing_edges = 0
            for e in state.out_edges(node):
                # Connector found
                if e.src_conn == conn:
                    outgoing_edges += 1

            if outgoing_edges == 0:
                raise InvalidSDFGNodeError("Dangling out-connector %s" % conn, sdfg, state_id, nid)

            # In case of scope exit or code node, only one outgoing edge per
            # connector is allowed.
            if outgoing_edges > 1 and isinstance(node, (nd.ExitNode, nd.CodeNode)):
                raise InvalidSDFGNodeError(
                    "Connector '%s' cannot have more "
                    "than one outgoing edge, found %d" % (conn, outgoing_edges),
                    sdfg,
                    state_id,
                    nid,
                )

        # Check for edges to nonexistent connectors
        for e in state.in_edges(node):
            if e.dst_conn is not None and e.dst_conn not in node.in_connectors:
                raise InvalidSDFGNodeError(
                    ("Memlet %s leading to "
                     "nonexistent connector %s") % (str(e.data), e.dst_conn),
                    sdfg,
                    state_id,
                    nid,
                )
        for e in state.out_edges(node):
            if e.src_conn is not None and e.src_conn not in node.out_connectors:
                raise InvalidSDFGNodeError(
                    ("Memlet %s coming from "
                     "nonexistent connector %s") % (str(e.data), e.src_conn),
                    sdfg,
                    state_id,
                    nid,
                )
        ########################################

    # Memlet checks
    scope = state.scope_dict()
    for eid, e in enumerate(state.edges()):
        # Edge validation
        try:
            e.data.validate(sdfg, state)
        except InvalidSDFGError:
            raise
        except Exception as ex:
            raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid)

        # For every memlet, obtain its full path in the DFG
        path = state.memlet_path(e)
        src_node = path[0].src
        dst_node = path[-1].dst

        # Check if memlet data matches src or dst nodes
        if (e.data.data is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode))
                and (not isinstance(src_node, nd.AccessNode) or e.data.data != src_node.data)
                and (not isinstance(dst_node, nd.AccessNode) or e.data.data != dst_node.data)):
            raise InvalidSDFGEdgeError(
                "Memlet data does not match source or destination "
                "data nodes",
                sdfg,
                state_id,
                eid,
            )

        # Check memlet subset validity with respect to source/destination nodes
        if e.data.data is not None and e.data.allow_oob == False:
            subset_node = (dst_node
                           if isinstance(dst_node, nd.AccessNode) and e.data.data == dst_node.data else src_node)
            other_subset_node = (dst_node
                                 if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data else src_node)

            if isinstance(subset_node, nd.AccessNode):
                arr = sdfg.arrays[subset_node.data]
                # Dimensionality
                if e.data.subset.dims() != len(arr.shape):
                    raise InvalidSDFGEdgeError(
                        "Memlet subset does not match node dimension "
                        "(expected %d, got %d)" % (len(arr.shape), e.data.subset.dims()),
                        sdfg,
                        state_id,
                        eid,
                    )

                # Bounds
                if any(((minel + off) < 0) == True for minel, off in zip(e.data.subset.min_element(), arr.offset)):
                    raise InvalidSDFGEdgeError("Memlet subset negative out-of-bounds", sdfg, state_id, eid)
                if any(((maxel + off) >= s) == True
                       for maxel, s, off in zip(e.data.subset.max_element(), arr.shape, arr.offset)):
                    raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid)

            # Test other_subset as well
            if e.data.other_subset is not None and isinstance(other_subset_node, nd.AccessNode):
                arr = sdfg.arrays[other_subset_node.data]
                # Dimensionality
                if e.data.other_subset.dims() != len(arr.shape):
                    raise InvalidSDFGEdgeError(
                        "Memlet other_subset does not match node dimension "
                        "(expected %d, got %d)" % (len(arr.shape), e.data.other_subset.dims()),
                        sdfg,
                        state_id,
                        eid,
                    )

                # Bounds
                if any(((minel + off) < 0) == True
                       for minel, off in zip(e.data.other_subset.min_element(), arr.offset)):
                    raise InvalidSDFGEdgeError(
                        "Memlet other_subset negative out-of-bounds",
                        sdfg,
                        state_id,
                        eid,
                    )
                if any(((maxel + off) >= s) == True
                       for maxel, s, off in zip(e.data.other_subset.max_element(), arr.shape, arr.offset)):
                    raise InvalidSDFGEdgeError("Memlet other_subset out-of-bounds", sdfg, state_id, eid)

            # Test subset and other_subset for undefined symbols
            if Config.get_bool('experimental', 'validate_undefs'):
                # TODO: Traverse by scopes and accumulate data
                defined_symbols = state.symbols_defined_at(e.dst)
                undefs = (e.data.subset.free_symbols - set(defined_symbols.keys()))
                if len(undefs) > 0:
                    raise InvalidSDFGEdgeError('Undefined symbols %s found in memlet subset' % undefs, sdfg,
                                               state_id, eid)
                if e.data.other_subset is not None:
                    undefs = (e.data.other_subset.free_symbols - set(defined_symbols.keys()))
                    if len(undefs) > 0:
                        raise InvalidSDFGEdgeError(
                            'Undefined symbols %s found in memlet '
                            'other_subset' % undefs, sdfg, state_id, eid)
        #######################################

        # Memlet path scope lifetime checks
        # If scope(src) == scope(dst): OK
        if scope[src_node] == scope[dst_node] or src_node == scope[dst_node]:
            pass
        # If scope(src) contains scope(dst), then src must be a data node,
        # unless the memlet is empty in order to connect to a scope
        elif scope_contains_scope(scope, src_node, dst_node):
            pass
        # If scope(dst) contains scope(src), then dst must be a data node,
        # unless the memlet is empty in order to connect to a scope
        elif scope_contains_scope(scope, dst_node, src_node):
            if not isinstance(dst_node, nd.AccessNode):
                if e.data.is_empty() and isinstance(dst_node, nd.ExitNode):
                    pass
                else:
                    raise InvalidSDFGEdgeError(
                        f"Memlet creates an invalid path (sink node {dst_node}"
                        " should be a data node)", sdfg, state_id, eid)
        # If scope(dst) is disjoint from scope(src), it's an illegal memlet
        else:
            raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid)

        # Check dimensionality of memory access
        if isinstance(e.data.subset, (sbs.Range, sbs.Indices)):
            if e.data.subset.dims() != len(sdfg.arrays[e.data.data].shape):
                raise InvalidSDFGEdgeError(
                    "Memlet subset uses the wrong dimensions"
                    " (%dD for a %dD data node)" % (e.data.subset.dims(), len(sdfg.arrays[e.data.data].shape)),
                    sdfg,
                    state_id,
                    eid,
                )

        # Verify that source and destination subsets contain the same
        # number of elements
        if not e.data.allow_oob and e.data.other_subset is not None and not (
            (isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Stream)) or
            (isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Stream))):
            if (e.data.src_subset.num_elements() * sdfg.arrays[src_node.data].veclen !=
                    e.data.dst_subset.num_elements() * sdfg.arrays[dst_node.data].veclen):
                raise InvalidSDFGEdgeError('Dimensionality mismatch between src/dst subsets', sdfg, state_id, eid)
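
# Illustrative usage sketch (not part of the original source): validation is
# usually invoked through `SDFG.validate()`, which calls `validate_state` for
# every state and surfaces the errors raised above. The SDFG below is a
# hypothetical example that fails one of the node checks (the access node is
# isolated and refers to an undeclared array).
def _example_validate():
    import dace
    from dace.sdfg.validation import InvalidSDFGError

    sdfg = dace.SDFG('example')
    state = sdfg.add_state('main')
    state.add_access('undeclared')
    try:
        sdfg.validate()
    except InvalidSDFGError as ex:
        print('Validation failed as expected:', ex)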
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    if self.expr_index == 0:
        cnode = graph.nodes()[self.subgraph[GPUTransformLocalStorage._map_entry]]
        node_schedprop = cnode.map
        exit_node = graph.exit_node(cnode)
    else:
        cnode = graph.nodes()[self.subgraph[GPUTransformLocalStorage._reduce]]
        node_schedprop = cnode
        exit_node = cnode

    # Change schedule
    node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._maps_transformed += 1
    # If nested graph is designated as sequential, transform schedules and
    # storage from Default to Sequential/Register
    if self.nested_seq and self.expr_index == 0:
        for node in graph.scope_subgraph(cnode).nodes():
            if isinstance(node, nodes.AccessNode):
                arr = node.desc(sdfg)
                if arr.storage == dtypes.StorageType.Default:
                    arr.storage = dtypes.StorageType.Register
            elif isinstance(node, nodes.MapEntry):
                if node.map.schedule == dtypes.ScheduleType.Default:
                    node.map.schedule = dtypes.ScheduleType.Sequential

    gpu_storage_types = [
        dtypes.StorageType.GPU_Global,
        dtypes.StorageType.GPU_Shared,
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    all_out_edges.extend(list(graph.out_edges(exit_node)))
    in_arrays_to_clone = set()
    out_arrays_to_clone = set()
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            in_arrays_to_clone.add((data_node, e.data))
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            out_arrays_to_clone.add((data_node, e.data))

    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._arrays_removed += len(in_arrays_to_clone) + len(out_arrays_to_clone)

    # Second, create a GPU clone of each array
    # TODO: Overapproximate union of memlets
    cloned_arrays = {}
    in_cloned_arraynodes = {}
    out_cloned_arraynodes = {}
    for array_node, memlet in in_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            except:
                continue
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                except:
                    full_shape.append(size)
            actual_dims = [idx for idx, r in enumerate(full_shape) if not (isinstance(r, int) and r == 1)]
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(name=cloned_name,
                                dtype=array.dtype,
                                shape=[full_shape[d] for d in actual_dims],
                                veclen=array.veclen,
                                buffer_size=array.buffer_size,
                                storage=dtypes.StorageType.GPU_Global,
                                transient=True,
                                offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        cloned_node = type(array_node)(cloned_name)

        in_cloned_arraynodes[array_node.data] = cloned_node
    for array_node, memlet in out_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            except:
                continue
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                except:
                    full_shape.append(size)
            actual_dims = [idx for idx, r in enumerate(full_shape) if not (isinstance(r, int) and r == 1)]
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(name=cloned_name,
                                dtype=array.dtype,
                                shape=[full_shape[d] for d in actual_dims],
                                veclen=array.veclen,
                                buffer_size=array.buffer_size,
                                storage=dtypes.StorageType.GPU_Global,
                                transient=True,
                                offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        cloned_node = type(array_node)(cloned_name)
        cloned_node.setzero = True

        out_cloned_arraynodes[array_node.data] = cloned_node

    # Third, connect the cloned arrays to the originals
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in graph.in_edges(cnode):
            if edge.data.data == array_name:
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            newsubset[ind] -= r[0]
                    if len(lost_dims) == len(edge.data.subset):
                        lost_dims.pop()
                        newmemlet.subset = type(edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)([r for r in newsubset if r is not None])

                graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet)

                for e in graph.bfs_edges(edge.dst, reverse=False):
                    parent, _, _child, _, memlet = e
                    if parent != edge.dst and not in_scope(graph, parent, edge.dst):
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[-1].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.ExitNode, forward=True):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)([r for r in newsubset if r is not None])
                    memlet.data = node.data

                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(edge.src, edge.src_conn, node, None, edge.data)
                graph.remove_edge(edge)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in all_out_edges:
            if edge.data.data == array_name:
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            newsubset[ind] -= r[0]
                    if len(lost_dims) == len(edge.data.subset):
                        lost_dims.pop()
                        newmemlet.subset = type(edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)([r for r in newsubset if r is not None])

                graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet)

                end_node = graph.scope_dict()[edge.src]
                for e in graph.bfs_edges(edge.src, reverse=True):
                    parent, _, _child, _, memlet = e
                    if parent == end_node:
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[0].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.EntryNode, forward=False):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)([r for r in newsubset if r is not None])
                    memlet.data = node.data

                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data)
                graph.remove_edge(edge)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if edge.data.data is not None and edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]
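
# Illustrative usage sketch (not part of the original source; the import path
# and availability of this transformation depend on the DaCe version):
# offloading a map to the GPU, which changes its schedule to GPU_Device and
# inserts gpu_* array clones with copy edges, as implemented in `apply` above.
def _example_gpu_transform():
    import dace
    from dace.transformation.dataflow import GPUTransformLocalStorage

    @dace.program
    def doubler(A: dace.float64[100]):
        for i in dace.map[0:100]:
            A[i] = A[i] * 2

    sdfg = doubler.to_sdfg()
    applied = sdfg.apply_transformations(GPUTransformLocalStorage)
    print('Applied %d time(s)' % applied)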
def compileProgram(request, language, perfopts=None):
    if not request.json or (('code' not in request.json) and ('sdfg' not in request.json)):
        print("[Error] No input code provided, cannot continue")
        abort(400)

    errors = []
    try:
        optpath = request.json['optpath']
    except:
        optpath = None

    try:
        sdfg_props = request.json['sdfg_props']
    except:
        sdfg_props = None

    if perfopts is None:
        try:
            perf_mode = request.json['perf_mode']
        except:
            perf_mode = None
    else:
        #print("Perfopts: " + str(perfopts))
        perf_mode = perfopts

    client_id = request.json['client_id']

    sdfg_dict = {}
    sdfg_eval_order = []

    with config_lock:
        # Lock the config - the config may be modified while holding this lock,
        # but the config MUST be restored.
        from dace.config import Config
        config_path = "./client_configs/" + client_id + ".conf"
        if os.path.isfile(config_path):
            Config.load(config_path)
        else:
            Config.load()

        dace_state = None
        in_sdfg = None
        if "sdfg" in request.json:
            in_sdfg = request.json['sdfg']
            if isinstance(in_sdfg, list):
                if len(in_sdfg) > 1:
                    # TODO: Allow multiple sdfg inputs
                    raise NotImplementedError("More than 1 SDFG provided")

                in_sdfg = in_sdfg[0]

            if isinstance(in_sdfg, str):
                in_sdfg = json.loads(in_sdfg)

            if isinstance(in_sdfg, dict):
                # Generate callbacks (needed for elements referencing others)
                def loader_callback(name: str):
                    # Check if already available and if yes, return it
                    if name in sdfg_dict:
                        return sdfg_dict[name]

                    # Else: This function has to recreate the given sdfg
                    sdfg_dict[name] = dace.SDFG.from_json(in_sdfg[name], {'sdfg': None, 'callback': loader_callback})
                    sdfg_eval_order.append(name)
                    return sdfg_dict[name]

                for k, v in in_sdfg.items():
                    # Leave it be if the sdfg was already created
                    # (this might happen with SDFG references)
                    if k in sdfg_dict:
                        continue
                    if isinstance(v, str):
                        v = json.loads(v)
                    sdfg_dict[k] = dace.SDFG.from_json(v, {'sdfg': None, 'callback': loader_callback})
                    sdfg_eval_order.append(k)
            else:
                in_sdfg = dace.SDFG.from_json(in_sdfg)
                sdfg_dict[in_sdfg.name] = in_sdfg
        else:
            print("Using code to compile")
            code = request.json['code']
            if (isinstance(code, list)):
                if len(code) > 1:
                    print("More than 1 code file provided!")
                    abort(400)
                code = code[0]
            if language == "octave":
                statements = octave_frontend.parse(code, debug=False)
                statements.provide_parents()
                statements.specialize()
                sdfg = statements.generate_code()
                sdfg.set_sourcecode(code, "matlab")
            elif language == "dace":
                dace_state = create_DaceState(code, sdfg_dict, errors)

        # The DaceState uses the variable names in the dace code. This is not
        # useful enough for us, so we translate
        copied_dict = {}
        for k, v in sdfg_dict.items():
            copied_dict[v.name] = v
        sdfg_dict = copied_dict

        if len(errors) == 0:
            if optpath is not None:
                for sdfg_name, op in optpath.items():
                    try:
                        sp = sdfg_props[sdfg_name]
                    except:
                        # In any error case, just ignore the properties
                        sp = None
                    print("Applying opts for " + sdfg_name)
                    print("Dict: " + str(sdfg_dict.keys()))
                    sdfg_dict[sdfg_name] = applyOptPath(sdfg_dict[sdfg_name], op, sdfg_props=sp)

        code_tuple_dict = {}
        # Deep-copy the SDFG (codegen may change the SDFG it operates on)
        codegen_sdfgs = copy.deepcopy(sdfg_dict)
        codegen_sdfgs_dace_state = copy.deepcopy(sdfg_dict)
        if len(errors) == 0:
            if sdfg_eval_order:
                sdfg_eval = [(n, codegen_sdfgs[n]) for n in reversed(sdfg_eval_order)]
            else:
                sdfg_eval = codegen_sdfgs.items()

            for n, s in sdfg_eval:
                try:
                    if Config.get_bool('diode', 'general', 'library_autoexpand'):
                        s.expand_library_nodes()

                    code_tuple_dict[n] = codegen.generate_code(s)
                except dace.sdfg.NodeNotExpandedError as ex:
                    code_tuple_dict[n] = [str(ex)]
                except Exception:
                    # Forward exception to output code
                    code_tuple_dict[n] = ['Code generation failed:\n' + traceback.format_exc()]

        if dace_state is None:
            if "code" in request.json:
                in_code = request.json['code']
            else:
                in_code = ""
            dace_state = DaceState(in_code, "tmp.py", remote=remote_execution)
            dace_state.set_sdfg(list(codegen_sdfgs_dace_state.values())[0],
                                list(codegen_sdfgs_dace_state.keys())[0])
            if len(dace_state.errors) > 0:
                print("ERRORS: " + str(dace_state.errors))
                errors.extend(dace_state.errors)

        # The config won't save back on its own, and we don't want it to -
        # these changes are transient

    if len(errors) > 0:
        return errors

    # Only return top-level SDFG
    return ({k: v for k, v in sdfg_dict.items() if v.parent is None}, code_tuple_dict, dace_state)
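
# Illustrative request payload (an assumption, not part of the original
# source): the shape of the JSON body this endpoint expects from the DIODE
# client. Key names mirror the lookups above; the values here are hypothetical.
_example_request_json = {
    'client_id': 'some-client-uuid',
    # Either 'code' (list with a single source string) or 'sdfg' must be given
    'code': ['@dace.program\ndef prog(A: dace.float64[10]):\n    A += 1'],
    'optpath': None,     # Optional: transformations to apply per SDFG
    'sdfg_props': None,  # Optional: properties for the applied transformations
    'perf_mode': None,   # Optional: performance instrumentation settings
}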
def preprocess_dace_program(f: Callable[..., Any],
                            argtypes: Dict[str, data.Data],
                            global_vars: Dict[str, Any],
                            modules: Dict[str, Any],
                            resolve_functions: bool = False,
                            parent_closure: Optional[SDFGClosure] = None) -> Tuple[PreprocessedAST, SDFGClosure]:
    """
    Preprocesses a ``@dace.program`` and all its nested functions, returning
    a preprocessed AST object and the closure of the resulting SDFG.

    :param f: A Python function to parse.
    :param argtypes: A dictionary of (name, type) for the given function's
                     arguments, which may pertain to data nodes or symbols
                     (scalars).
    :param global_vars: A dictionary of global variables in the closure of `f`.
    :param modules: A dictionary from an imported module name to the module
                    itself.
    :param resolve_functions: If True, treats all global functions defined
                              outside of the program as returning constant
                              values.
    :param parent_closure: If not None, represents the closure of the parent
                           of the currently processed function.
    :return: A 2-tuple of the AST and its reduced (used) closure.
    """
    src_ast, src_file, src_line, src = astutils.function_to_ast(f)

    # Resolve data structures
    src_ast = StructTransformer(global_vars).visit(src_ast)

    src_ast = ModuleResolver(modules).visit(src_ast)
    # Convert modules after resolution
    for mod, modval in modules.items():
        if mod == 'builtins':
            continue
        newmod = global_vars[mod]
        #del global_vars[mod]
        global_vars[modval] = newmod

    # Resolve constants to their values (if they are not already defined in
    # this scope) and symbols to their names
    resolved = {k: v for k, v in global_vars.items() if k not in argtypes and k != '_'}
    closure_resolver = GlobalResolver(resolved, resolve_functions)

    # Append element to call stack and handle max recursion depth
    if parent_closure is not None:
        fid = id(f)
        if fid in parent_closure.callstack:
            raise DaceRecursionError(fid)
        if len(parent_closure.callstack) > Config.get('frontend', 'implicit_recursion_depth'):
            raise TypeError('Implicit (automatically parsed) recursion depth '
                            'exceeded. Functions below this call will not be '
                            'parsed. To change this setting, modify the value '
                            '`frontend.implicit_recursion_depth` in .dace.conf')

        closure_resolver.closure.callstack = parent_closure.callstack + [fid]

    passes = int(Config.get('frontend', 'preprocessing_passes'))
    if passes >= 0:
        gen = range(passes)
    else:  # Run until the code stops changing

        def check_code(src_ast):
            old_src = ast.dump(src_ast)
            i = 0
            while True:
                yield i
                new_src = ast.dump(src_ast)
                if new_src == old_src:
                    return
                old_src = new_src
                i += 1

        gen = check_code(src_ast)

    for pass_num in gen:
        try:
            src_ast = closure_resolver.visit(src_ast)
            src_ast = LoopUnroller(resolved, src_file).visit(src_ast)
            src_ast = ConditionalCodeResolver(resolved).visit(src_ast)
            src_ast = DeadCodeEliminator().visit(src_ast)
        except Exception:
            if Config.get_bool('frontend', 'verbose_errors'):
                print(f'VERBOSE: Failed to preprocess (pass #{pass_num}) the following program:')
                print(astutils.unparse(src_ast))
            raise

    try:
        ctr = CallTreeResolver(closure_resolver.closure, resolved)
        ctr.visit(src_ast)
    except DaceRecursionError as ex:
        if id(f) == ex.fid:
            raise TypeError('Parsing failed due to recursion in a data-centric '
                            'context called from this function')
        else:
            raise ex
    used_arrays = ArrayClosureResolver(closure_resolver.closure)
    used_arrays.visit(src_ast)

    # Filter out arrays that are not used after dead code elimination
    closure_resolver.closure.closure_arrays = {
        k: v
        for k, v in closure_resolver.closure.closure_arrays.items() if k in used_arrays.arrays
    }

    # Filter out callbacks that were removed after dead code elimination
    closure_resolver.closure.callbacks = {
        k: v
        for k, v in closure_resolver.closure.callbacks.items() if k in ctr.seen_calls
    }

    # Filter remaining global variables according to type and scoping rules
    program_globals = {k: v for k, v in global_vars.items() if k not in argtypes}

    # Fill in data descriptors from closure arrays
    argtypes.update({arrname: v[1] for arrname, v in closure_resolver.closure.closure_arrays.items()})

    # Combine nested closures with the current one
    closure_resolver.closure.combine_nested_closures()

    past = PreprocessedAST(src_file, src_line, src, src_ast, program_globals)

    return past, closure_resolver.closure
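
# Illustrative usage sketch (not part of the original source):
# `preprocess_dace_program` is an internal step of parsing; end users trigger
# it indirectly, e.g. by converting a `@dace.program` to an SDFG as below.
def _example_preprocess():
    import dace

    @dace.program
    def plus_one(A: dace.float64[10]):
        A += 1

    # Parsing to an SDFG runs preprocessing first: closure resolution, loop
    # unrolling, conditional code resolution, and dead code elimination
    sdfg = plus_one.to_sdfg()
    print(sdfg.name)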
def configure_and_compile(program_folder, program_name=None, output_stream=None):
    """ Configures and compiles a DaCe program in the specified folder into a
        shared library file.

        :param program_folder: Folder containing all files necessary to build,
                               equivalent to what was passed to
                               `generate_program_folder`.
        :param output_stream: Additional output stream to write to (used for
                              DIODE client).
        :return: Path to the compiled shared library file.
    """
    if program_name is None:
        program_name = os.path.basename(program_folder)
    program_folder = os.path.abspath(program_folder)
    src_folder = os.path.join(program_folder, "src")

    # Prepare build folder
    build_folder = os.path.join(program_folder, "build")
    os.makedirs(build_folder, exist_ok=True)

    # Prepare performance report folder
    os.makedirs(os.path.join(program_folder, "perf"), exist_ok=True)

    # Read list of DaCe files to compile.
    # We do this instead of iterating over source files in the directory to
    # avoid globbing files from previous compilations, such that we don't need
    # to wipe the directory for every compilation.
    file_list = [line.strip().split(",") for line in open(os.path.join(program_folder, "dace_files.csv"), "r")]

    # Get absolute paths and targets for all source files
    files = []
    targets = {}  # {target name: target class}
    for target_name, target_type, file_name in file_list:
        if target_type:
            path = os.path.join(target_name, target_type, file_name)
        else:
            path = os.path.join(target_name, file_name)
        files.append(path)
        targets[target_name] = next(k for k, v in TargetCodeGenerator.extensions().items()
                                    if v['name'] == target_name)

    # Windows-only workaround: Override Visual C++'s linker to use
    # Multi-Threaded (MT) mode. This fixes linkage in CUDA applications where
    # CMake fails to do so.
    if os.name == 'nt':
        if '_CL_' not in os.environ:
            os.environ['_CL_'] = '/MT'
        elif '/MT' not in os.environ['_CL_']:
            os.environ['_CL_'] = os.environ['_CL_'] + ' /MT'

    # Start forming CMake command
    dace_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cmake_command = [
        "cmake",
        "-A x64" if os.name == 'nt' else "",  # Windows-specific flag
        '"' + os.path.join(dace_path, "codegen") + '"',
        "-DDACE_SRC_DIR=\"{}\"".format(src_folder),
        "-DDACE_FILES=\"{}\"".format(";".join(files)),
        "-DDACE_PROGRAM_NAME={}".format(program_name),
    ]

    # Get required environments and retrieve their CMake information
    environments = set(l.strip() for l in open(os.path.join(program_folder, "dace_environments.csv"), "r"))
    cmake_minimum_version = [0]
    cmake_variables = dict()
    cmake_packages = set()
    cmake_includes = set()
    cmake_libraries = set()
    cmake_compile_flags = set()
    cmake_link_flags = set()
    cmake_files = set()
    cmake_module_paths = set()
    for env_name in environments:
        env = dace.library.get_environment(env_name)
        if (env.cmake_minimum_version is not None and len(env.cmake_minimum_version) > 0):
            version_list = list(map(int, env.cmake_minimum_version.split(".")))
            for i in range(max(len(version_list), len(cmake_minimum_version))):
                if i >= len(version_list):
                    break
                if i >= len(cmake_minimum_version):
                    cmake_minimum_version = version_list
                    break
                if version_list[i] > cmake_minimum_version[i]:
                    cmake_minimum_version = version_list
                    break
                # Otherwise keep iterating
        for var in env.cmake_variables:
            if (var in cmake_variables and cmake_variables[var] != env.cmake_variables[var]):
                raise KeyError("CMake variable {} was redefined from {} to {}.".format(
                    var, cmake_variables[var], env.cmake_variables[var]))
            cmake_variables[var] = env.cmake_variables[var]
        cmake_packages |= set(env.cmake_packages)
        cmake_includes |= set(env.cmake_includes)
        cmake_libraries |= set(env.cmake_libraries)
        cmake_compile_flags |= set(env.cmake_compile_flags)
        cmake_link_flags |= set(env.cmake_link_flags)
        # Make path absolute
        env_dir = os.path.dirname(env._dace_file_path)
        cmake_files |= set((f if os.path.isabs(f) else os.path.join(env_dir, f)) +
                           (".cmake" if not f.endswith(".cmake") else "") for f in env.cmake_files)
        for header in env.headers:
            if os.path.isabs(header):
                # Giving an absolute path is not good practice, but allow it
                # for emergency overriding
                cmake_includes.add(os.path.dirname(header))
            abs_path = os.path.join(env_dir, header)
            if os.path.isfile(abs_path):
                # Allow includes stored with the library, specified with a
                # relative path
                cmake_includes.add(env_dir)
                break
    environment_flags = [
        "-DDACE_ENV_MINIMUM_VERSION={}".format(".".join(map(str, cmake_minimum_version))),
        # Make CMake list of key-value pairs
        "-DDACE_ENV_VAR_KEYS=\"{}\"".format(";".join(cmake_variables.keys())),
        "-DDACE_ENV_VAR_VALUES=\"{}\"".format(";".join(cmake_variables.values())),
        "-DDACE_ENV_PACKAGES=\"{}\"".format(" ".join(cmake_packages)),
        "-DDACE_ENV_INCLUDES=\"{}\"".format(" ".join(cmake_includes)),
        "-DDACE_ENV_LIBRARIES=\"{}\"".format(" ".join(cmake_libraries)),
        "-DDACE_ENV_COMPILE_FLAGS=\"{}\"".format(" ".join(cmake_compile_flags)),
        # "-DDACE_ENV_LINK_FLAGS=\"{}\"".format(" ".join(cmake_link_flags)),
        "-DDACE_ENV_CMAKE_FILES=\"{}\"".format(";".join(cmake_files)),
    ]
    # Escape variable expansions to defer their evaluation
    environment_flags = [cmd.replace("$", "_DACE_CMAKE_EXPAND") for cmd in environment_flags]
    cmake_command += environment_flags

    # Replace backslashes with forward slashes
    cmake_command = [cmd.replace('\\', '/') for cmd in cmake_command]

    # Generate CMake options for each compiler
    libraries = set()
    for target_name, target in targets.items():
        try:
            cmake_command += target.cmake_options()
            libraries |= unique_flags(Config.get("compiler", target_name, "libs"))
        except KeyError:
            pass
        except ValueError as ex:
            # Cannot find compiler executable
            raise CompilerConfigurationError(str(ex))

    cmake_command.append("-DDACE_LIBS=\"{}\"".format(" ".join(libraries)))

    # Override linker and linker arguments
    if Config.get('compiler', 'linker', 'executable'):
        cmake_command.append("-DCMAKE_LINKER=\"{}\"".format(
            make_absolute(Config.get('compiler', 'linker', 'executable'))))
    if Config.get('compiler', 'linker', 'args'):
        cmake_command.append("-DCMAKE_SHARED_LINKER_FLAGS=\"{}\"".format(
            Config.get('compiler', 'linker', 'args') + " " + " ".join(cmake_link_flags)), )
    cmake_command = ' '.join(cmake_command)

    cmake_filename = os.path.join(build_folder, 'cmake_configure.sh')

    ##############################################
    # Configure
    try:
        _run_liveoutput(cmake_command, shell=True, cwd=build_folder, output_stream=output_stream)
    except subprocess.CalledProcessError as ex:
        # Clean CMake directory and try once more
        if Config.get_bool('debugprint'):
            print('Cleaning CMake build folder and retrying...')
        shutil.rmtree(build_folder)
        os.makedirs(build_folder)
        try:
            _run_liveoutput(cmake_command, shell=True, cwd=build_folder, output_stream=output_stream)
        except subprocess.CalledProcessError as ex:
            # If still unsuccessful, print results
            if Config.get_bool('debugprint'):
                raise CompilerConfigurationError('Configuration failure')
            else:
                raise CompilerConfigurationError('Configuration failure:\n' + ex.output)

    with open(cmake_filename, "w") as fp:
        fp.write(cmake_command)

    # Compile and link
    try:
        _run_liveoutput("cmake --build . --config %s" % (Config.get('compiler', 'build_type')),
                        shell=True,
                        cwd=build_folder,
                        output_stream=output_stream)
    except subprocess.CalledProcessError as ex:
        # If unsuccessful, print results
        if Config.get_bool('debugprint'):
            raise CompilationError('Compiler failure')
        else:
            raise CompilationError('Compiler failure:\n' + ex.output)

    shared_library_path = os.path.join(
        build_folder, "lib{}.{}".format(program_name, Config.get('compiler', 'library_extension')))

    return shared_library_path
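
# Illustrative usage sketch (an assumption, not part of the original source):
# generating a program folder for an SDFG and building it with the function
# above. `generate_program_folder` lives in the same compiler module of the
# DaCe codebase; exact signatures may vary across versions.
def _example_build():
    import dace
    from dace.codegen import codegen, compiler

    @dace.program
    def triple(A: dace.float64[10]):
        A *= 3

    sdfg = triple.to_sdfg()
    program_objects = codegen.generate_code(sdfg)
    out_folder = compiler.generate_program_folder(sdfg, program_objects, sdfg.build_folder)
    library_path = configure_and_compile(out_folder, sdfg.name)
    print('Built shared library at', library_path)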
def apply(self, sdfg):
    first_state = sdfg.nodes()[self.subgraph[StateFusion._first_state]]
    second_state = sdfg.nodes()[self.subgraph[StateFusion._second_state]]

    # Remove interstate edge(s)
    edges = sdfg.edges_between(first_state, second_state)
    for edge in edges:
        if edge.data.assignments:
            for src, dst, other_data in sdfg.in_edges(first_state):
                other_data.assignments.update(edge.data.assignments)
        sdfg.remove_edge(edge)

    # Special case 1: first state is empty
    if first_state.is_empty():
        sdutil.change_edge_dest(sdfg, first_state, second_state)
        sdfg.remove_node(first_state)
        return

    # Special case 2: second state is empty
    if second_state.is_empty():
        sdutil.change_edge_src(sdfg, second_state, first_state)
        sdutil.change_edge_dest(sdfg, second_state, first_state)
        sdfg.remove_node(second_state)
        return

    # Normal case: both states are not empty

    # Find source/sink (data) nodes
    first_input = [node for node in sdutil.find_source_nodes(first_state) if isinstance(node, nodes.AccessNode)]
    first_output = [node for node in sdutil.find_sink_nodes(first_state) if isinstance(node, nodes.AccessNode)]
    second_input = [node for node in sdutil.find_source_nodes(second_state) if isinstance(node, nodes.AccessNode)]

    # first input = first input - first output
    first_input = [
        node for node in first_input if next((x for x in first_output if x.label == node.label), None) is None
    ]

    # Merge second state to first state
    # First keep a backup of the topological sorted order of the nodes
    order = [x for x in reversed(list(nx.topological_sort(first_state._nx))) if isinstance(x, nodes.AccessNode)]

    for node in second_state.nodes():
        first_state.add_node(node)
    for src, src_conn, dst, dst_conn, data in second_state.edges():
        first_state.add_edge(src, src_conn, dst, dst_conn, data)

    # Merge common (data) nodes
    for node in second_input:
        if first_state.in_degree(node) == 0:
            n = next((x for x in order if x.label == node.label), None)
            if n:
                sdutil.change_edge_src(first_state, node, n)
                first_state.remove_node(node)
                n.access = dtypes.AccessType.ReadWrite

    # Redirect edges and remove second state
    sdutil.change_edge_src(sdfg, second_state, first_state)
    sdfg.remove_node(second_state)
    if Config.get_bool("debugprint"):
        StateFusion._states_fused += 1
def generate_code(sdfg) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use.
        :return: List of code objects that correspond to files to compile.
    """
    # Before compiling, validate SDFG correctness
    sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        import shutil
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            sdfg.save(f'{tmp_dir}/test.sdfg')
            sdfg2 = SDFG.from_file(f'{tmp_dir}/test.sdfg')
            sdfg2.save(f'{tmp_dir}/test2.sdfg')
            print('Testing SDFG serialization...')
            if not filecmp.cmp(f'{tmp_dir}/test.sdfg',
                               f'{tmp_dir}/test2.sdfg'):
                shutil.move(f"{tmp_dir}/test.sdfg", "test.sdfg")
                shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg")
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')

        # Run with the deserialized version
        # NOTE: This means that all subsequent modifications to `sdfg`
        # are not reflected outside of this function (e.g., library
        # node expansion).
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_types.infer_connector_types(sdfg)

    # Set default storage/schedule types in SDFG
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    # Recursively expand library nodes that have not yet been expanded
    sdfg.expand_library_nodes()

    # After expansion, run another pass of connector/type inference
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    frame = framecode.DaCeCodeGenerator()

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in target.TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in target.TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Instantiate all instrumentation providers in SDFG
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    frame._dispatcher.instrumentation[
        dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                provider_mapping[node.map.instrument]
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        frame._dispatcher.instrumentation[sdfg.instrument] = \
            provider_mapping[sdfg.instrument]
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # Add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       target_type='../../include',
                       linkable=False)
    target_objects.append(dummy)

    # Add a sample main function that shows how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg),
                       'cpp',
                       cpu.CPUCodeGen,
                       'SampleMain',
                       target_type='../../sample',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
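# --- Usage sketch (illustrative) ---
# Driving the code generator above and inspecting the resulting files.
# The attribute names are assumed to mirror the CodeObject constructor
# arguments used in the function (name, language, title):
code_objects = generate_code(sdfg)
for obj in code_objects:
    print(obj.name, obj.language, obj.title)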
def _construct_args(self, kwargs) -> Tuple[Tuple[Any], Tuple[Any]]:
    """ Main function that controls argument construction for calling
        the C prototype of the SDFG.

        Organizes arguments first by `sdfg.arglist`, then data descriptors
        by alphabetical order, then symbols by alphabetical order.
    """
    # Return value initialization (for values that have not been given)
    self._initialize_return_values(kwargs)
    if self._return_arrays is not None:
        if len(self._retarray_shapes) == 1:
            kwargs[self._retarray_shapes[0][0]] = self._return_arrays
        else:
            for desc, arr in zip(self._retarray_shapes, self._return_arrays):
                kwargs[desc[0]] = arr

    # Argument construction
    sig = self._sig
    typedict = self._typedict
    if len(kwargs) > 0:
        # Construct mapping from arguments to signature
        arglist = []
        argtypes = []
        argnames = []
        for a in sig:
            try:
                arglist.append(kwargs[a])
                argtypes.append(typedict[a])
                argnames.append(a)
            except KeyError:
                raise KeyError("Missing program argument \"{}\"".format(a))
    else:
        arglist = []
        argtypes = []
        argnames = []
        sig = []

    # Type checking
    for a, arg, atype in zip(argnames, arglist, argtypes):
        if not dtypes.is_array(arg) and isinstance(atype, dt.Array):
            if isinstance(arg, list):
                print('WARNING: Casting list argument "%s" to ndarray' % a)
            elif arg is None:
                # None values are passed as null pointers
                pass
            else:
                raise TypeError(
                    'Passing an object (type %s) to an array in argument '
                    '"%s"' % (type(arg).__name__, a))
        elif dtypes.is_array(arg) and not isinstance(atype, dt.Array):
            # GPU scalars are pointers, so this is fine
            if atype.storage != dtypes.StorageType.GPU_Global:
                raise TypeError(
                    'Passing an array to a scalar (type %s) in argument '
                    '"%s"' % (atype.dtype.ctype, a))
        elif (not isinstance(atype, dt.Array)
              and not isinstance(atype.dtype, dtypes.callback)
              and not isinstance(arg, (atype.dtype.type, sp.Basic))
              and not (isinstance(arg, symbolic.symbol)
                       and arg.dtype == atype.dtype)):
            if isinstance(arg, int) and atype.dtype.type == np.int64:
                pass
            elif isinstance(arg, float) and atype.dtype.type == np.float64:
                pass
            elif (isinstance(arg, int) and atype.dtype.type == np.int32
                  and abs(arg) <= (1 << 31) - 1):
                pass
            elif (isinstance(arg, int) and atype.dtype.type == np.uint32
                  and arg >= 0 and arg <= (1 << 32) - 1):
                pass
            else:
                print('WARNING: Casting scalar argument "%s" from %s to %s' %
                      (a, type(arg).__name__, atype.dtype.type))
        elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray)
              and atype.dtype.as_numpy_dtype() != arg.dtype):
            # Make an exception for vector types
            if (isinstance(atype.dtype, dtypes.vector)
                    and atype.dtype.vtype.as_numpy_dtype() == arg.dtype):
                pass
            else:
                print('WARNING: Passing %s array argument "%s" to a %s '
                      'array' % (arg.dtype, a, atype.dtype.type.__name__))
        elif (isinstance(atype, dt.Array) and isinstance(arg, np.ndarray)
              and arg.base is not None and '__return' not in a
              and not Config.get_bool('compiler', 'allow_view_arguments')):
            raise TypeError(
                'Passing a numpy view (e.g., sub-array or "A.T") to DaCe '
                'programs is not allowed in order to retain analyzability. '
                'Please make a copy with "numpy.copy(...)". If you know what '
                'you are doing, you can override this error in the '
                'configuration by setting compiler.allow_view_arguments '
                'to True.')

    # Explicit casting
    for index, (arg, argtype) in enumerate(zip(arglist, argtypes)):
        # Call a wrapper function to make NumPy arrays from pointers
        if isinstance(argtype.dtype, dtypes.callback):
            arglist[index] = argtype.dtype.get_trampoline(arg, kwargs)
        # List to array
        elif isinstance(arg, list) and isinstance(argtype, dt.Array):
            arglist[index] = np.array(arg, dtype=argtype.dtype.type)
        # Null pointer
        elif arg is None and isinstance(argtype, dt.Array):
            arglist[index] = ctypes.c_void_p(0)

    # Retain only the element datatype for upcoming checks and casts
    arg_ctypes = [t.dtype.as_ctypes() for t in argtypes]

    sdfg = self._sdfg

    # Obtain SDFG constants
    constants = sdfg.constants

    # Remove symbolic constants from arguments
    callparams = tuple(
        (arg, actype, atype)
        for arg, actype, atype in zip(arglist, arg_ctypes, argtypes)
        if not symbolic.issymbolic(arg) or (
            hasattr(arg, 'name') and arg.name not in constants))

    # Replace symbols with their values
    callparams = tuple(
        (actype(arg.get()), actype, atype)
        if isinstance(arg, symbolic.symbol) else (arg, actype, atype)
        for arg, actype, atype in callparams)

    # Replace arrays with their base host/device pointers
    newargs = tuple(
        (ctypes.c_void_p(_array_interface_ptr(arg, atype)), actype, atype)
        if dtypes.is_array(arg) else (arg, actype, atype)
        for arg, actype, atype in callparams)

    initargs = tuple(atup for atup in callparams
                     if not dtypes.is_array(atup[0]))

    newargs = tuple(
        actype(arg) if not isinstance(arg, ctypes._SimpleCData) else arg
        for arg, actype, atype in newargs)

    initargs = tuple(
        actype(arg) if not isinstance(arg, ctypes._SimpleCData) else arg
        for arg, actype, atype in initargs)

    self._lastargs = newargs, initargs
    return self._lastargs
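# --- Usage sketch (illustrative) ---
# _construct_args is invoked internally when a compiled SDFG is called.
# A sketch of the calling convention it services, using the standard
# dace.program / to_sdfg / compile API with one array and one symbol:
import numpy as np
import dace

N = dace.symbol('N')

@dace.program
def scale(A: dace.float64[N]):
    A *= 2.0

A = np.ones(16)
csdfg = scale.to_sdfg().compile()
csdfg(A=A, N=16)  # keyword arguments are matched against sdfg.arglist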
def apply(self, sdfg: sd.SDFG):
    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors
    input_nodes = []
    output_nodes = []
    global_code_nodes = [[] for _ in sdfg.nodes()]

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient == False):
                if (state.out_degree(node) > 0
                        and node.data not in input_nodes):
                    input_nodes.append((node.data, node.desc(sdfg)))
                if (state.in_degree(node) > 0
                        and node.data not in output_nodes):
                    output_nodes.append((node.data, node.desc(sdfg)))
            elif isinstance(node, nodes.CodeNode) and sdict[node] is None:
                if not isinstance(node, nodes.EmptyTasklet):
                    global_code_nodes[i].append(node)

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None and e.data.wcr_identity is None:
                if (e.data.data not in input_nodes
                        and sdfg.arrays[e.data.data].transient == False):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in set(input_nodes):
        newdesc = inode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + inodename, newdesc)
        cloned_arrays[inodename] = 'gpu_' + inodename
    for onodename, onode in set(output_nodes):
        if onodename in cloned_arrays:
            continue
        newdesc = onode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + onodename, newdesc)
        cloned_arrays[onodename] = 'gpu_' + onodename

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state

    excluded_copyin = self.exclude_copyin.split(',')

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, ed.InterstateEdge())

    for nname, desc in set(input_nodes):
        if nname in excluded_copyin:
            continue
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state

    excluded_copyout = self.exclude_copyout.split(',')

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, ed.InterstateEdge())

    for nname, desc in set(output_nodes):
        if nname in excluded_copyout:
            continue
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient):
                nodedesc = node.desc(sdfg)

                # Special case: nodes that lead to dynamic map ranges must
                # stay on host
                if any(
                        isinstance(state.memlet_path(e)[-1].dst,
                                   nodes.EntryNode)
                        for e in state.out_edges(node)):
                    continue

                if sdict[node] is None:
                    # NOTE: The cloned arrays match too, but they already
                    # use GPU global storage, so re-setting it is a no-op
                    nodedesc.storage = dtypes.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    if (self.toplevel_trans
                            and not isinstance(nodedesc, data.Stream)):
                        nodedesc.toplevel = True
                else:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = dtypes.StorageType.Register

    #######################################################
    # Step 5: Wrap free tasklets and nested SDFGs with a GPU map

    for state, gcodes in zip(sdfg.nodes(), global_code_nodes):
        for gcode in gcodes:
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=dtypes.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = set('IN_' + e.dst_conn for e in in_edges)
            me.out_connectors = set('OUT_' + e.dst_conn for e in in_edges)
            mx.in_connectors = set('IN_' + e.src_conn for e in out_edges)
            mx.out_connectors = set('OUT_' + e.src_conn for e in out_edges)

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.EmptyMemlet())

    #######################################################
    # Step 6: Change all top-level maps to GPU maps

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, nodes.EntryNode):
                if sdict[node] is None:
                    node.schedule = dtypes.ScheduleType.GPU_Device
                elif self.sequential_innermaps:
                    node.schedule = dtypes.ScheduleType.Sequential

    #######################################################
    # Step 7: Introduce copy-out if data is used in outgoing interstate
    # edges

    for state in list(sdfg.nodes()):
        arrays_used = set()
        for e in sdfg.out_edges(state):
            # Used arrays = intersection between symbols and cloned arrays
            arrays_used.update(
                set(e.data.condition_symbols())
                & set(cloned_arrays.keys()))

        # Create a state and copy out used arrays
        if len(arrays_used) > 0:
            co_state = sdfg.add_state(state.label + '_icopyout')

            # Reconnect outgoing edges to the interim copy-out state
            for e in sdfg.out_edges(state):
                nxutil.change_edge_src(sdfg, state, co_state)
            # Add an unconditional edge to the interim state
            sdfg.add_edge(state, co_state, ed.InterstateEdge())

            # Add copy-out nodes
            for nname in arrays_used:
                desc = sdfg.arrays[nname]
                src_array = nodes.AccessNode(cloned_arrays[nname],
                                             debuginfo=desc.debuginfo)
                dst_array = nodes.AccessNode(nname,
                                             debuginfo=desc.debuginfo)
                co_state.add_node(src_array)
                co_state.add_node(dst_array)
                co_state.add_nedge(
                    src_array, dst_array,
                    memlet.Memlet.from_array(dst_array.data,
                                             dst_array.desc(sdfg)))

    #######################################################
    # Step 8: Strict transformations

    if not self.strict_transform:
        return

    # Apply strict state fusions greedily
    opt = optimizer.SDFGOptimizer(sdfg, inplace=True)
    fusions = 0
    arrays = 0
    options = [
        match for match in opt.get_pattern_matches(strict=True)
        if isinstance(match, (StateFusion, RedundantArray))
    ]
    while options:
        ssdfg = sdfg.sdfg_list[options[0].sdfg_id]
        options[0].apply(ssdfg)
        ssdfg.validate()
        if isinstance(options[0], StateFusion):
            fusions += 1
        if isinstance(options[0], RedundantArray):
            arrays += 1

        options = [
            match for match in opt.get_pattern_matches(strict=True)
            if isinstance(match, (StateFusion, RedundantArray))
        ]

    if Config.get_bool('debugprint') and (fusions > 0 or arrays > 0):
        print('Automatically applied {} strict state fusions and removed'
              ' {} redundant arrays.'.format(fusions, arrays))
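# --- Usage sketch (illustrative) ---
# Applying the whole-SDFG GPU transformation above. In current DaCe the
# class is importable from dace.transformation.interstate; the option
# name matches the property referenced in apply(). Exact paths may
# differ for the (older) version shown here.
from dace.transformation.interstate import GPUTransformSDFG

sdfg.apply_transformations(GPUTransformSDFG,
                           options={'sequential_innermaps': False})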
def configure_and_compile(program_folder, program_name=None,
                          output_stream=None):
    """ Configures and compiles a DaCe program in the specified folder
        into a shared library file.

        :param program_folder: Folder containing all files necessary to
                               build, equivalent to what was passed to
                               `generate_program_folder`.
        :param output_stream: Additional output stream to write to
                              (used for DIODE client).
        :return: Path to the compiled shared library file.
    """
    if program_name is None:
        program_name = os.path.basename(program_folder)
    program_folder = os.path.abspath(program_folder)
    src_folder = os.path.join(program_folder, "src")

    # Prepare build folder
    build_folder = os.path.join(program_folder, "build")
    os.makedirs(build_folder, exist_ok=True)

    # Prepare performance report folder
    os.makedirs(os.path.join(program_folder, "perf"), exist_ok=True)

    # Read list of DaCe files to compile.
    # We do this instead of iterating over source files in the directory to
    # avoid globbing files from previous compilations, such that we don't
    # need to wipe the directory for every compilation.
    file_list = [
        line.strip().split(",")
        for line in open(os.path.join(program_folder, "dace_files.csv"), "r")
    ]

    # Get absolute paths and targets for all source files
    files = []
    targets = {}  # {target name: target class}
    for target_name, target_type, file_name in file_list:
        if target_type:
            path = os.path.join(target_name, target_type, file_name)
        else:
            path = os.path.join(target_name, file_name)
        files.append(path)
        targets[target_name] = next(
            k for k, v in TargetCodeGenerator.extensions().items()
            if v['name'] == target_name)

    # Windows-only workaround: Override Visual C++'s linker to use
    # Multi-Threaded (MT) mode. This fixes linkage in CUDA applications
    # where CMake fails to do so.
    if os.name == 'nt':
        if '_CL_' not in os.environ:
            os.environ['_CL_'] = '/MT'
        elif '/MT' not in os.environ['_CL_']:
            os.environ['_CL_'] = os.environ['_CL_'] + ' /MT'

    # Start forming CMake command
    dace_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cmake_command = [
        "cmake",
        "-A x64" if os.name == 'nt' else "",  # Windows-specific flag
        '"' + os.path.join(dace_path, "codegen") + '"',
        "-DDACE_SRC_DIR=\"{}\"".format(src_folder),
        "-DDACE_FILES=\"{}\"".format(";".join(files)),
        "-DDACE_PROGRAM_NAME={}".format(program_name),
    ]

    # Get the required environments and retrieve their CMake information
    environments = set(
        l.strip() for l in open(
            os.path.join(program_folder, "dace_environments.csv"), "r"))
    environments = dace.library.get_environments_and_dependencies(
        environments)
    environment_flags, cmake_link_flags = get_environment_flags(environments)
    cmake_command += environment_flags

    # Replace backslashes with forward slashes
    cmake_command = [cmd.replace('\\', '/') for cmd in cmake_command]

    # Generate CMake options for each compiler
    libraries = set()
    for target_name, target in targets.items():
        try:
            cmake_command += target.cmake_options()
            libraries |= unique_flags(
                Config.get("compiler", target_name, "libs"))
        except KeyError:
            pass
        except ValueError as ex:  # Cannot find compiler executable
            raise cgx.CompilerConfigurationError(str(ex))

    cmake_command.append("-DDACE_LIBS=\"{}\"".format(" ".join(libraries)))

    # Override linker and linker arguments
    if Config.get('compiler', 'linker', 'executable'):
        cmake_command.append("-DCMAKE_LINKER=\"{}\"".format(
            make_absolute(Config.get('compiler', 'linker', 'executable'))))
    if Config.get('compiler', 'linker', 'args') is not None:
        cmake_command.append("-DCMAKE_SHARED_LINKER_FLAGS=\"{}\"".format(
            Config.get('compiler', 'linker', 'args') + " " +
            " ".join(cmake_link_flags)))
    cmake_command = ' '.join(cmake_command)

    cmake_filename = os.path.join(build_folder, 'cmake_configure.sh')

    ##############################################
    # Configure
    try:
        _run_liveoutput(cmake_command,
                        shell=True,
                        cwd=build_folder,
                        output_stream=output_stream)
    except subprocess.CalledProcessError as ex:
        # Clean CMake directory and try once more
        if Config.get_bool('debugprint'):
            print('Cleaning CMake build folder and retrying...')
        shutil.rmtree(build_folder)
        os.makedirs(build_folder)
        try:
            _run_liveoutput(cmake_command,
                            shell=True,
                            cwd=build_folder,
                            output_stream=output_stream)
        except subprocess.CalledProcessError as ex:
            # If still unsuccessful, print results
            if Config.get_bool('debugprint'):
                raise cgx.CompilerConfigurationError('Configuration failure')
            else:
                raise cgx.CompilerConfigurationError(
                    'Configuration failure:\n' + ex.output)

    with open(cmake_filename, "w") as fp:
        fp.write(cmake_command)

    # Compile and link
    try:
        _run_liveoutput("cmake --build . --config %s" %
                        (Config.get('compiler', 'build_type')),
                        shell=True,
                        cwd=build_folder,
                        output_stream=output_stream)
    except subprocess.CalledProcessError as ex:
        # If unsuccessful, print results
        if Config.get_bool('debugprint'):
            raise cgx.CompilationError('Compiler failure')
        else:
            raise cgx.CompilationError('Compiler failure:\n' + ex.output)

    shared_library_path = os.path.join(
        build_folder, "lib{}.{}".format(
            program_name, Config.get('compiler', 'library_extension')))

    return shared_library_path
def __init__(self, base_indentation=0):
    super(CodeIOStream, self).__init__()
    # Start at the requested indentation level (previously the parameter
    # was accepted but ignored)
    self._indent = base_indentation
    self._spaces = int(Config.get('compiler', 'indentation_spaces'))
    self._lineinfo = Config.get_bool('compiler', 'codegen_lineinfo')
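# --- Usage sketch (illustrative) ---
# CodeIOStream is the indented-output stream used throughout code
# generation. This assumes the write()/getvalue() interface (getvalue
# comes from the StringIO base class) and the convention that braces in
# emitted lines adjust the indentation level:
stream = CodeIOStream()
stream.write('int main() {')
stream.write('return 0;')  # emitted one level deeper
stream.write('}')
print(stream.getvalue())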
def generate_code(sdfg) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use.
        :return: List of code objects that correspond to files to compile.
    """
    # Before compiling, validate SDFG correctness
    sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        sdfg.save('test.sdfg')
        sdfg2 = SDFG.from_file('test.sdfg')
        sdfg2.save('test2.sdfg')
        print('Testing SDFG serialization...')
        if not filecmp.cmp('test.sdfg', 'test2.sdfg'):
            raise RuntimeError(
                'SDFG serialization failed - files do not match')
        os.remove('test.sdfg')
        os.remove('test2.sdfg')

        # Run with the deserialized version
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_connector_types(sdfg)

    frame = framecode.DaCeCodeGenerator()

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in target.TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in target.TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Instantiate all instrumentation providers in SDFG
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    frame._dispatcher.instrumentation[
        dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                provider_mapping[node.map.instrument]
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # Add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       linkable=False)
    target_objects.append(dummy)

    # Add a dummy main function that shows how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg),
                       'cpp',
                       cpu.CPUCodeGen,
                       'DummyMain',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
def configure_and_compile(program_folder, program_name=None):
    """ Configures and compiles a DaCe program in the specified folder
        into a shared library file.

        @param program_folder: Folder containing all files necessary to
                               build, equivalent to what was passed to
                               `generate_program_folder`.
        @return: Path to the compiled shared library file.
    """
    if program_name is None:
        program_name = os.path.basename(program_folder)
    program_folder = os.path.abspath(program_folder)
    src_folder = os.path.join(program_folder, "src")

    # Prepare build folder
    build_folder = os.path.join(program_folder, "build")
    try:
        os.makedirs(build_folder)
    except FileExistsError:
        pass

    # Read list of DaCe files to compile.
    # We do this instead of iterating over source files in the directory to
    # avoid globbing files from previous compilations, such that we don't
    # need to wipe the directory for every compilation.
    file_list = [
        line.strip().split(",")
        for line in open(os.path.join(program_folder, "dace_files.csv"), "r")
    ]

    # Get absolute paths and targets for all source files
    files = []
    targets = {}  # {target name: target class}
    for target_name, file_name in file_list:
        path = os.path.join(src_folder, target_name, file_name)
        files.append(path)
        targets[target_name] = codegen.STRING_TO_TARGET[target_name]

    # Start forming CMake command
    dace_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cmake_command = [
        "cmake",
        "-A x64" if os.name == 'nt' else "",  # Windows-specific flag
        '"' + os.path.join(dace_path, "codegen") + '"',
        "-DDACE_FILES=\"{}\"".format(";".join(files)),
        "-DDACE_PROGRAM_NAME={}".format(program_name),
    ]

    # Replace backslashes with forward slashes
    cmake_command = [cmd.replace('\\', '/') for cmd in cmake_command]

    # Generate CMake options for each compiler
    libraries = set()
    for target_name, target in targets.items():
        cmake_command += target.cmake_options()
        try:
            libraries |= unique_flags(
                Config.get("compiler", target_name, "libs"))
        except KeyError:
            pass

    # TODO: it should be possible to use the default arguments/compilers
    #       found by CMake
    cmake_command += [
        "-DDACE_LIBS=\"{}\"".format(" ".join(libraries)),
        "-DCMAKE_LINKER=\"{}\"".format(
            make_absolute(Config.get('compiler', 'linker', 'executable'))),
        "-DCMAKE_SHARED_LINKER_FLAGS=\"{}\"".format(
            Config.get('compiler', 'linker', 'args') +
            Config.get('compiler', 'linker', 'additional_args')),
    ]

    ##############################################
    # Configure
    try:
        _run_liveoutput(" ".join(cmake_command),
                        shell=True,
                        cwd=build_folder)
    except subprocess.CalledProcessError as ex:
        # Clean CMake directory and try once more
        if Config.get_bool('debugprint'):
            print('Cleaning CMake build folder and retrying...')
        shutil.rmtree(build_folder)
        os.makedirs(build_folder)
        try:
            _run_liveoutput(" ".join(cmake_command),
                            shell=True,
                            cwd=build_folder)
        except subprocess.CalledProcessError as ex:
            # If still unsuccessful, print results
            if Config.get_bool('debugprint'):
                raise CompilerConfigurationError('Configuration failure')
            else:
                raise CompilerConfigurationError(
                    'Configuration failure:\n' + ex.output)

    # Compile and link
    try:
        _run_liveoutput("cmake --build . --config %s" %
                        (Config.get('compiler', 'build_type')),
                        shell=True,
                        cwd=build_folder)
    except subprocess.CalledProcessError as ex:
        # If unsuccessful, print results
        if Config.get_bool('debugprint'):
            raise CompilationError('Compiler failure')
        else:
            raise CompilationError('Compiler failure:\n' + ex.output)

    shared_library_path = os.path.join(
        build_folder, "lib{}.{}".format(
            program_name, Config.get('compiler', 'library_extension')))

    return shared_library_path
def apply(self, sdfg):
    first_state = sdfg.nodes()[self.subgraph[StateFusion._first_state]]
    second_state = sdfg.nodes()[self.subgraph[StateFusion._second_state]]

    # Remove interstate edge(s)
    edges = sdfg.edges_between(first_state, second_state)
    for edge in edges:
        if edge.data.assignments:
            for src, dst, other_data in sdfg.in_edges(first_state):
                other_data.assignments.update(edge.data.assignments)
        sdfg.remove_edge(edge)

    # Special case 1: first state is empty
    if first_state.is_empty():
        nxutil.change_edge_dest(sdfg, first_state, second_state)
        sdfg.remove_node(first_state)
        return

    # Special case 2: second state is empty
    if second_state.is_empty():
        nxutil.change_edge_src(sdfg, second_state, first_state)
        nxutil.change_edge_dest(sdfg, second_state, first_state)
        sdfg.remove_node(second_state)
        return

    # Normal case: both states are not empty

    # Find source/sink (data) nodes
    first_input = [
        node for node in nxutil.find_source_nodes(first_state)
        if isinstance(node, nodes.AccessNode)
    ]
    first_output = [
        node for node in nxutil.find_sink_nodes(first_state)
        if isinstance(node, nodes.AccessNode)
    ]
    second_input = [
        node for node in nxutil.find_source_nodes(second_state)
        if isinstance(node, nodes.AccessNode)
    ]

    # first input = first input - first output
    first_input = [
        node for node in first_input
        if next((x for x in first_output
                 if x.label == node.label), None) is None
    ]

    # Merge second state into first state
    for node in second_state.nodes():
        first_state.add_node(node)
    for src, src_conn, dst, dst_conn, data in second_state.edges():
        first_state.add_edge(src, src_conn, dst, dst_conn, data)

    # Merge common (data) nodes
    for node in first_input:
        try:
            old_node = next(x for x in second_input
                            if x.label == node.label)
        except StopIteration:
            continue
        nxutil.change_edge_src(first_state, old_node, node)
        first_state.remove_node(old_node)
        second_input.remove(old_node)
    for node in first_output:
        try:
            new_node = next(x for x in second_input
                            if x.label == node.label)
        except StopIteration:
            continue
        nxutil.change_edge_dest(first_state, node, new_node)
        first_state.remove_node(node)
        second_input.remove(new_node)

    # Redirect edges and remove second state
    nxutil.change_edge_src(sdfg, second_state, first_state)
    sdfg.remove_node(second_state)
    if Config.get_bool("debugprint"):
        StateFusion._states_fused += 1
def generate_code(sdfg, validate=True) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use.
        :param validate: If True, validates the SDFG before generating
                         the code.
        :return: List of code objects that correspond to files to compile.
    """
    from dace.codegen.targets.target import TargetCodeGenerator  # Avoid import loop

    # Before compiling, validate SDFG correctness
    if validate:
        sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        import shutil
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            sdfg.save(f'{tmp_dir}/test.sdfg')
            sdfg2 = SDFG.from_file(f'{tmp_dir}/test.sdfg')
            sdfg2.save(f'{tmp_dir}/test2.sdfg')
            print('Testing SDFG serialization...')
            if not filecmp.cmp(f'{tmp_dir}/test.sdfg',
                               f'{tmp_dir}/test2.sdfg'):
                shutil.move(f"{tmp_dir}/test.sdfg", "test.sdfg")
                shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg")
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')

        # Run with the deserialized version
        # NOTE: This means that all subsequent modifications to `sdfg`
        # are not reflected outside of this function (e.g., library
        # node expansion).
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_types.infer_connector_types(sdfg)

    # Set default storage/schedule types in SDFG
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    # Recursively expand library nodes that have not yet been expanded
    sdfg.expand_library_nodes()

    # After expansion, run another pass of connector/type inference
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    frame = framecode.DaCeCodeGenerator(sdfg)

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Query all code generation targets and instrumentation providers in SDFG
    _get_codegen_targets(sdfg, frame)

    # Preprocess SDFG
    for tgt in frame.targets:
        tgt.preprocess(sdfg)

    # Instantiate instrumentation providers
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # NOTE: THE SDFG IS ASSUMED TO BE FROZEN (i.e., NOT CHANGED) FROM THIS
    # POINT ONWARDS

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments,
                   sdfg=sdfg)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # Ensure that no new targets were dynamically added
    assert frame._dispatcher.used_targets == (frame.targets - {frame})

    # Add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg, frame),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       target_type='../../include',
                       linkable=False)
    target_objects.append(dummy)

    for env in dace.library.get_environments_and_dependencies(
            used_environments):
        if hasattr(env, "codeobjects"):
            target_objects.extend(env.codeobjects)

    # Add a sample main function that shows how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg, frame),
                       'cpp',
                       cpu.CPUCodeGen,
                       'SampleMain',
                       target_type='../../sample',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
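# --- Usage sketch (illustrative) ---
# The serialization self-test in the functions above is gated by a
# configuration entry; Config.set is the standard DaCe setter, so the
# test can be enabled programmatically before generating code:
from dace.config import Config

Config.set('testing', 'serialization', value=True)
code_objects = generate_code(sdfg, validate=True)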
def optimize(self):
    """ A command-line UI for applying patterns on the SDFG.

        :return: An optimized SDFG object.
    """
    sdfg_file = self.sdfg.name + '.sdfg'
    if os.path.isfile(sdfg_file):
        ui_input = input('An SDFG with the filename "%s" was found. '
                         'Would you like to use it instead? [Y/n] ' %
                         sdfg_file)
        if len(ui_input) == 0 or ui_input[0] not in ['n', 'N']:
            return dace.SDFG.from_file(sdfg_file)

    # Visualize SDFGs during the optimization process
    VISUALIZE_SDFV = Config.get_bool('optimizer', 'visualize_sdfv')
    SAVE_INTERMEDIATE = Config.get_bool('optimizer', 'save_intermediate')

    if SAVE_INTERMEDIATE:
        self.sdfg.save(os.path.join('_dacegraphs', 'before.sdfg'))
    if VISUALIZE_SDFV:
        from diode import sdfv
        sdfv.view(os.path.join('_dacegraphs', 'before.sdfg'))

    # Optimize until there are no more pattern matches or the user stops
    # the process
    pattern_counter = 0
    while True:
        # Print all pattern-matching options in the UI
        ui_options = sorted(self.get_pattern_matches())
        ui_options_idx = 0
        for pattern_match in ui_options:
            sdfg = self.sdfg.sdfg_list[pattern_match.sdfg_id]
            print('%d. Transformation %s' %
                  (ui_options_idx, pattern_match.print_match(sdfg)))
            ui_options_idx += 1

        # If no pattern matches were found, quit
        if ui_options_idx == 0:
            print('No viable transformations found')
            break

        ui_input = input(
            'Select the pattern to apply (0 - %d or name$id): ' %
            (ui_options_idx - 1))
        pattern_name, occurrence, param_dict = _parse_cli_input(ui_input)

        pattern_match = None
        if (pattern_name is None and occurrence >= 0
                and occurrence < ui_options_idx):
            pattern_match = ui_options[occurrence]
        elif pattern_name is not None:
            counter = 0
            for match in ui_options:
                if type(match).__name__ == pattern_name:
                    if occurrence == counter:
                        pattern_match = match
                        break
                    counter = counter + 1

        if pattern_match is None:
            print('You did not select a valid option. '
                  'Quitting optimization ...')
            break

        match_id = (str(occurrence) if pattern_name is None else
                    '%s$%d' % (pattern_name, occurrence))
        sdfg = self.sdfg.sdfg_list[pattern_match.sdfg_id]
        print('You selected (%s) pattern %s with parameters %s' %
              (match_id, pattern_match.print_match(sdfg), str(param_dict)))

        # Set each parameter of the parameter dictionary separately
        for k, v in param_dict.items():
            setattr(pattern_match, k, v)

        pattern_match.apply(sdfg)
        self.applied_patterns.add(type(pattern_match))

        if SAVE_INTERMEDIATE:
            filename = 'after_%d_%s_b4lprop' % (
                pattern_counter + 1, type(pattern_match).__name__)
            self.sdfg.save(os.path.join('_dacegraphs', filename + '.sdfg'))

        if not pattern_match.annotates_memlets():
            propagation.propagate_memlets_sdfg(self.sdfg)

        pattern_counter += 1
        if SAVE_INTERMEDIATE:
            filename = 'after_%d_%s' % (pattern_counter,
                                        type(pattern_match).__name__)
            self.sdfg.save(os.path.join('_dacegraphs', filename + '.sdfg'))
            if VISUALIZE_SDFV:
                from diode import sdfv
                sdfv.view(os.path.join('_dacegraphs', filename + '.sdfg'))

    return self.sdfg
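# --- Usage sketch (illustrative) ---
# A hypothetical interactive session with the optimizer above; the
# `name$id` form parsed by _parse_cli_input selects the id-th occurrence
# of a transformation by class name:
#
#   0. Transformation StateFusion in [state_0, state_1]
#   1. Transformation MapTiling in state_1
#   Select the pattern to apply (0 - 1 or name$id): StateFusion$0
#
from dace.transformation.optimizer import SDFGOptimizer

opt = SDFGOptimizer(sdfg)
optimized_sdfg = opt.optimize()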
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    if self.expr_index == 0:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
        node_schedprop = cnode.map
        exit_nodes = graph.exit_nodes(cnode)
    else:
        cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
        node_schedprop = cnode
        exit_nodes = [cnode]

    # Change schedule
    node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
    if Config.get_bool("debugprint"):
        GPUTransformMap._maps_transformed += 1

    gpu_storage_types = [
        dtypes.StorageType.GPU_Global,
        dtypes.StorageType.GPU_Shared,
        dtypes.StorageType.GPU_Stack  #, dtypes.StorageType.CPU_Pinned
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(list(graph.out_edges(enode)))
    in_arrays_to_clone = set()
    out_arrays_to_clone = set()
    out_streamarrays = {}
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        if isinstance(data_node.desc(sdfg), data.Scalar):
            continue
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            in_arrays_to_clone.add(data_node)
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if isinstance(data_node.desc(sdfg), data.Scalar):
            continue
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            # Stream directly connected to an array
            if sd.is_array_stream_view(sdfg, graph, data_node):
                datadesc = data_node.desc(sdfg)
                if datadesc.transient is False:
                    raise TypeError('Non-transient stream-array views are '
                                    'unsupported')
                # Add parent node to clone
                out_arrays_to_clone.add(graph.out_edges(data_node)[0].dst)
                out_streamarrays[graph.out_edges(data_node)[0].dst] = \
                    data_node
                # Do not clone the stream itself
                continue
            out_arrays_to_clone.add(data_node)
    if Config.get_bool("debugprint"):
        GPUTransformMap._arrays_removed += (len(in_arrays_to_clone) +
                                            len(out_arrays_to_clone))

    # Second, create a GPU clone of each array
    cloned_arrays = {}
    in_cloned_arraynodes = {}
    out_cloned_arraynodes = {}
    for array_node in in_arrays_to_clone:
        array = array_node.desc(sdfg)
        if array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            cloned_array = array.clone()
            cloned_array.storage = dtypes.StorageType.GPU_Global
            cloned_array.transient = True
            sdfg.add_datadesc('gpu_' + array_node.data, cloned_array)
            cloned_arrays[array_node.data] = 'gpu_' + array_node.data
        cloned_node = type(array_node)('gpu_' + array_node.data)
        in_cloned_arraynodes[array_node.data] = cloned_node
    for array_node in out_arrays_to_clone:
        array = array_node.desc(sdfg)
        if array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            cloned_array = array.clone()
            cloned_array.storage = dtypes.StorageType.GPU_Global
            cloned_array.transient = True
            sdfg.add_datadesc('gpu_' + array_node.data, cloned_array)
            cloned_arrays[array_node.data] = 'gpu_' + array_node.data
        cloned_node = type(array_node)('gpu_' + array_node.data)
        out_cloned_arraynodes[array_node.data] = cloned_node

    # Third, connect the cloned arrays to the originals
    # TODO(later): Shift indices and create only the necessary sub-arrays
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in graph.in_edges(cnode):
            if edge.data.data == array_name:
                graph.remove_edge(edge)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(node, edge.src_conn, edge.dst, edge.dst_conn,
                               newmemlet)
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                graph.add_edge(edge.src, None, node, None, edge.data)
    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in all_out_edges:
            if edge.data.data == array_name:
                graph.remove_edge(edge)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(edge.src, edge.src_conn, node, edge.dst_conn,
                               newmemlet)
                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                    edge.data.other_subset = edge.data.subset
                graph.add_edge(node, None, edge.dst, None, edge.data)

    # Reconnect stream-arrays
    for array_node, streamnode in out_streamarrays.items():
        # Set stream storage to GPU
        streamnode.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        cloned_node = out_cloned_arraynodes[array_node.data]

        e = graph.out_edges(streamnode)[0]
        graph.remove_edge(e)
        newmemlet = copy.copy(e.data)
        newmemlet.data = cloned_node.data
        # Stream -> cloned array
        graph.add_edge(e.src, e.src_conn, cloned_node, e.dst_conn, newmemlet)
        # Cloned array -> original array
        graph.add_nedge(cloned_node, array_node, e.data)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if (edge.data.data is not None
                    and edge.data.data in cloned_arrays):
                edge.data.data = cloned_arrays[edge.data.data]
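# --- Usage sketch (illustrative) ---
# Unlike GPUTransformSDFG, GPUTransformMap targets a single map (or
# reduce node). In current DaCe it is importable from
# dace.transformation.dataflow; `fullcopy` matches the property
# referenced in apply() above:
from dace.transformation.dataflow import GPUTransformMap

sdfg.apply_transformations(GPUTransformMap, options={'fullcopy': True})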