class BlockGather(MPINode):
    """
    MPI library node with one input connector (``_inp_buffer``) and one
    output connector (``_out_buffer``), expanded through
    ``ExpandBlockGatherMPI``.
    """

    # Global properties
    implementations = {
        "MPI": ExpandBlockGatherMPI,
    }
    default_implementation = "MPI"

    # Names stored as plain strings; the constructor defaults all use 'tmp'
    subarray_type = properties.Property(dtype=str, default='tmp')
    gather_grid = properties.Property(dtype=str, default='tmp')
    reduce_grid = properties.Property(dtype=str, allow_none=True, default=None)

    def __init__(self, name, subarray_type='tmp', gather_grid='tmp', reduce_grid=None, *args, **kwargs):
        """
        Creates the node with its fixed connector set and stores the three
        string properties.

        :param name: Node name, forwarded to the base class.
        :param subarray_type: Value for the ``subarray_type`` property.
        :param gather_grid: Value for the ``gather_grid`` property.
        :param reduce_grid: Value for the ``reduce_grid`` property (may be None).
        """
        super().__init__(name, *args, inputs={"_inp_buffer"}, outputs={"_out_buffer"}, **kwargs)
        self.subarray_type = subarray_type
        self.gather_grid = gather_grid
        self.reduce_grid = reduce_grid

    def validate(self, sdfg, state):
        """
        Looks up the data descriptors connected to this node.

        :param sdfg: The SDFG whose ``arrays`` hold the descriptors.
        :param state: The state containing this node.
        :return: A two-tuple (inbuffer, outbuffer) of data descriptors in the
                 parent SDFG; an entry is None if no edge uses the
                 corresponding connector.
        """
        # Output side first, then input side (last matching edge wins)
        out_buffer = None
        for edge in state.out_edges(self):
            if edge.src_conn != "_out_buffer":
                continue
            out_buffer = sdfg.arrays[edge.data.data]

        inp_buffer = None
        for edge in state.in_edges(self):
            if edge.dst_conn != "_inp_buffer":
                continue
            inp_buffer = sdfg.arrays[edge.data.data]

        return inp_buffer, out_buffer
class Redistribute(MPINode):
    """
    MPI library node with one input connector (``_inp_buffer``) and one
    output connector (``_out_buffer``), expanded through
    ``ExpandRedistribute``.
    """

    # Global properties
    implementations = {
        "MPI": ExpandRedistribute,
    }
    default_implementation = "MPI"

    # Stored as a plain string; the constructor default is 'tmp'
    redistr = properties.Property(dtype=str, default='tmp')

    def __init__(self, name, redistr='tmp', *args, **kwargs):
        """
        Creates the node with its fixed connector set.

        :param name: Node name, forwarded to the base class.
        :param redistr: Value for the ``redistr`` property.
        """
        super().__init__(name, *args, inputs={"_inp_buffer"}, outputs={"_out_buffer"}, **kwargs)
        self.redistr = redistr

    def validate(self, sdfg, state):
        """
        Looks up the data descriptors connected to this node.

        :param sdfg: The SDFG whose ``arrays`` hold the descriptors.
        :param state: The state containing this node.
        :return: A two-tuple (inbuffer, outbuffer) of data descriptors in the
                 parent SDFG; an entry is None if no edge uses the
                 corresponding connector.
        """
        found = {"_inp_buffer": None, "_out_buffer": None}

        # Outgoing edges are inspected before incoming ones; the last edge
        # on a connector determines the reported descriptor.
        for edge in state.out_edges(self):
            if edge.src_conn == "_out_buffer":
                found["_out_buffer"] = sdfg.arrays[edge.data.data]
        for edge in state.in_edges(self):
            if edge.dst_conn == "_inp_buffer":
                found["_inp_buffer"] = sdfg.arrays[edge.data.data]

        return found["_inp_buffer"], found["_out_buffer"]
class FPGATransformSDFG(transformation.Transformation):
    """
    Whole-SDFG FPGA transformation: nests the SDFG with ``NestSDFG`` and then
    applies ``FPGATransformState`` to state 0 of the result.
    """

    promote_global_trans = properties.Property(
        dtype=bool,
        default=True,
        desc="If True, transient arrays that are fully internal are pulled out so "
        "that they can be allocated on the host.")

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @staticmethod
    def expressions():
        """ Returns a single empty pattern graph, i.e., matches anything. """
        return [nx.DiGraph()]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        """
        The transformation applies only if ``FPGATransformState`` can be
        applied to every state of ``sdfg``.

        :param graph: Unused.
        :param candidate: Unused; a per-state candidate is built internally.
        :param expr_index: Forwarded to ``FPGATransformState.can_be_applied``.
        :param sdfg: The SDFG whose states are checked.
        :param permissive: Unused.
        :return: True if every state matches, False otherwise.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState

        return all(
            FPGATransformState.can_be_applied(sdfg, {FPGATransformState._state: state_id}, expr_index, sdfg)
            for state_id, _ in enumerate(sdfg.nodes()))

    @staticmethod
    def match_to_str(graph, candidate):
        """ A match is described by the label of the matched graph. """
        return graph.label

    def apply(self, sdfg):
        """
        Nests ``sdfg`` (forwarding ``promote_global_trans``) and then applies
        ``FPGATransformState`` with state ID 0.

        :param sdfg: The SDFG to transform in place.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState, NestSDFG

        sdfg_id = sdfg.sdfg_id

        nest_xform = NestSDFG(sdfg_id, -1, {}, self.expr_index)
        nest_xform.promote_global_trans = self.promote_global_trans
        nest_xform.apply(sdfg)

        state_xform = FPGATransformState(sdfg_id, -1, {FPGATransformState._state: 0}, self.expr_index)
        state_xform.apply(sdfg)
class FPGATransformSDFG(transformation.MultiStateTransformation):
    """
    Whole-SDFG FPGA transformation: nests the SDFG with ``NestSDFG`` and then
    applies ``FPGATransformState`` to the single state of the result.
    """

    promote_global_trans = properties.Property(
        dtype=bool,
        default=True,
        desc="If True, transient arrays that are fully internal are pulled out so "
        "that they can be allocated on the host.")

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @classmethod
    def expressions(cls):
        """ Returns a single empty pattern graph, i.e., matches anything. """
        return [nx.DiGraph()]

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        """
        The transformation applies only if ``FPGATransformState`` can be
        applied to every state of ``sdfg``.

        :param graph: Object whose ``sdfg_id`` is passed to ``setup_match``.
        :param expr_index: Forwarded to ``FPGATransformState.can_be_applied``.
        :param sdfg: The SDFG whose states are checked.
        :param permissive: Unused.
        :return: True if every state matches, False otherwise.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState

        def _state_matches(state_id):
            # A fresh transformation instance is set up for each state
            per_state = FPGATransformState()
            per_state.setup_match(sdfg, graph.sdfg_id, -1, {FPGATransformState.state: state_id}, 0)
            return per_state.can_be_applied(sdfg, expr_index, sdfg)

        return all(_state_matches(state_id) for state_id, _ in enumerate(sdfg.nodes()))

    def apply(self, _, sdfg):
        """
        Nests ``sdfg`` (forwarding ``promote_global_trans``) and then applies
        ``FPGATransformState`` with state ID 0.

        :param _: Unused.
        :param sdfg: The SDFG to transform in place.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState, NestSDFG

        sdfg_id = sdfg.sdfg_id

        nest_xform = NestSDFG()
        nest_xform.setup_match(sdfg, sdfg_id, -1, {}, self.expr_index)
        nest_xform.promote_global_trans = self.promote_global_trans
        nest_xform.apply(sdfg, sdfg)

        # After NestSDFG the new SDFG has only one state, hence state ID zero
        state_xform = FPGATransformState()
        state_xform.setup_match(sdfg, sdfg_id, -1, {FPGATransformState.state: 0}, self.expr_index)
        state_xform.apply(sdfg, sdfg)
class Gemm(dace.sdfg.nodes.LibraryNode):
    """Executes alpha * (A @ B) + beta * C. C should be unidirectionally
       broadcastable (ONNX terminology) to A @ B.
    """

    # Global properties
    implementations = {
        "pure": ExpandGemmPure,
        "MKL": ExpandGemmMKL,
        "OpenBLAS": ExpandGemmOpenBLAS,
        "cuBLAS": ExpandGemmCuBLAS,
        "PBLAS": ExpandGemmPBLAS,
        "FPGA1DSystolic": ExpandGemmFPGA1DSystolic
    }
    default_implementation = None

    # Object fields
    transA = properties.Property(dtype=bool, desc="Whether to transpose A before multiplying")
    transB = properties.Property(dtype=bool, desc="Whether to transpose B before multiplying")
    alpha = properties.Property(allow_none=False,
                                default=1,
                                desc="A scalar which will be multiplied with A @ B before adding C")
    beta = properties.Property(allow_none=False,
                               default=0,
                               desc="A scalar which will be multiplied with C before adding C")

    def __init__(self, name, location=None, transA=False, transB=False, alpha=1, beta=0):
        """
        Creates the node. The ``_cin`` input connector only exists when
        ``beta != 0``; the output connector is always ``_c``.

        :param name: Node name, forwarded to the base class.
        :param location: Forwarded to the base class.
        :param transA: Whether to transpose A before multiplying.
        :param transB: Whether to transpose B before multiplying.
        :param alpha: Scalar factor applied to A @ B.
        :param beta: Scalar factor applied to C.
        """
        super().__init__(name,
                         location=location,
                         inputs=({"_a", "_b", "_cin"} if beta != 0 else {"_a", "_b"}),
                         outputs={"_c"})
        self.transA = transA
        self.transB = transB
        self.alpha = alpha
        self.beta = beta

    def validate(self, sdfg, state):
        """
        Checks the number of edges and that the squeezed memlet subsets of
        A, B and the output describe a consistent matrix-matrix product.

        :param sdfg: The parent SDFG (unused).
        :param state: The state containing this node.
        :raises ValueError: If the edge counts are wrong, an ``_a``/``_b``
                            input is missing, an operand is not
                            two-dimensional, or the dimensions disagree.
        """
        in_edges = state.in_edges(self)
        if len(in_edges) not in [2, 3]:
            raise ValueError("Expected 2 or 3 inputs to gemm")

        # Squeezed sizes of the memlets, keyed by destination connector.
        # NOTE(review): ``__init__`` names the optional C input ``_cin``, so
        # for nodes whose connectors come from the constructor the ``_c``
        # entry is never filled and the C-vs-output check below is inactive.
        # Confirm the intended connector before enabling it: the class
        # docstring allows C to be broadcastable, which a strict equality
        # check would reject.
        sizes = {}
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn in ('_a', '_b', '_c'):
                subset = dc(memlet.subset)
                subset.squeeze()
                sizes[dst_conn] = subset.size()

        # Report a missing operand as a validation error instead of failing
        # later with an unbound local variable.
        missing = [conn for conn in ('_a', '_b') if conn not in sizes]
        if missing:
            raise ValueError("Missing input connector(s) to gemm: " + ", ".join(missing))
        size0, size1, size2 = sizes['_a'], sizes['_b'], sizes.get('_c')

        if self.transA:
            size0 = list(reversed(size0))
        if self.transB:
            size1 = list(reversed(size1))

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError("Expected exactly one output from matrix-matrix product")
        out_memlet = out_edges[0].data

        # Function is symmetric, edge order does not matter
        if len(size0) != 2 or len(size1) != 2:
            raise ValueError("matrix-matrix product only supported on matrices")
        if size0[1] != size1[0]:
            raise ValueError("Inputs to matrix-matrix product "
                             "must agree in the k-dimension")

        out_subset = dc(out_memlet.subset)
        out_subset.squeeze()
        size3 = out_subset.size()
        if size2 is not None and size2 != size3:
            raise ValueError("Input C matrix must match output matrix.")
        if len(size3) != 2:
            raise ValueError("matrix-matrix product only supported on matrices")
        # size3 is known to be two-dimensional at this point
        if list(size3) != [size0[-2], size1[-1]]:
            raise ValueError("Output to matrix-matrix product must agree in the m and n "
                             "dimensions")
class StreamingComposition(xf.Transformation):
    """
    Converts two connected computations (nodes, map scopes) into two separate
    processing elements, with a stream connecting the results. Only applies
    if the memory access patterns of the two computations match.
    """
    # Pattern: first -> access -> second
    first = xf.PatternNode(nodes.Node)
    access = xf.PatternNode(nodes.AccessNode)
    second = xf.PatternNode(nodes.Node)

    buffer_size = properties.Property(dtype=int, default=1, desc='Set buffer size for the newly-created stream')

    storage = properties.EnumProperty(dtype=dtypes.StorageType,
                                      desc='Set storage type for the newly-created stream',
                                      default=dtypes.StorageType.Default)

    @staticmethod
    def expressions() -> List[gr.SubgraphView]:
        """ Returns the single pattern ``first -> access -> second``. """
        return [
            sdutil.node_path_graph(StreamingComposition.first, StreamingComposition.access,
                                   StreamingComposition.second)
        ]

    @staticmethod
    def can_be_applied(graph: SDFGState,
                       candidate: Dict[xf.PatternNode, int],
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Checks whether the matched access node can be replaced by a stream.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: Unused (there is a single pattern).
        :param sdfg: The SDFG containing ``graph``.
        :param permissive: Unused.
        :return: True only if every check below passes.
        """
        access = graph.node(candidate[StreamingComposition.access])
        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.in_degree(access) > 1 or graph.out_degree(access) > 1:
            return False

        # If already a stream, skip
        if isinstance(sdfg.arrays[access.data], data.Stream):
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            if curstate.parent.parent_nsdfg_node is None:
                break
            # Continue the check from the nested SDFG node in the parent state
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Array must not be used anywhere else in the state
        if any(n is not access and n.data == access.data for n in graph.data_nodes()):
            return False

        # Only one memlet path on each direction is allowed
        # TODO: Relax so that repeated application of
        # transformation would yield additional streams
        first_edge = graph.in_edges(access)[0]
        second_edge = graph.out_edges(access)[0]
        first_mpath = graph.memlet_path(first_edge)
        second_mpath = graph.memlet_path(second_edge)
        # A memlet tree with more edges than the path means the path branches
        if len(first_mpath) != len(list(graph.memlet_tree(first_edge))):
            return False
        if len(second_mpath) != len(list(graph.memlet_tree(second_edge))):
            return False

        # The innermost ends of the paths must have a clearly defined memory
        # access pattern and no WCR
        first_iedge = first_mpath[0]
        second_iedge = second_mpath[-1]
        if first_iedge.data.subset.num_elements() != 1:
            return False
        if first_iedge.data.volume != 1:
            return False
        if first_iedge.data.wcr is not None:
            return False
        if second_iedge.data.subset.num_elements() != 1:
            return False
        if second_iedge.data.volume != 1:
            return False

        ##################################################################
        # The memory access pattern must be exactly the same

        # Collect all maps and ranges
        ranges_first = _collect_map_ranges(graph, first_mpath)
        ranges_second = _collect_map_ranges(graph, second_mpath)

        # Check map ranges
        # NOTE(review): zip() stops at the shorter list, so a difference in
        # the number of collected ranges is not detected by this loop alone;
        # presumably _do_memlets_correspond covers that case - confirm.
        for (_, frng), (_, srng) in zip(ranges_first, ranges_second):
            if frng != srng:
                return False

        # Check memlets for equivalence
        if len(first_iedge.data.subset) != len(second_iedge.data.subset):
            return False
        if not _do_memlets_correspond(first_iedge.data, second_iedge.data, ranges_first, ranges_second):
            return False

        return True

    def apply(self, sdfg: SDFG) -> nodes.AccessNode:
        """
        Replaces the matched access node by a new transient stream with a
        separate write node and read node.

        :param sdfg: The SDFG to modify in place.
        :return: The pair ``(wnode, rnode)`` of new stream access nodes.
                 NOTE(review): the return annotation names a single
                 ``AccessNode``, but a two-tuple is returned.
        """
        state = sdfg.node(self.state_id)
        access: nodes.AccessNode = self.access(sdfg)

        # Get memlet paths
        first_edge = state.in_edges(access)[0]
        second_edge = state.out_edges(access)[0]
        first_mpath = state.memlet_path(first_edge)
        second_mpath = state.memlet_path(second_edge)

        # Create new stream of shape 1
        desc = sdfg.arrays[access.data]
        name, newdesc = sdfg.add_stream(access.data,
                                        desc.dtype,
                                        buffer_size=self.buffer_size,
                                        storage=self.storage,
                                        transient=True,
                                        find_new_name=True)

        # Remove transient array if possible
        # (the for/else deletes the descriptor only if no other state has a
        # data node referring to it)
        for ostate in sdfg.nodes():
            if ostate is state:
                continue
            if any(n.data == access.data for n in ostate.data_nodes()):
                break
        else:
            del sdfg.arrays[access.data]

        # Replace memlets in path with stream access
        for e in first_mpath:
            e.data = mm.Memlet(data=name, subset='0')
            # Edges touching a nested SDFG become dynamic, and the nested
            # connector is handed to _streamify_recursive with the new stream
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)
        for e in second_mpath:
            e.data = mm.Memlet(data=name, subset='0')
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace array access node with two stream access nodes
        wnode = state.add_write(name)
        rnode = state.add_read(name)
        state.remove_edge(first_edge)
        state.add_edge(first_edge.src, first_edge.src_conn, wnode, first_edge.dst_conn, first_edge.data)
        state.remove_edge(second_edge)
        state.add_edge(rnode, second_edge.src_conn, second_edge.dst, second_edge.dst_conn, second_edge.data)

        # Remove original access node
        state.remove_node(access)

        return wnode, rnode
class StreamingMemory(xf.Transformation):
    """
    Converts a read or a write to streaming memory access, where data is
    read/written to/from a stream in a separate connected component than the
    computation.
    """
    access = xf.PatternNode(nodes.AccessNode)
    entry = xf.PatternNode(nodes.EntryNode)
    exit = xf.PatternNode(nodes.ExitNode)

    buffer_size = properties.Property(dtype=int, default=1, desc='Set buffer size for the newly-created stream')

    storage = properties.EnumProperty(dtype=dtypes.StorageType,
                                      desc='Set storage type for the newly-created stream',
                                      default=dtypes.StorageType.Default)

    @staticmethod
    def expressions() -> List[gr.SubgraphView]:
        """
        Returns two patterns: index 0 is ``access -> entry`` (a read) and
        index 1 is ``exit -> access`` (a write).
        """
        return [
            sdutil.node_path_graph(StreamingMemory.access, StreamingMemory.entry),
            sdutil.node_path_graph(StreamingMemory.exit, StreamingMemory.access),
        ]

    @staticmethod
    def can_be_applied(graph: SDFGState,
                       candidate: Dict[xf.PatternNode, int],
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Checks whether the matched access node can be converted to streaming
        access.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: 0 for the read pattern, otherwise the write pattern.
        :param sdfg: The SDFG containing ``graph``.
        :param permissive: Unused.
        :return: True only if every check below passes.
        """
        access = graph.node(candidate[StreamingMemory.access])
        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.out_degree(access) > 0 and graph.in_degree(access) > 0:
            return False

        # If already a stream, skip
        if isinstance(sdfg.arrays[access.data], data.Stream):
            return False
        # If does not exist on off-chip memory, skip
        if sdfg.arrays[access.data].storage not in [
                dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_Pinned, dtypes.StorageType.GPU_Global,
                dtypes.StorageType.FPGA_Global
        ]:
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            if curstate.parent.parent_nsdfg_node is None:
                break
            # Continue the check from the nested SDFG node in the parent state
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Only one memlet path is allowed per outgoing/incoming edge
        edges = (graph.out_edges(access) if expr_index == 0 else graph.in_edges(access))
        for edge in edges:
            mpath = graph.memlet_path(edge)
            # A memlet tree with more edges than the path means the path branches
            if len(mpath) != len(list(graph.memlet_tree(edge))):
                return False

            # The innermost end of the path must have a clearly defined memory
            # access pattern
            innermost_edge = mpath[-1] if expr_index == 0 else mpath[0]
            if (innermost_edge.data.subset.num_elements() != 1 or innermost_edge.data.dynamic
                    or innermost_edge.data.volume != 1):
                return False

            # Check if any of the maps has a dynamic range
            # These cases can potentially work but some nodes (and perhaps
            # tasklets) need to be replicated, which are difficult to track.
            for pe in mpath:
                node = pe.dst if expr_index == 0 else graph.entry_node(pe.src)
                if isinstance(node, nodes.MapEntry) and sdutil.has_dynamic_map_inputs(graph, node):
                    return False

        # If already applied on this memlet and this is the I/O component, skip
        # (apply() names the maps it creates with a '__s' prefix)
        if expr_index == 0:
            other_node = graph.node(candidate[StreamingMemory.entry])
        else:
            other_node = graph.node(candidate[StreamingMemory.exit])
            other_node = graph.entry_node(other_node)
        if other_node.label.startswith('__s'):
            return False

        return True

    def apply(self, sdfg: SDFG) -> nodes.AccessNode:
        """
        Creates one transient stream per edge of the matched access node,
        rewires the computation to those streams, and adds separate map/tasklet
        components that copy between the original data and the streams.

        :param sdfg: The SDFG to modify in place.
        :return: The list of stream access nodes created inside the new
                 read/write components.
                 NOTE(review): the return annotation names a single
                 ``AccessNode``, but a list is returned.
        """
        state = sdfg.node(self.state_id)
        dnode: nodes.AccessNode = self.access(sdfg)
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range], List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern
        components_to_create: Dict[Tuple[symbolic.SymbolicType],
                                   List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                # (deep-copied because the path memlets are overwritten below)
                mpath = state.memlet_path(edge)
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index == 0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        # Each component is a list of (innermost edge copy, outermost edge) pairs
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    # An entry reachable from another entry of the same
                    # component is moved into a component of its own
                    if any(nx.has_path(state.nx, o[1].dst, cedge[1].dst) for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [c for c in component if c not in edges_to_remove]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:
            name, newdesc = sdfg.add_stream(dnode.data,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)
            streams[edge] = name
            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            for e in mpath:
                e.data = mm.Memlet(data=name, subset='0', other_subset=e.data.other_subset)
                # Edges touching a nested SDFG become dynamic, and the nested
                # connector is handed to _streamify_recursive with the new stream
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst, edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement, edge.dst_conn, edge.data)

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append((ionode, '__out%d' % i, mm.Memlet(data=name, subset='0')))
                # The single input value is copied to every output stream
                code = '\n'.join('__out%d = __inp' % i for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(f'More than one input found for the same index for {dnode.data}')
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append((ionode, '__inp%d' % i, mm.Memlet(data=name, subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            # (one new map per entry on the path, copying its parameters,
            # ranges and schedule; the '__s' label prefix is what
            # can_be_applied tests for)
            maps = []
            for entry in path:
                map: nodes.Map = entry.map
                maps.append(
                    state.add_map(f'__s{opname}_{mapname}', [(p, r) for p, r in zip(map.params, map.range)],
                                  map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node, *(me for me, _ in maps), tasklet, dst_conn=cname, memlet=memlet)
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet, *(mx for _, mx in reversed(maps)), node, src_conn=cname, memlet=memlet)

        return ionodes
class CopyToDevice(pattern_matching.Transformation):
    """ Implements the copy-to-device transformation, which copies a nested
        SDFG and its dependencies to a given device.

        The transformation changes all data storage types of a nested SDFG to
        the given `storage` property, and creates new arrays and copies around
        the nested SDFG to that storage.
    """

    _nested_sdfg = nodes.NestedSDFG("", graph.OrderedDiGraph(), {}, {})

    storage = properties.Property(dtype=dtypes.StorageType,
                                  desc="Nested SDFG storage",
                                  choices=dtypes.StorageType,
                                  from_string=lambda x: dtypes.StorageType[x],
                                  default=dtypes.StorageType.Default)

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @staticmethod
    def expressions():
        """ Returns the single pattern consisting of one nested SDFG node. """
        return [sdutil.node_path_graph(CopyToDevice._nested_sdfg)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """
        Rejects nested SDFGs that are connected to streams or that have
        write-conflict-resolution memlets covering more than one element.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: Unused.
        :param sdfg: The SDFG containing ``graph``.
        :param strict: Unused.
        :return: True if the transformation can be applied.
        """
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]

        for edge in graph.all_edges(nested_sdfg):
            # Stream inputs/outputs not allowed
            path = graph.memlet_path(edge)
            if ((isinstance(path[0].src, nodes.AccessNode) and isinstance(sdfg.arrays[path[0].src.data], data.Stream))
                    or (isinstance(path[-1].dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[path[-1].dst.data], data.Stream))):
                return False
            # WCR outputs with arrays are not allowed
            if (edge.data.wcr is not None and edge.data.subset.num_elements() != 1):
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ A match is described by the label of the matched nested SDFG. """
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]
        return nested_sdfg.label

    @staticmethod
    def _device_container(sdfg, created, base_name, memdata, memlet, storage):
        """
        Returns the name of the device-side transient for ``base_name``,
        creating it on first use.

        ``created`` maps each requested base name to the name that was
        actually registered (``find_new_name=True`` may rename it), so that
        several edges on the same data share one container.
        """
        if base_name in created:
            return created[base_name]

        if isinstance(memdata, data.Array):
            name, _ = sdfg.add_array(base_name,
                                     shape=[symbolic.overapproximate(r) for r in memlet.bounding_box_size()],
                                     dtype=memdata.dtype,
                                     transient=True,
                                     storage=storage,
                                     find_new_name=True)
        elif isinstance(memdata, data.Scalar):
            name, _ = sdfg.add_scalar(base_name,
                                      dtype=memdata.dtype,
                                      transient=True,
                                      storage=storage,
                                      find_new_name=True)
        else:
            raise NotImplementedError

        created[base_name] = name
        return name

    @staticmethod
    def _rebase_subset(memlet, target):
        """
        Shifts every dimension of ``target.subset`` by the start of the
        corresponding dimension of ``memlet.subset``, so that the subset on
        the device container starts at its origin.
        """
        for ind, r in enumerate(memlet.subset):
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                target.subset[ind] = (begin, end, step)
            else:
                target.subset[ind] -= r[0]

    def apply(self, sdfg):
        """
        Inserts a device-side transient (and the copies to and from it) on
        every non-empty edge of the nested SDFG, then changes the storage of
        all data inside the nested SDFG to ``self.storage``.

        :param sdfg: The SDFG to modify in place.
        :raises NotImplementedError: If an edge refers to data that is
                                     neither an array nor a scalar.
        """
        state = sdfg.nodes()[self.state_id]
        nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]]
        storage = self.storage
        # Base name -> registered name of the containers created so far
        created = {}

        for edge in state.in_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                continue
            name = self._device_container(sdfg, created, 'device_' + dataname + '_in', sdfg.arrays[dataname],
                                          memlet, storage)
            data_node = nodes.AccessNode(name)

            # Host -> device copy keeps the original memlet; the memlet into
            # the nested SDFG refers to the device container instead.
            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            from_data_mm.data = name
            self._rebase_subset(memlet, from_data_mm)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        for edge in state.out_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                continue
            name = self._device_container(sdfg, created, 'device_' + dataname + '_out', sdfg.arrays[dataname],
                                          memlet, storage)
            data_node = nodes.AccessNode(name)

            # The memlet out of the nested SDFG refers to the device
            # container; the device -> host copy keeps the original memlet.
            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            to_data_mm.data = name
            self._rebase_subset(memlet, to_data_mm)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        # Change storage for all data inside nested SDFG to device.
        change_storage(nested_sdfg.sdfg, storage)
class BatchedMatMul(dace.sdfg.nodes.LibraryNode):
    """
    Library node for a batched matrix-matrix product with inputs ``_a`` and
    ``_b`` and output ``_c``.
    """

    # Global properties
    implementations = {
        "pure": ExpandBatchedMatMulPure,
        "MKL": ExpandBatchedMatMulMKL,
        "OpenBLAS": ExpandBatchedMatMulOpenBLAS,
        "cuBLAS": ExpandBatchedMatMulCuBLAS
    }
    transA = properties.Property(dtype=bool, desc="Whether to transpose A before multiplying")
    transB = properties.Property(dtype=bool, desc="Whether to transpose B before multiplying")
    alpha = properties.Property(allow_none=False,
                                default=1,
                                desc="A scalar which will be multiplied with A @ B before adding C")
    beta = properties.Property(allow_none=False,
                               default=0,
                               desc="A scalar which will be multiplied with C before adding C")
    algorithm = properties.Property(dtype=str,
                                    allow_none=True,
                                    default=None,
                                    desc="If applicable, chooses the vendor-provided implementation "
                                    "(algorithm) for the multiplication")
    accumulator_type = properties.TypeClassProperty(
        default=None,
        choices=dtypes.Typeclasses,
        allow_none=True,
        desc="Accumulator or intermediate storage type used in multiplication")
    compute_type = properties.Property(default=None,
                                       dtype=str,
                                       allow_none=True,
                                       desc="If applicable, overrides computation type (CUBLAS-specific, see "
                                       "``cublasComputeType_t``)")
    default_implementation = None

    def __init__(self, name, location=None):
        """
        Creates the node with input connectors ``_a``/``_b`` and output
        connector ``_c``.

        :param name: Node name, forwarded to the base class.
        :param location: Forwarded to the base class.
        """
        super().__init__(name, location=location, inputs={'_a', '_b'}, outputs={'_c'})

    def validate(self, sdfg, state):
        """
        Checks the number of edges and the ranks and inner dimensions of the
        squeezed memlet subsets.

        :param sdfg: The parent SDFG (unused).
        :param state: The state containing this node.
        :raises ValueError: If the edge counts are wrong, an ``_a``/``_b``
                            input is missing, an operand has an unsupported
                            number of dimensions, or the k-dimensions
                            disagree.
        """
        in_edges = state.in_edges(self)
        if len(in_edges) != 2:
            raise ValueError("Expected exactly two inputs to batched matrix-matrix product")

        # Squeezed sizes of the input memlets, keyed by destination connector
        sizes = {}
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn in ('_a', '_b'):
                subset = dc(memlet.subset)
                subset.squeeze()
                sizes[dst_conn] = subset.size()

        # Report a missing operand as a validation error instead of failing
        # later with an unbound local variable.
        missing = [conn for conn in ('_a', '_b') if conn not in sizes]
        if missing:
            raise ValueError("Missing input connector(s) to batched matrix-matrix product: " + ", ".join(missing))
        size0, size1 = sizes['_a'], sizes['_b']

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError("Expected exactly one output from "
                             "batched matrix-matrix product")
        out_memlet = out_edges[0].data

        # Function is symmetric, edge order does not matter
        # (A may have two or three dimensions after squeezing, B must have three)
        if len(size0) not in [2, 3]:
            raise ValueError("Batched matrix-matrix product only supported on matrices")
        if len(size1) != 3:
            raise ValueError("Batched matrix-matrix product only supported on matrices")
        if size0[-1] != size1[-2]:
            raise ValueError("Inputs to matrix-matrix product "
                             "must agree in the k-dimension")

        out_subset = dc(out_memlet.subset)
        out_subset.squeeze()
        size2 = out_subset.size()
        if len(size2) != 3:
            raise ValueError("batched matrix-matrix product only supported on matrices")
class Gemv(dace.sdfg.nodes.LibraryNode):
    """
    Library node for a matrix-vector product with inputs ``_A`` and ``_x``
    (plus ``_y`` when ``beta != 0``) and output ``_y``.
    """

    # Global properties
    implementations = {
        "pure": ExpandGemvPure,
        "OpenBLAS": ExpandGemvOpenBLAS,
        "MKL": ExpandGemvMKL,
        "cuBLAS": ExpandGemvCuBLAS,
        "FPGA_Accumulate": ExpandGemvFpgaAccumulate,
        "FPGA_TilesByColumn": ExpandGemvFpgaTilesByColumn,
        "PBLAS": ExpandGemvPBLAS
    }
    default_implementation = None

    # Object fields
    alpha = properties.SymbolicProperty(allow_none=False, default=1)
    beta = properties.SymbolicProperty(allow_none=False, default=0)

    transA = properties.Property(dtype=bool, desc="Whether to transpose A before multiplying")

    n = properties.SymbolicProperty(allow_none=True, default=None)
    m = properties.SymbolicProperty(allow_none=True, default=None)

    def __init__(self, name, location=None, transA=False, alpha=1, beta=0):
        """
        Creates the node. The ``_y`` input connector only exists when
        ``beta != 0``; the output connector is always ``_y``.

        :param name: Node name, forwarded to the base class.
        :param location: Forwarded to the base class.
        :param transA: Whether to transpose A before multiplying.
        :param alpha: Value for the ``alpha`` property.
        :param beta: Value for the ``beta`` property.
        """
        super().__init__(name,
                         location=location,
                         inputs={"_A", "_x", "_y"} if beta != 0 else {"_A", "_x"},
                         outputs={"_y"})
        self.transA = transA
        self.alpha = alpha
        self.beta = beta

    def validate(self, sdfg, state):
        """
        Checks the number of edges and that the squeezed memlet subsets of
        A, x and y describe a consistent matrix-vector product.

        :param sdfg: The parent SDFG (unused).
        :param state: The state containing this node.
        :raises ValueError: If the edge counts are wrong, an ``_A``/``_x``
                            input is missing, the operands are not a matrix
                            and a vector, or the dimensions disagree.
        """
        in_edges = state.in_edges(self)
        if len(in_edges) not in [2, 3]:
            raise ValueError("Expected 2 or 3 inputs to GEMV")

        # Squeezed sizes of the input memlets, keyed by destination connector
        sizes = {}
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn in ("_A", "_x", "_y"):
                subset = copy.deepcopy(memlet.subset)
                subset.squeeze()
                sizes[dst_conn] = subset.size()

        # Report a missing operand as a validation error instead of failing
        # later with an unbound local variable.
        missing = [conn for conn in ("_A", "_x") if conn not in sizes]
        if missing:
            raise ValueError("Missing input connector(s) to GEMV: " + ", ".join(missing))
        size_a, size_x, size_y_in = sizes["_A"], sizes["_x"], sizes.get("_y")

        if len(size_a) != 2 or len(size_x) != 1:
            raise ValueError("Matrix-vector product only supported on matrix-vector input")

        # With transA set, the roles of the two matrix dimensions are swapped
        a_cols = size_a[1] if not self.transA else size_a[0]
        a_rows = size_a[0] if not self.transA else size_a[1]

        if a_cols != size_x[0]:
            raise ValueError(f"Columns of A ({a_cols}) don't match "
                             f"size of x ({size_x[0]}).")

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError("Expected exactly one output from matrix-vector product")
        out_memlet = out_edges[0].data

        out_subset = copy.deepcopy(out_memlet.subset)
        out_subset.squeeze()
        size_y_out = out_subset.size()
        if size_y_in is not None and size_y_in != size_y_out:
            raise ValueError("Input y-vector must match output y-vector.")
        if (len(size_y_out) != 1 or size_y_out[0] != a_rows):
            raise ValueError("Vector input to GEMV must match matrix rows.")
class PruneConnectors(pm.SingleStateTransformation, pm.SimplifyPass):
    """
    Removes unused connectors from nested SDFGs, as well as their memlets in
    the outer scope, replacing them with empty memlets if necessary.

    Optionally: after pruning, removes the unused containers from parent SDFG.
    """

    nsdfg = pm.PatternNode(nodes.NestedSDFG)

    remove_unused_containers = properties.Property(dtype=bool,
                                                   default=False,
                                                   desc='If True, remove unused containers from parent SDFG.')

    @classmethod
    def expressions(cls):
        """ Returns the single pattern consisting of one nested SDFG node. """
        return [utils.node_path_graph(cls.nsdfg)]

    def _get_prune_sets(self, state):
        """
        Computes the connectors that are candidates for pruning. Shared by
        ``can_be_applied`` and ``apply`` so that both agree on the result.

        :param state: The state containing the nested SDFG node.
        :return: A three-tuple (prune_in, prune_out, all_data_used), where
                 the first two are sets of connector names and the last is
                 the union of the nested SDFG's read and write sets.
        """
        nsdfg = self.nsdfg

        read_set, write_set = nsdfg.sdfg.read_and_write_sets()
        prune_in = nsdfg.in_connectors.keys() - read_set
        prune_out = nsdfg.out_connectors.keys() - write_set

        # Take into account symbol mappings
        strs = tuple(nsdfg.symbol_mapping.values())
        syms = tuple(symbolic.pystr_to_symbolic(s) for s in strs)
        symnames = tuple(s.name if hasattr(s, 'name') else '' for s in syms)
        for conn in list(prune_in):
            if conn in syms or conn in symnames or conn in nsdfg.sdfg.symbols:
                prune_in.remove(conn)

        # Add WCR outputs to "do not prune" input list
        for e in state.out_edges(nsdfg):
            if e.data.wcr is not None and e.src_conn in prune_in:
                if (state.in_degree(next(iter(state.in_edges_by_connector(nsdfg, e.src_conn))).src) > 0):
                    prune_in.remove(e.src_conn)

        return prune_in, prune_out, read_set | write_set

    def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissive: bool = False) -> bool:
        """
        Checks whether there is at least one connector to prune.

        :param graph: The state containing the nested SDFG node.
        :param expr_index: Unused (there is a single pattern).
        :param sdfg: The SDFG containing ``graph`` (unused).
        :param permissive: Unused.
        :return: False if every candidate input connector is fed by a node
                 that itself has incoming edges and every candidate output
                 connector ends in a node that itself has outgoing edges
                 (which includes the case of no candidates); otherwise True
                 if any candidate exists.
        """
        nsdfg = self.nsdfg
        prune_in, prune_out, _ = self._get_prune_sets(graph)

        has_before = all(
            graph.in_degree(graph.memlet_path(e)[0].src) > 0 for e in graph.in_edges(nsdfg)
            if e.dst_conn in prune_in)
        has_after = all(
            graph.out_degree(graph.memlet_path(e)[-1].dst) > 0 for e in graph.out_edges(nsdfg)
            if e.src_conn in prune_out)
        if has_before and has_after:
            return False
        if len(prune_in) > 0 or len(prune_out) > 0:
            return True

        return False

    def apply(self, state: SDFGState, sdfg: SDFG):
        """
        Removes the memlet paths of prunable connectors, purges the matching
        data from the nested SDFG, and optionally removes containers from the
        parent SDFGs.

        :param state: The state containing the nested SDFG node.
        :param sdfg: The SDFG containing ``state``.
        """
        nsdfg = self.nsdfg

        # Same candidate sets as in can_be_applied (including the symbol
        # mapping exclusion, so connectors that feed symbols are kept).
        # all_data_used detects which nodes are used, so we can delete unused
        # nodes after the connectors have been pruned.
        prune_in, prune_out, all_data_used = self._get_prune_sets(state)

        do_not_prune = set()
        for conn in prune_in:
            # Keep connectors whose source node has incoming edges of its own
            if any(
                    state.in_degree(state.memlet_path(e)[0].src) > 0 for e in state.in_edges(nsdfg)
                    if e.dst_conn == conn):
                do_not_prune.add(conn)
                continue
            for e in state.in_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)

        for conn in prune_out:
            # Keep connectors whose destination node has outgoing edges of its own
            if any(
                    state.out_degree(state.memlet_path(e)[-1].dst) > 0 for e in state.out_edges(nsdfg)
                    if e.src_conn == conn):
                do_not_prune.add(conn)
                continue
            for e in state.out_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)

        for conn in prune_in:
            if conn in nsdfg.sdfg.arrays and conn not in all_data_used and conn not in do_not_prune:
                # If the data is now unused, we can purge it from the SDFG
                nsdfg.sdfg.remove_data(conn)

        for conn in prune_out:
            if conn in nsdfg.sdfg.arrays and conn not in all_data_used and conn not in do_not_prune:
                # If the data is now unused, we can purge it from the SDFG
                nsdfg.sdfg.remove_data(conn)

        if self.remove_unused_containers:
            # Remove unused containers from parent SDFGs
            containers = list(sdfg.arrays.keys())
            for name in containers:
                s = nsdfg.sdfg
                # Walk up the parent chain; stop at the first parent that
                # refuses to remove the container (ValueError)
                while s.parent_sdfg:
                    s = s.parent_sdfg
                    try:
                        s.remove_data(name)
                    except ValueError:
                        break
class CopyToDevice(pattern_matching.Transformation):
    """
    Implements the copy-to-device transformation, which copies a nested SDFG
    and its dependencies to a given device.

    The transformation changes all data storage types of a nested SDFG to the
    given `storage` property, and creates new arrays and copies around the
    nested SDFG to that storage.
    """

    _nested_sdfg = nodes.NestedSDFG("", graph.OrderedDiGraph(), set(), set())

    storage = properties.Property(
        dtype=dtypes.StorageType,
        desc="Nested SDFG storage",
        choices=dtypes.StorageType,
        from_string=lambda x: dtypes.StorageType[x],
        default=dtypes.StorageType.Default)

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        return [nxutil.node_path_graph(CopyToDevice._nested_sdfg)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]
        return nested_sdfg.label

    @staticmethod
    def _insert_device_copy(sdfg, state, edge, storage, incoming):
        """
        Replaces ``edge`` with two edges that pass through a new transient
        container in ``storage``.

        :param sdfg: The SDFG in which the new container is registered.
        :param state: The state that contains ``edge``.
        :param edge: An edge entering or leaving the nested SDFG node.
        :param storage: Storage type of the new transient container.
        :param incoming: True if ``edge`` enters the nested SDFG (container
                         suffix ``_in``), False if it leaves it (``_out``).
        :raises NotImplementedError: If the data on the edge is neither an
                                     array nor a scalar.
        """
        src, src_conn, dst, dst_conn, memlet = edge
        dataname = memlet.data
        memdata = sdfg.arrays[dataname]
        devname = 'device_' + dataname + ('_in' if incoming else '_out')

        # Register the device-side container in the SDFG. Both directions
        # must go through the SDFG API; otherwise the access node created
        # below would refer to a container that does not exist.
        if isinstance(memdata, data.Array):
            sdfg.add_array(
                devname,
                memdata.dtype, [
                    symbolic.overapproximate(r)
                    for r in memlet.bounding_box_size()
                ],
                transient=True,
                storage=storage)
        elif isinstance(memdata, data.Scalar):
            sdfg.add_scalar(
                devname, memdata.dtype, transient=True, storage=storage)
        else:
            raise NotImplementedError

        data_node = nodes.AccessNode(devname)

        to_data_mm = dcpy(memlet)
        from_data_mm = dcpy(memlet)

        # The memlet adjacent to the nested SDFG refers to the new container,
        # with its subset shifted to start at the origin of that container.
        device_mm = from_data_mm if incoming else to_data_mm
        device_mm.data = devname
        for ind, r in enumerate(memlet.subset):
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                device_mm.subset[ind] = (begin, end, step)
            else:
                device_mm.subset[ind] -= r[0]

        state.remove_edge(edge)
        state.add_edge(src, src_conn, data_node, None, to_data_mm)
        state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

    def apply(self, sdfg):
        """
        Routes every edge entering or leaving the matched nested SDFG through
        a new transient container in ``self.storage``, then changes the
        storage of the data inside the nested SDFG.

        :param sdfg: The SDFG that contains the matched nested SDFG node.
        """
        state = sdfg.nodes()[self.state_id]
        nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]]
        storage = self.storage

        for edge in state.in_edges(nested_sdfg):
            self._insert_device_copy(sdfg, state, edge, storage, True)

        for edge in state.out_edges(nested_sdfg):
            self._insert_device_copy(sdfg, state, edge, storage, False)

        # Change storage for all data inside nested SDFG to device.
        change_storage(nested_sdfg.sdfg, storage)
class BankSplit(transformation.SingleStateTransformation):
    """
    A transformation that allows splitting an array and distributing it on
    another array with one dimension more, or vice versa. Works with arbitrary
    arrays, but its intended use case is to distribute data on many HBM-banks.
    Matches any 2 AccessNodes connected by an edge, if the dimensionality of
    the two accessed arrays differs by exactly one. The sizes of the arrays
    have to be large enough with respect to the split executed, but this is
    not verified. While it is allowed to use symbolics for the shapes of the
    array, it is expected that each dimension is divisible by the number of
    splits specified.

    When applying, an unrolled map is generated around the access nodes, which
    copies the parts of the array to the target array.

    Examples:

    Distribute: Suppose for example we copy from A to B, where A has shape
    [100, 100] and B shape [10, 100, 10]. We can distribute A in that case to
    B using the transformation by setting split_array_info=[1, 10]. A will
    then be divided along its second dimension into 10 parts of size
    [100, 10] and distributed on B.

    Gather: Suppose A has shape [4, 50, 50] and B has shape [100, 100]. If one
    sets split_array_info to [2, 2] and applies the transformation, it will
    split equally in all dimensions. Therefore A[0] will be copied to
    B[0:50, 0:50], A[1] to B[0:50, 50:100], A[2] to B[50:100, 0:50] and A[3]
    to B[50:100, 50:100].

    Note that simply reversing the AccessNodes for the arrays in the above
    examples would have led to the inverse operation, i.e. the gather would
    become a distribute and the other way around.
    """

    src_node = transformation.PatternNode(nd.AccessNode)
    dst_node = transformation.PatternNode(nd.AccessNode)

    # dtype=List[int]
    split_array_info = properties.Property(
        dtype=List,
        default=None,
        allow_none=True,
        desc="Describes how many times this array is split in each dimension, "
        "where the k-th number describes how many times dimension k is split. "
        "If the k-th number is 1 this means that the array is not split in "
        "the k-th dimension at all. "
        "If None, then the transform will split the first dimension exactly shape[0] times.")

    default_to_storage = properties.Property(
        dtype=dtypes.StorageType,
        default=dtypes.StorageType.CPU_Heap,
        allow_none=False,
        desc="The storage type of involved arrays will be set to the value of this property if "
        "they have Default storage type. ")

    def _get_split_size(self, virtual_shape: Iterable, split_count: List[int]) -> List[int]:
        """
        :param virtual_shape: Shape of the full (unsplit) array.
        :param split_count: Number of splits per dimension; 1 means the
                            dimension is not split.
        :return: the shape of a part-array on one HBM-bank
        """
        return [
            dim // split_count[d] if split_count[d] != 1 else dim
            for d, dim in enumerate(virtual_shape)
        ]

    def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissive: bool) -> bool:
        """
        Matches two plain arrays (not views) whose number of dimensions
        differs by exactly one, where the first dimension of the array with
        more dimensions is not symbolic.
        """
        src = self.src_node
        dst = self.dst_node
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]

        plain_array = lambda array: isinstance(array, data.Array) and not isinstance(array, data.View)

        if not plain_array(src_array):
            return False
        if not plain_array(dst_array):
            return False

        # same dimensions means HBM-array needs 1 dimension more
        collect_src = len(src_array.shape) - 1 == len(dst_array.shape)
        distribute_dst = len(src_array.shape) + 1 == len(dst_array.shape)
        if collect_src and symbolic.issymbolic(src_array.shape[0], sdfg.constants):
            return False
        elif distribute_dst and symbolic.issymbolic(dst_array.shape[0], sdfg.constants):
            return False
        return collect_src or distribute_dst

    @classmethod
    def expressions(cls):
        return [utils.node_path_graph(cls.src_node, cls.dst_node)]

    def apply(self, graph: SDFGState, sdfg: SDFG) -> Union[Any, None]:
        """
        Replaces the edge between the two access nodes by an unrolled map
        named ``hbm_bank_split`` that surrounds both nodes, and a copy memlet
        indexed by the map parameters.

        :param graph: The state that contains the matched access nodes.
        :param sdfg: The SDFG that contains ``graph``.
        :raises RuntimeError: If ``split_array_info`` does not have one entry
                              per dimension, or if the product of its entries
                              differs from the number of banks.
        """
        # Load/parse infos from the SDFG
        src = self.src_node
        dst = self.dst_node
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]
        # If this is not true we have to distribute to dst
        # (checked in can_be_applied)
        collect_src = len(src_array.shape) - 1 == len(dst_array.shape)
        if collect_src:
            bank_count = int(src_array.shape[0])
            true_size = dst_array.shape
        else:
            bank_count = int(dst_array.shape[0])
            true_size = src_array.shape
        ndim = len(true_size)

        # Move Default storage
        if sdfg.arrays[src.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[src.data].storage = self.default_to_storage
        if sdfg.arrays[dst.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[dst.data].storage = self.default_to_storage

        # Figure out how to split
        if self.split_array_info is None:
            split_info = [1] * ndim
            split_info[0] = bank_count
        else:
            split_info = self.split_array_info
            if len(split_info) != ndim:
                raise RuntimeError("Length of split_array_info must match number of "
                                   "dimensions")
        if functools.reduce(lambda a, b: a * b, split_info) != bank_count:
            raise RuntimeError("Splitting is not possible with the selected splits "
                               "and this number of HBM-banks (required number of banks "
                               "!= actual number of banks)")

        # create the copy-subgraph
        usable_params = [f"i{i}" for i in range(ndim)]
        ndrange = {usable_params[i]: f"0:{split_info[i]}" for i in range(ndim)}
        graph.remove_edge_and_connectors(graph.edges_between(src, dst)[0])
        copy_map_enter, copy_map_exit = graph.add_map("hbm_bank_split", ndrange, dtypes.ScheduleType.Unrolled)
        graph.add_edge(copy_map_enter, None, src, None, memlet.Memlet())
        graph.add_edge(dst, None, copy_map_exit, None, memlet.Memlet())

        target_size = [str(x) for x in self._get_split_size(true_size, split_info)]

        # Linearize the map parameters into a single bank index: every
        # earlier term is multiplied by the split count of each later
        # dimension.
        target_hbm_bank = []
        for i in range(ndim):
            target_hbm_bank.append(usable_params[i])
            for j in range(i):
                target_hbm_bank[j] = f"{split_info[i]}*{target_hbm_bank[j]}"

        target_offset = [f"{usable_params[i]}*{target_size[i]}" for i in range(ndim)]

        target_size_str = ", ".join([f"{x}:{y}" for x, y in zip([0] * ndim, target_size)])
        target_hbm_bank_str = "+ ".join(target_hbm_bank)
        target_offset_str = ", ".join([f"({x}):({x}+{y})" for x, y in zip(target_offset, target_size)])
        if collect_src:
            copy_memlet = memlet.Memlet(f"{src.data}[{target_hbm_bank_str}, {target_size_str}]->"
                                        f"{target_offset_str}")
        else:
            copy_memlet = memlet.Memlet(f"{src.data}[{target_offset_str}]->{target_hbm_bank_str}, "
                                        f"{target_size_str}")
        graph.add_edge(src, None, dst, None, copy_memlet)
class WarpTiling(xf.SingleStateTransformation):
    """
    Implements a GPU specialization tiling that takes a GPU kernel map (with
    nested maps, but without explicit block sizes) and divides its work across
    a warp. Specifically, it tiles its contents by a configurable warp size
    (default: 32), and optionally preferring recomputation (map replication)
    over local storage within the kernel. If write-conflicted reductions
    happen within the given map, the transformation adds warp reductions to
    the tiles.
    """

    warp_size = properties.Property(dtype=int,
                                    default=32,
                                    desc='Hardware warp size')
    replicate_maps = properties.Property(
        dtype=bool,
        default=True,
        desc='Replicate tiled maps that lead to multiple other tiled maps')

    mapentry = xf.PatternNode(nodes.MapEntry)

    @classmethod
    def expressions(cls):
        return [sdutil.node_path_graph(cls.mapentry)]

    def can_be_applied(self, graph: SDFGState, expr_index, sdfg: SDFG,
                       permissive) -> bool:
        """
        Matches a ``GPU_Device`` map that has at least one immediate internal
        scope and no explicit thread-block maps.
        """
        me = self.mapentry

        if len(xfh.get_internal_scopes(graph, me, immediate=True)) == 0:
            return False

        # GPU map that has no predefined thread-block maps
        return (me.schedule == dtypes.ScheduleType.GPU_Device
                and not xfh.gpu_map_has_explicit_threadblocks(graph, me))

    def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry:
        """
        Inserts a ``warp_tile`` thread-block map inside the matched map,
        strides the immediate internal maps by the warp size, optionally
        replicates maps that feed several consumers, and adds warp reductions
        for single-element write-conflict outputs.

        :param graph: The state that contains the matched map.
        :param sdfg: The SDFG that contains ``graph``.
        :return: The entry node of the new ``warp_tile`` map.
        :raises NotImplementedError: If a write-conflict resolution is
                                     detected as a custom reduction.
        """
        me = self.mapentry

        # Add new map within map
        mx = graph.exit_node(me)
        new_me, new_mx = graph.add_map('warp_tile',
                                       dict(__tid=f'0:{self.warp_size}'),
                                       dtypes.ScheduleType.GPU_ThreadBlock)
        __tid = symbolic.pystr_to_symbolic('__tid')
        for e in graph.out_edges(me):
            xfh.reconnect_edge_through_map(graph, e, new_me, True)
        for e in graph.in_edges(mx):
            xfh.reconnect_edge_through_map(graph, e, new_mx, False)

        # Stride and offset all internal maps
        maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True)
        for nstate, nmap in maps_to_stride:
            nsdfg = nstate.parent
            nsdfg_node = nsdfg.parent_nsdfg_node

            # Map cannot be partitioned across a warp.
            # The explicit `== True` is kept on purpose: the comparison may
            # yield a symbolic relational rather than a Python bool, and only
            # a definite True should skip the map.
            if (nmap.range.size()[-1] < self.warp_size) == True:
                continue

            if nsdfg is not sdfg and nsdfg_node is not None:
                nsdfg_node.symbol_mapping['__tid'] = __tid
                if '__tid' not in nsdfg.symbols:
                    nsdfg.add_symbol('__tid', dtypes.int32)
            nmap.range[-1] = (nmap.range[-1][0], nmap.range[-1][1] - __tid,
                              nmap.range[-1][2] * self.warp_size)
            subgraph = nstate.scope_subgraph(nmap)
            subgraph.replace(nmap.params[-1], f'{nmap.params[-1]} + __tid')
            inner_map_exit = nstate.exit_node(nmap)

            # If requested, replicate maps with multiple dependent maps
            if self.replicate_maps:
                destinations = [
                    nstate.memlet_path(edge)[-1].dst
                    for edge in nstate.out_edges(inner_map_exit)
                ]

                for dst in destinations:
                    # Transformation will not replicate map with more than one
                    # output
                    if len(destinations) != 1:
                        break
                    if not isinstance(dst, nodes.AccessNode):
                        continue  # Not leading to access node
                    if not xfh.contained_in(nstate, dst, new_me):
                        continue  # Memlet path goes out of map
                    if not nsdfg.arrays[dst.data].transient:
                        continue  # Cannot modify non-transients
                    for edge in nstate.out_edges(dst)[1:]:
                        rep_subgraph = xfh.replicate_scope(
                            nsdfg, nstate, subgraph)
                        rep_edge = nstate.out_edges(
                            rep_subgraph.sink_nodes()[0])[0]
                        # Add copy of data. `dst` is an access node of
                        # `nstate`, so its descriptor is looked up in the
                        # SDFG that owns that state (`nsdfg`), which is also
                        # where the copy is registered.
                        newdesc = copy.deepcopy(nsdfg.arrays[dst.data])
                        newname = nsdfg.add_datadesc(dst.data,
                                                     newdesc,
                                                     find_new_name=True)
                        newaccess = nstate.add_access(newname)
                        # Redirect edges
                        xfh.redirect_edge(nstate,
                                          rep_edge,
                                          new_dst=newaccess,
                                          new_data=newname)
                        xfh.redirect_edge(nstate,
                                          edge,
                                          new_src=newaccess,
                                          new_data=newname)

            # If has WCR, add warp-collaborative reduction on outputs
            for out_edge in nstate.out_edges(inner_map_exit):
                dst = nstate.memlet_path(out_edge)[-1].dst
                if not xfh.contained_in(nstate, dst, new_me):
                    # Skip edges going out of map
                    continue
                if dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global:
                    # Skip destinations stored in GPU global memory.
                    # NOTE(review): the previous comment here read "Skip
                    # shared memory", which does not match the storage type
                    # being tested - confirm the intended condition.
                    continue
                if out_edge.data.wcr is not None:
                    ctype = nsdfg.arrays[out_edge.data.data].dtype.ctype
                    redtype = detect_reduction_type(out_edge.data.wcr)
                    if redtype == dtypes.ReductionType.Custom:
                        raise NotImplementedError
                    # Strip everything up to and including the first '.' of
                    # the enum's string form to obtain the member name.
                    credtype = ('dace::ReductionType::' +
                                str(redtype)[str(redtype).find('.') + 1:])

                    # One element: tasklet
                    if out_edge.data.subset.num_elements() == 1:
                        # Add local access between thread-local and warp
                        # reduction
                        name = nsdfg._find_new_name(out_edge.data.data)
                        nsdfg.add_scalar(
                            name,
                            nsdfg.arrays[out_edge.data.data].dtype,
                            transient=True)

                        # Initialize thread-local to global value
                        read = nstate.add_read(out_edge.data.data)
                        write = nstate.add_write(name)
                        edge = nstate.add_nedge(read, write,
                                                copy.deepcopy(out_edge.data))
                        edge.data.wcr = None
                        xfh.state_fission(nsdfg,
                                          SubgraphView(nstate, [read, write]))

                        newnode = nstate.add_access(name)
                        nstate.remove_edge(out_edge)
                        edge = nstate.add_edge(out_edge.src, out_edge.src_conn,
                                               newnode, None,
                                               copy.deepcopy(out_edge.data))
                        for e in nstate.memlet_path(edge):
                            e.data.data = name
                            e.data.subset = subsets.Range([(0, 0, 1)])

                        wrt = nstate.add_tasklet(
                            'warpreduce', {'__a'}, {'__out'},
                            f'__out = dace::warpReduce<{credtype}, {ctype}>::reduce(__a);',
                            dtypes.Language.CPP)
                        nstate.add_edge(newnode, None, wrt, '__a',
                                        Memlet(name))
                        out_edge.data.wcr = None
                        nstate.add_edge(wrt, '__out', out_edge.dst, None,
                                        out_edge.data)
                    else:  # More than one element: mapped tasklet
                        # Could be a parallel summation
                        # TODO(later): Check if reduction
                        continue
            # End of WCR to warp reduction

        # Make nested SDFG out of new scope
        xfh.nest_state_subgraph(sdfg, graph,
                                graph.scope_subgraph(new_me, False, False))

        return new_me
class StreamingMemory(xf.SingleStateTransformation):
    """
    Converts a read or a write to streaming memory access, where data is
    read/written to/from a stream in a separate connected component than the
    computation.
    If 'use_memory_buffering' is True, the transformation reads/writes data
    from memory using a wider data format (e.g. 512 bits), and then converts
    it on the fly to the right data type used by the computation.
    """
    access = xf.PatternNode(nodes.AccessNode)
    entry = xf.PatternNode(nodes.EntryNode)
    exit = xf.PatternNode(nodes.ExitNode)

    buffer_size = properties.Property(
        dtype=int,
        default=1,
        desc='Set buffer size for the newly-created stream')

    storage = properties.EnumProperty(
        dtype=dtypes.StorageType,
        desc='Set storage type for the newly-created stream',
        default=dtypes.StorageType.Default)

    use_memory_buffering = properties.Property(
        dtype=bool,
        default=False,
        desc='Set if memory buffering should be used.')

    memory_buffering_target_bytes = properties.Property(
        dtype=int,
        default=64,
        desc=
        'Set bytes read/written from memory if memory buffering is enabled.')

    @classmethod
    def expressions(cls) -> List[gr.SubgraphView]:
        """
        Returns the two matched patterns: expression 0 is an access node
        followed by a scope entry (read), expression 1 is a scope exit
        followed by an access node (write).
        """
        return [
            sdutil.node_path_graph(cls.access, cls.entry),
            sdutil.node_path_graph(cls.exit, cls.access),
        ]

    def can_be_applied(self,
                       graph: SDFGState,
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Checks whether the matched access node can be converted to a stream.

        :param graph: The state that contains the matched nodes.
        :param expr_index: 0 for the read pattern, 1 for the write pattern
                           (see ``expressions``).
        :param sdfg: The SDFG that contains ``graph``.
        :param permissive: Not read by this method.
        :return: True if every check below passes, False otherwise.
        """
        access = self.access
        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.out_degree(access) > 0 and graph.in_degree(access) > 0:
            return False

        # If already a stream, skip
        if isinstance(sdfg.arrays[access.data], data.Stream):
            return False
        # If does not exist on off-chip memory, skip
        if sdfg.arrays[access.data].storage not in [
                dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_Pinned,
                dtypes.StorageType.GPU_Global, dtypes.StorageType.FPGA_Global
        ]:
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            if curstate.parent.parent_nsdfg_node is None:
                break
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Only one memlet path is allowed per outgoing/incoming edge
        edges = (graph.out_edges(access)
                 if expr_index == 0 else graph.in_edges(access))
        for edge in edges:
            mpath = graph.memlet_path(edge)
            # A memlet tree with more edges than the single path means the
            # memlet branches somewhere along the way.
            if len(mpath) != len(list(graph.memlet_tree(edge))):
                return False

            # The innermost end of the path must have a clearly defined memory
            # access pattern
            innermost_edge = mpath[-1] if expr_index == 0 else mpath[0]
            if (innermost_edge.data.subset.num_elements() != 1
                    or innermost_edge.data.dynamic
                    or innermost_edge.data.volume != 1):
                return False

            # Check if any of the maps has a dynamic range
            # These cases can potentially work but some nodes (and perhaps
            # tasklets) need to be replicated, which are difficult to track.
            for pe in mpath:
                node = pe.dst if expr_index == 0 else graph.entry_node(pe.src)
                if isinstance(
                        node, nodes.MapEntry) and sdutil.has_dynamic_map_inputs(
                            graph, node):
                    return False

        # If already applied on this memlet and this is the I/O component, skip
        # (the maps created by ``apply`` are labeled with the prefix '__s')
        if expr_index == 0:
            other_node = self.entry
        else:
            other_node = self.exit
            other_node = graph.entry_node(other_node)
        if other_node.label.startswith('__s'):
            return False

        ## Check Memory Buffering Properties
        if self.use_memory_buffering:
            access = self.access
            desc = sdfg.arrays[access.data]

            # Array has to be global array
            if desc.storage != dtypes.StorageType.FPGA_Global:
                return False

            # Type has to divide target bytes
            if self.memory_buffering_target_bytes % desc.dtype.bytes != 0:
                return False

            # Target bytes has to be >= size of data type
            if self.memory_buffering_target_bytes < desc.dtype.bytes:
                return False

            strides = list(desc.strides)

            # Last stride has to be one
            if strides[-1] != 1:
                return False

            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)
            strides.pop()  # Remove last element since we already checked it

            # Other strides have to be divisible by vector size
            # (only integer strides are checked here)
            for stride in strides:
                if is_int(stride) and stride % vector_size != 0:
                    return False

            # Check if map has the right access pattern
            # Stride 1 access by innermost loop, innermost loop counter has to be divisible by vector size
            # Same code as in apply
            # NOTE(review): this section uses self.state_id and
            # self.expr_index instead of the `graph` and `expr_index`
            # arguments - presumably they refer to the same match; confirm.
            state = sdfg.node(self.state_id)
            dnode: nodes.AccessNode = self.access
            if self.expr_index == 0:
                edges = state.out_edges(dnode)
            else:
                edges = state.in_edges(dnode)

            mapping: Dict[
                Tuple[subsets.Range],
                List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
            ranges = {}
            for edge in edges:
                mpath = state.memlet_path(edge)
                ranges[edge] = _collect_map_ranges(state, mpath)
                mapping[tuple(r[1] for r in ranges[edge])].append(edge)

            for edges_with_same_range in mapping.values():
                for edge in edges_with_same_range:
                    # Get memlet path and innermost edge
                    mpath = state.memlet_path(edge)
                    innermost_edge = copy.deepcopy(
                        mpath[-1] if self.expr_index == 0 else mpath[0])

                    # Start expression of every dimension of the innermost
                    # subset
                    edge_subset = [
                        a_tuple[0]
                        for a_tuple in list(innermost_edge.data.subset)
                    ]

                    # From here on `ranges` is rebound to the range list of
                    # the innermost map; the dictionary built above is no
                    # longer read.
                    if self.expr_index == 0:
                        map_subset = innermost_edge.src.map.params.copy()
                        ranges = list(innermost_edge.src.map.range)
                    else:
                        map_subset = innermost_edge.dst.map.params.copy()
                        ranges = list(innermost_edge.dst.map.range)

                    # Check is correct access pattern
                    # Correct ranges in map
                    if is_int(ranges[-1]
                              [1]) and (ranges[-1][1] + 1) % vector_size != 0:
                        return False

                    if ranges[-1][2] != 1:
                        return False

                    # Correct access in array: the last index is either the
                    # last map parameter itself, or a sum that contains it
                    # exactly once as a direct term
                    if isinstance(edge_subset[-1], symbol) and str(
                            edge_subset[-1]) == map_subset[-1]:
                        pass
                    elif isinstance(edge_subset[-1], sympy.core.add.Add):
                        counter: int = 0
                        for arg in edge_subset[-1].args:
                            if isinstance(
                                    arg,
                                    symbol) and str(arg) == map_subset[-1]:
                                counter += 1

                        if counter != 1:
                            return False
                    else:
                        return False

        return True

    def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
        """
        Replaces the memlet paths of the matched access node by stream
        accesses and creates separate read/write components that move the
        data between the original container and the new streams.

        :param state: The state that contains the matched nodes.
        :param sdfg: The SDFG that contains ``state``.
        :return: The list ``ionodes`` of stream access nodes added to the new
                 read/write components.
                 NOTE(review): the signature is annotated as returning a
                 single AccessNode, but a list is returned.
        """
        dnode: nodes.AccessNode = self.access
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern
        components_to_create: Dict[
            Tuple[symbolic.SymbolicType],
            List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                mpath = state.memlet_path(edge)
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                               0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    # An edge whose destination is reachable from another
                    # edge's destination in the same component is moved to a
                    # component of its own
                    if any(
                            nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                            for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [
                        c for c in component if c not in edges_to_remove
                    ]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:

            if self.use_memory_buffering:

                arrname = str(self.access)

                # Add gearbox
                total_size = edge.data.volume
                vector_size = int(self.memory_buffering_target_bytes /
                                  desc.dtype.bytes)

                if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=sdfg.arrays[dnode.data].shape[-1],
                                vec=vector_size))

                for i in sdfg.arrays[dnode.data].strides:
                    if not is_int(i):
                        warnings.warn(
                            "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=i, vec=vector_size))

                # NOTE: `edges` is rebound below while the enclosing loop is
                # still iterating over the list it was bound to before; the
                # rebinding does not change which edges are visited.
                if self.expr_index == 0:  # Read
                    edges = state.out_edges(dnode)
                    gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                    gearbox_output_type = desc.dtype
                    gearbox_read_volume = total_size / vector_size
                    gearbox_write_volume = total_size
                else:  # Write
                    edges = state.in_edges(dnode)
                    gearbox_input_type = desc.dtype
                    gearbox_output_type = dtypes.vector(
                        desc.dtype, vector_size)
                    gearbox_read_volume = total_size
                    gearbox_write_volume = total_size / vector_size

                input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_input",
                    gearbox_input_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_output",
                    gearbox_output_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                read_to_gearbox = state.add_read(input_gearbox_name)
                write_from_gearbox = state.add_write(output_gearbox_name)

                gearbox = Gearbox(total_size / vector_size)

                state.add_node(gearbox)

                state.add_memlet_path(read_to_gearbox,
                                      gearbox,
                                      dst_conn="from_memory",
                                      memlet=Memlet(
                                          input_gearbox_name + "[0]",
                                          volume=gearbox_read_volume))
                state.add_memlet_path(gearbox,
                                      write_from_gearbox,
                                      src_conn="to_kernel",
                                      memlet=Memlet(
                                          output_gearbox_name + "[0]",
                                          volume=gearbox_write_volume))

                # `streams[edge]` is the stream used by the memory-side
                # component created further below; `name` is the stream that
                # replaces the original data on the memlet path
                if self.expr_index == 0:
                    streams[edge] = input_gearbox_name
                    name = output_gearbox_name
                    newdesc = output_gearbox_newdesc
                else:
                    streams[edge] = output_gearbox_name
                    name = input_gearbox_name
                    newdesc = input_gearbox_newdesc

            else:
                # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
                stream_name = "stream_" + dnode.data
                name, newdesc = sdfg.add_stream(stream_name,
                                                desc.dtype,
                                                buffer_size=self.buffer_size,
                                                storage=self.storage,
                                                transient=True,
                                                find_new_name=True)
                streams[edge] = name

                # Add these such that we can easily use output_gearbox_name and input_gearbox_name without using if statements
                output_gearbox_name = name
                input_gearbox_name = name

            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            for e in mpath:
                e.data = mm.Memlet(data=name,
                                   subset='0',
                                   other_subset=e.data.other_subset)
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(output_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst,
                               edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(input_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement,
                               edge.dst_conn, edge.data)

        if self.use_memory_buffering:

            arrname = str(self.access)
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

            # Vectorize access to global array.
            dtype = sdfg.arrays[arrname].dtype
            sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
            new_shape = list(sdfg.arrays[arrname].shape)
            contigidx = sdfg.arrays[arrname].strides.index(1)
            new_shape[contigidx] /= vector_size
            # Convert the divided size back to an int where possible; values
            # for which int() raises TypeError are left as they are
            try:
                new_shape[contigidx] = int(new_shape[contigidx])
            except TypeError:
                pass
            sdfg.arrays[arrname].shape = new_shape

            # Change strides
            new_strides: List = list(sdfg.arrays[arrname].strides)

            for i in range(len(new_strides)):
                if i == len(new_strides
                            ) - 1:  # Skip last dimension since it is always 1
                    continue
                new_strides[i] = new_strides[i] / vector_size
            sdfg.arrays[arrname].strides = new_strides

            post_state = get_post_state(sdfg, state)

            if post_state != None:
                # Change subset in the post state such that the correct amount of memory is copied back from the device
                for e in post_state.edges():
                    if e.data.data == self.access.data:
                        new_subset = list(e.data.subset)
                        i, j, k = new_subset[-1]
                        new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                        e.data = mm.Memlet(data=str(e.src),
                                           subset=subsets.Range(new_subset))

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append(
                        (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                # The single input value is copied to every output stream
                code = '\n'.join('__out%d = __inp' % i
                                 for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(
                        f'More than one input found for the same index for {dnode.data}'
                    )
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append(
                        (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            maps = []
            for entry in path:
                map: nodes.Map = entry.map

                ranges = [(p, (r[0], r[1], r[2]))
                          for p, r in zip(map.params, map.range)]

                # Change ranges of map
                if self.use_memory_buffering:
                    # Find edges from/to map

                    edge_subset = [
                        a_tuple[0]
                        for a_tuple in list(innermost_edge.data.subset)
                    ]

                    # Change range of map: when the last index of the access
                    # uses the last map parameter, the end of the last map
                    # range is divided by the vector size
                    if isinstance(edge_subset[-1], symbol) and str(
                            edge_subset[-1]) == map.params[-1]:

                        if not is_int(ranges[-1][1][1]):
                            warnings.warn(
                                "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                .format(sym=ranges[-1][1][1].args[1],
                                        vec=vector_size))

                        ranges[-1] = (ranges[-1][0],
                                      (ranges[-1][1][0],
                                       (ranges[-1][1][1] + 1) / vector_size -
                                       1, ranges[-1][1][2]))

                    elif isinstance(edge_subset[-1], sympy.core.add.Add):

                        for arg in edge_subset[-1].args:
                            if isinstance(
                                    arg,
                                    symbol) and str(arg) == map.params[-1]:

                                if not is_int(ranges[-1][1][1]):
                                    warnings.warn(
                                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                        .format(sym=ranges[-1][1][1].args[1],
                                                vec=vector_size))

                                ranges[-1] = (ranges[-1][0], (
                                    ranges[-1][1][0],
                                    (ranges[-1][1][1] + 1) / vector_size - 1,
                                    ranges[-1][1][2]))

                # The '__s' label prefix is what can_be_applied tests for to
                # recognize components created by this transformation
                maps.append(
                    state.add_map(f'__s{opname}_{mapname}', ranges,
                                  map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node,
                                      *(me for me, _ in maps),
                                      tasklet,
                                      dst_conn=cname,
                                      memlet=memlet)
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet,
                                      *(mx for _, mx in reversed(maps)),
                                      node,
                                      src_conn=cname,
                                      memlet=memlet)

        return ionodes