def set_fast_implementations(sdfg: SDFG,
                             device: dtypes.DeviceType,
                             blocklist: List[str] = None):
    """
    Set fast library node implementations for the given device.

    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param blocklist: List of disallowed implementations.
    :note: Operates in-place on the given SDFG.
    """
    if blocklist is None:
        implementation_prio = find_fast_library(device)
    else:
        implementation_prio = [
            i for i in find_fast_library(device) if i not in blocklist
        ]

    # Specialized nodes: pre-expand nodes whose preferred implementations
    # are all blocked
    for current_sdfg in sdfg.all_sdfgs_recursive():
        for state in current_sdfg.nodes():
            for node in state.nodes():
                if isinstance(node, nodes.LibraryNode):
                    if (node.default_implementation == 'specialize'
                            and (len(
                                set(node.implementations)
                                & set(implementation_prio))) == 0):
                        node.expand(current_sdfg, state)

    # General nodes: pick the first matching implementation by priority
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.LibraryNode):
            for impl in implementation_prio:
                if impl in node.implementations:
                    if isinstance(
                            node,
                            dace.libraries.standard.nodes.reduce.Reduce
                    ) and node.implementation == 'CUDA (block)':
                        continue
                    node.implementation = impl
                    break

    # Reduce nodes
    if device == dtypes.DeviceType.GPU:
        for node, state in sdfg.all_nodes_recursive():
            if isinstance(node, dace.nodes.LibraryNode):
                # Use CUB for device-level reductions
                if ('CUDA (device)' in node.implementations
                        and not is_devicelevel_gpu(state.parent, state, node)
                        and state.scope_dict()[node] is None):
                    node.implementation = 'CUDA (device)'
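# Usage sketch (illustrative addition, not part of the original source):
# running set_fast_implementations on a freshly parsed SDFG before
# compilation. The program and the 'MKL' blocklist entry are assumptions
# for demonstration.
def _example_set_fast_implementations():
    """Hypothetical helper demonstrating set_fast_implementations."""
    import dace
    from dace import dtypes

    @dace.program
    def gemm_example(A: dace.float64[64, 64], B: dace.float64[64, 64]):
        return A @ B  # Produces a matrix-multiplication library node

    sdfg = gemm_example.to_sdfg()
    # Prefer fast CPU library implementations, but never select 'MKL'
    set_fast_implementations(sdfg, dtypes.DeviceType.CPU, blocklist=['MKL'])
    return sdfg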
def _can_allocate(self, sdfg: SDFG, state: SDFGState, desc: data.Data,
                  scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> bool:
    schedule = self._get_schedule(scope)
    if dtypes.can_allocate(desc.storage, schedule):
        return True

    # The storage cannot be allocated directly under this schedule;
    # check for device-level memory recursively
    node = scope if isinstance(scope, nodes.EntryNode) else None
    cstate = scope if isinstance(scope, SDFGState) else state
    csdfg = scope if isinstance(scope, SDFG) else sdfg

    if desc.storage in dtypes.FPGA_STORAGES:
        return sdscope.is_devicelevel_fpga(csdfg, cstate, node)
    elif desc.storage in dtypes.GPU_STORAGES:
        return sdscope.is_devicelevel_gpu(csdfg, cstate, node)

    return False
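# Illustrative sketch (added, not from the original source) of the fast-path
# query used above: dtypes.can_allocate answers whether a storage type may
# be allocated from code running under a given schedule. The specific
# storage/schedule pairs below are assumptions for demonstration.
def _example_can_allocate():
    """Hypothetical helper showing the direct allocation check."""
    from dace import dtypes
    # Expected False here: device-global memory is allocated from the host,
    # not from within a GPU kernel thread
    inside = dtypes.can_allocate(dtypes.StorageType.GPU_Global,
                                 dtypes.ScheduleType.GPU_Device)
    # Expected True here: host code may allocate GPU global memory
    outside = dtypes.can_allocate(dtypes.StorageType.GPU_Global,
                                  dtypes.ScheduleType.CPU_Multicore)
    return inside, outside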
def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs):
    from dace.sdfg.scope import is_devicelevel_gpu
    if is_devicelevel_gpu(sdfg, state, node):
        # BLAS calls cannot be emitted inside a GPU kernel; fall back
        return ExpandGemvPure.expansion(node, state, sdfg)
    node.validate(sdfg, state)

    ((edge_a, outer_array_a, shape_a, strides_a),
     (edge_x, outer_array_x, shape_x, strides_x),
     (edge_y, outer_array_y, shape_y,
      strides_y)) = _get_matmul_operands(node,
                                         state,
                                         sdfg,
                                         name_lhs="_A",
                                         name_rhs="_x",
                                         name_out="_y")
    dtype_a = outer_array_a.dtype.type
    dtype = outer_array_x.dtype.base_type
    veclen = outer_array_x.dtype.veclen
    m = m or node.m
    n = n or node.n
    if m is None:
        m = shape_y[0]
    if n is None:
        n = shape_x[0]

    transA = node.transA
    if strides_a[0] == 1:
        transA = not transA
        lda = strides_a[1]
    elif strides_a[1] == 1:
        lda = strides_a[0]
    else:
        warnings.warn('Matrix must be contiguous in at least '
                      'one dimension. Falling back to pure expansion.')
        return ExpandGemvPure.expansion(node, state, sdfg, m=m, n=n,
                                        **kwargs)

    layout = 'CblasColMajor'
    trans = 'CblasNoTrans' if transA else 'CblasTrans'
    if not node.transA:
        m, n = n, m

    if veclen != 1:
        warnings.warn('Vector GEMV not supported, falling back to pure.')
        return ExpandGemvPure.expansion(node, state, sdfg, m=m, n=n,
                                        **kwargs)

    func, ctype, runtimetype = blas_helpers.cublas_type_metadata(dtype)
    func = func.lower() + 'gemv'

    code = f"""cblas_{func}({layout}, {trans}, {m}, {n}, {node.alpha}, _A, {lda}, _x, {strides_x[0]}, {node.beta}, _y, {strides_y[0]});"""

    tasklet = dace.sdfg.nodes.Tasklet(node.name,
                                      node.in_connectors,
                                      node.out_connectors,
                                      code,
                                      language=dace.dtypes.Language.CPP)
    return tasklet
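# Usage sketch (illustrative, not from the original source): selecting a
# BLAS expansion for the library nodes of a matrix-vector product. The
# program and the 'OpenBLAS' implementation name are assumptions for
# demonstration.
def _example_gemv_expansion():
    """Hypothetical helper expanding a GEMV through a BLAS implementation."""
    import dace

    @dace.program
    def gemv_example(A: dace.float64[32, 16], x: dace.float64[16]):
        return A @ x  # Produces a matrix-vector product library node

    sdfg = gemv_example.to_sdfg()
    # Pick an expansion by name before expanding library nodes
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, dace.sdfg.nodes.LibraryNode):
            if 'OpenBLAS' in node.implementations:
                node.implementation = 'OpenBLAS'
    sdfg.expand_library_nodes()
    return sdfg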
def apply(self, sdfg: sd.SDFG):

    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors
    input_nodes = []
    output_nodes = []
    global_code_nodes: Dict[sd.SDFGState,
                            List[nodes.Tasklet]] = defaultdict(list)

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and not node.desc(sdfg).transient):
                if (state.out_degree(node) > 0
                        and node.data not in input_nodes):
                    # Special case: nodes that lead to top-level dynamic
                    # map ranges must stay on host
                    for e in state.out_edges(node):
                        last_edge = state.memlet_path(e)[-1]
                        if (isinstance(last_edge.dst, nodes.EntryNode)
                                and last_edge.dst_conn
                                and not last_edge.dst_conn.startswith('IN_')
                                and sdict[last_edge.dst] is None):
                            break
                    else:
                        input_nodes.append((node.data, node.desc(sdfg)))
                if (state.in_degree(node) > 0
                        and node.data not in output_nodes):
                    output_nodes.append((node.data, node.desc(sdfg)))

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None:
                if (e.data.data not in input_nodes
                        and not sdfg.arrays[e.data.data].transient):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in set(input_nodes):
        if isinstance(inode, data.Scalar):  # Scalars can remain on host
            continue
        if inode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = inode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + inodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[inodename] = name

    for onodename, onode in set(output_nodes):
        if onodename in cloned_arrays:
            continue
        if onode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = onode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + onodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[onodename] = name

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state

    excluded_copyin = self.exclude_copyin.split(',')

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, sd.InterstateEdge())

    for nname, desc in dtypes.deduplicate(input_nodes):
        if nname in excluded_copyin or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state

    excluded_copyout = self.exclude_copyout.split(',')

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, sd.InterstateEdge())

    for nname, desc in dtypes.deduplicate(output_nodes):
        if nname in excluded_copyout or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    const_syms = xfh.constant_symbols(sdfg)

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient):
                nodedesc = node.desc(sdfg)

                # Special case: nodes that lead to dynamic map ranges must
                # stay on host
                if any(
                        isinstance(
                            state.memlet_path(e)[-1].dst, nodes.EntryNode)
                        for e in state.out_edges(node)):
                    continue

                gpu_storage = [
                    dtypes.StorageType.GPU_Global,
                    dtypes.StorageType.GPU_Shared,
                    dtypes.StorageType.CPU_Pinned
                ]
                if (sdict[node] is None
                        and nodedesc.storage not in gpu_storage):
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = dtypes.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    dsyms = set(map(str, nodedesc.free_symbols))
                    if (self.toplevel_trans
                            and not isinstance(nodedesc,
                                               (data.Stream, data.View))
                            and len(dsyms - const_syms) == 0):
                        nodedesc.lifetime = dtypes.AllocationLifetime.SDFG
                elif nodedesc.storage not in gpu_storage:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = dtypes.StorageType.Register

    #######################################################
    # Step 5: Change all top-level maps and library nodes to GPU schedule

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if sdict[node] is None:
                if isinstance(node, (nodes.LibraryNode, nodes.NestedSDFG)):
                    node.schedule = dtypes.ScheduleType.GPU_Default
                elif isinstance(node, nodes.EntryNode):
                    node.schedule = dtypes.ScheduleType.GPU_Device
            elif self.sequential_innermaps:
                if isinstance(node, (nodes.EntryNode, nodes.LibraryNode)):
                    node.schedule = dtypes.ScheduleType.Sequential
                elif isinstance(node, nodes.NestedSDFG):
                    for nnode, _ in node.sdfg.all_nodes_recursive():
                        if isinstance(nnode,
                                      (nodes.EntryNode, nodes.LibraryNode)):
                            nnode.schedule = dtypes.ScheduleType.Sequential

    #######################################################
    # Step 6: Wrap free tasklets and nested SDFGs with a GPU map

    # Collect free tasklets
    for node, state in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.Tasklet):
            if (state.entry_node(node) is None
                    and not scope.is_devicelevel_gpu(
                        state.parent, state, node, with_gpu_default=True)):
                global_code_nodes[state].append(node)

    for state, gcodes in global_code_nodes.items():
        for gcode in gcodes:
            if gcode.label in self.exclude_tasklets.split(','):
                continue
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=dtypes.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = {('IN_' + e.dst_conn): None
                                for e in in_edges}
            me.out_connectors = {('OUT_' + e.dst_conn): None
                                 for e in in_edges}
            mx.in_connectors = {('IN_' + e.src_conn): None
                                for e in out_edges}
            mx.out_connectors = {('OUT_' + e.src_conn): None
                                 for e in out_edges}

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.Memlet())

    #######################################################
    # Step 7: Introduce copy-out if data used in outgoing interstate edges

    for state in list(sdfg.nodes()):
        arrays_used = set()
        for e in sdfg.out_edges(state):
            # Used arrays = intersection between symbols and cloned arrays
            arrays_used.update(
                set(e.data.free_symbols) & set(cloned_arrays.keys()))

        # Create a state and copy out used arrays
        if len(arrays_used) > 0:
            co_state = sdfg.add_state(state.label + '_icopyout')

            # Reconnect outgoing edges to after interim copyout state
            for e in sdfg.out_edges(state):
                sdutil.change_edge_src(sdfg, state, co_state)
            # Add unconditional edge to interim state
            sdfg.add_edge(state, co_state, sd.InterstateEdge())

            # Add copy-out nodes
            for nname in arrays_used:
                desc = sdfg.arrays[nname]
                src_array = nodes.AccessNode(cloned_arrays[nname],
                                             debuginfo=desc.debuginfo)
                dst_array = nodes.AccessNode(nname,
                                             debuginfo=desc.debuginfo)
                co_state.add_node(src_array)
                co_state.add_node(dst_array)
                co_state.add_nedge(
                    src_array, dst_array,
                    memlet.Memlet.from_array(dst_array.data,
                                             dst_array.desc(sdfg)))

    #######################################################
    # Step 8: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    sdfg.apply_strict_transformations()
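# Usage sketch (illustrative, not from the original source): applying this
# transformation to offload a full SDFG to the GPU. GPUTransformSDFG is the
# class this `apply` belongs to; the program is an assumption for
# demonstration.
def _example_gpu_transform():
    """Hypothetical helper running GPUTransformSDFG on a small program."""
    import dace
    from dace.transformation.interstate import GPUTransformSDFG

    @dace.program
    def scale(x: dace.float64[1024]):
        x[:] = 2.0 * x

    sdfg = scale.to_sdfg()
    # apply_transformations returns the number of applications performed;
    # sequential_innermaps is the option used in Step 5 above
    sdfg.apply_transformations(GPUTransformSDFG,
                               options={'sequential_innermaps': True})
    return sdfg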
def apply(self, state: SDFGState, sdfg: SDFG):
    adesc = self.a.desc(sdfg)
    bdesc = self.b.desc(sdfg)
    edge = state.edges_between(self.a, self.b)[0]

    if len(adesc.shape) >= len(bdesc.shape):
        copy_shape = edge.data.get_src_subset(edge, state).size()
        copy_a = True
    else:
        copy_shape = edge.data.get_dst_subset(edge, state).size()
        copy_a = False

    maprange = {f'__i{i}': (0, s - 1, 1) for i, s in enumerate(copy_shape)}

    av = self.a.data
    bv = self.b.data
    avnode = self.a
    bvnode = self.b

    # Linearize and delinearize to get index expression for other side
    if copy_a:
        a_index = [
            symbolic.pystr_to_symbolic(f'__i{i}')
            for i in range(len(copy_shape))
        ]
        b_index = self.delinearize_linearize(
            bdesc, copy_shape, edge.data.get_dst_subset(edge, state))
    else:
        a_index = self.delinearize_linearize(
            adesc, copy_shape, edge.data.get_src_subset(edge, state))
        b_index = [
            symbolic.pystr_to_symbolic(f'__i{i}')
            for i in range(len(copy_shape))
        ]

    a_subset = subsets.Range([(ind, ind, 1) for ind in a_index])
    b_subset = subsets.Range([(ind, ind, 1) for ind in b_index])

    # Set schedule based on GPU arrays
    schedule = dtypes.ScheduleType.Default
    if (adesc.storage == dtypes.StorageType.GPU_Global
            or bdesc.storage == dtypes.StorageType.GPU_Global):
        # If already inside GPU kernel
        if is_devicelevel_gpu(sdfg, state, self.a):
            schedule = dtypes.ScheduleType.Sequential
        else:
            schedule = dtypes.ScheduleType.GPU_Device

    # Add copy map
    t, _, _ = state.add_mapped_tasklet(
        'copy',
        maprange,
        dict(__inp=Memlet(data=av, subset=a_subset)),
        '__out = __inp',
        dict(__out=Memlet(data=bv, subset=b_subset)),
        schedule,
        external_edges=True,
        input_nodes={av: avnode},
        output_nodes={bv: bvnode})

    # Set connector types (due to this transformation appearing in codegen,
    # after connector types have been resolved)
    t.in_connectors['__inp'] = adesc.dtype
    t.out_connectors['__out'] = bdesc.dtype

    # Remove old edge
    state.remove_edge(edge)
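# Usage sketch (illustrative, not from the original source): CopyToMap
# rewrites a plain copy edge between two access nodes into an explicit
# elementwise map, useful when the backend cannot express the copy
# directly. The import path, array sizes, and applicability of the
# transformation to this simple graph are assumptions for demonstration.
def _example_copy_to_map():
    """Hypothetical helper applying CopyToMap to a copy between arrays."""
    import dace
    from dace.transformation.dataflow import CopyToMap

    sdfg = dace.SDFG('copy_example')
    sdfg.add_array('A', [32, 32], dace.float64)
    sdfg.add_array('B', [32, 32], dace.float64)
    state = sdfg.add_state()
    a = state.add_read('A')
    b = state.add_write('B')
    # A whole-array copy edge between the two access nodes
    state.add_nedge(a, b, dace.Memlet.from_array('A', sdfg.arrays['A']))
    # Rewrites the copy edge into a mapped tasklet performing __out = __inp
    sdfg.apply_transformations(CopyToMap)
    return sdfg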