def write_and_resolve_expr(sdfg, memlet, nc, outname, inname, indices=None):
    """Emit a C++ ``write_and_resolve`` call statement for the given memlet.

    :param sdfg: SDFG used to unparse custom reduction lambdas.
    :param memlet: Memlet whose ``wcr`` defines the reduction to resolve.
    :param nc: If True, emit the non-conflicting (``_nc``) variant.
    :param outname: Expression naming the written (output) object.
    :param inname: Expression naming the value being written.
    :param indices: Optional index expression appended as a call argument.
    :return: A string containing the full call statement.
    """
    redtype = operations.detect_reduction_type(memlet.wcr)
    suffix = "_nc" if nc else ""
    index_arg = (", " + indices) if indices is not None else ""

    template_args = ""
    custom_arg = ""
    if redtype == dtypes.ReductionType.Custom:
        # Custom reductions pass the unparsed lambda as an extra argument
        custom_arg = ', %s' % unparse_cr(sdfg, memlet.wcr)
    else:
        # Known reduction types become a C++ template parameter
        enum_name = str(redtype)
        cpp_redtype = "dace::ReductionType::" + enum_name[enum_name.find(".") + 1:]
        template_args = "<%s>" % cpp_redtype

    return (f"{outname}.write_and_resolve{suffix}{template_args}"
            f"({inname}{custom_arg}{index_arg});")
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet],
                function_stream: CodeIOStream,
                callsite_stream: CodeIOStream) -> None:
    """Generate code for a copy along ``edge``, handling SVE WCR reductions.

    If the edge carries a write-conflict resolution (WCR) that is supported
    in SVE, the vector register is first reduced to a scalar with an SVE
    intrinsic and then resolved to memory through the CPU code generator.
    Otherwise, the copy is delegated to the superclass.
    """
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    # nc == True means the write is known to be conflict-free
    nc = not is_write_conflicted(dfg, edge)
    desc = edge.src.desc(sdfg)
    if not nc or not isinstance(desc.dtype, (dtypes.pointer, dtypes.vector)):
        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality
        #
        # '@' is used as a placeholder input name so the emitted expression
        # can be split and the SVE reduction call spliced in at that spot.
        wcr = self.cpu_codegen.write_and_resolve_expr(sdfg,
                                                     edge.data,
                                                     not nc,
                                                     None,
                                                     '@',
                                                     dtype=desc.dtype)
        # Splice: <prefix up to '@'> <sve_reduce>(<predicate>, <src>) <suffix>
        callsite_stream.write(wcr[:wcr.find('@')] +
                              util.REDUCTION_TYPE_TO_SVE[reduction_type] +
                              f'(svptrue_{util.TYPE_TO_SVE_SUFFIX[desc.dtype]}(), ' +
                              src_node.label + wcr[wcr.find('@') + 1:] + ');')
        return
    else:
        ######################
        # Horizontal non-atomic reduction
        raise NotImplementedError()

    # NOTE(review): unreachable — both branches above return/raise; kept for
    # safety should the branch structure change.
    return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge,
                               function_stream, callsite_stream)
def reduction_type(self):
    """Detect and return this node's reduction type, restricted to the
    operations NCCL supports (sum, product, min, max)."""
    detected = detect_reduction_type(self.wcr)
    if detected in nutil.NCCL_SUPPORTED_OPERATIONS:
        return detected
    raise ValueError(
        'NCCL only supports sum, product, min and max operations.')
def write_and_resolve_expr(self,
                           sdfg,
                           memlet,
                           nc,
                           outname,
                           inname,
                           indices=None,
                           dtype=None):
    """ Emits a conflict resolution call from a memlet. """
    redtype = operations.detect_reduction_type(memlet.wcr)

    # Resolve the destination pointer expression, optionally offset by an
    # index expression given as a string.
    if isinstance(indices, str):
        ptr = '%s + %s' % (cpp.cpp_ptr_expr(sdfg, memlet), indices)
    else:
        ptr = cpp.cpp_ptr_expr(sdfg, memlet, indices=indices)

    # Pointers resolve to their pointee type
    if isinstance(dtype, dtypes.pointer):
        dtype = dtype.base_type

    # General (custom) reduction
    if redtype == dtypes.ReductionType.Custom:
        raise NotImplementedError('General reductions not yet implemented')

    # Known reduction type: map the Python enum to its C++ counterpart
    enum_name = str(redtype)
    credtype = "dace::ReductionType::" + enum_name[enum_name.find(".") + 1:]

    if isinstance(dtype, dtypes.vector):
        return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                f'{ptr}, {inname})')
    return (f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
            f'{ptr}, {inname})')
def expansion(node, state, sdfg):
    """Dispatch a MatMul library node to a concrete BLAS node (GEMM,
    batched matmul, GEMV, or dot product) based on operand ranks."""
    a, b, c = _get_matmul_operands(node, state, sdfg)
    size_a = a[2]
    size_b = b[2]

    if len(size_a) == 2 and len(size_b) == 2:
        # Matrix and matrix -> GEMM
        from dace.libraries.blas.nodes.gemm import Gemm

        # A sum-WCR on the output folds into GEMM's beta=1 accumulation
        beta = 0.0
        if c[0].data.wcr:
            from dace.frontend import operations
            redtype = operations.detect_reduction_type(c[0].data.wcr)
            if redtype == dace.dtypes.ReductionType.Sum:
                beta = 1.0
            else:
                warnings.warn("Unsupported WCR in output of MatMul "
                              "library node: {}".format(c[0].data.wcr))
        return Gemm(node.name + 'gemm',
                    location=node.location,
                    alpha=1.0,
                    beta=beta)

    if len(size_b) == 3 and len(size_a) in (2, 3):
        # Batched matrix and matrix -> batched matrix multiplication
        from dace.libraries.blas.nodes.batched_matmul import BatchedMatMul
        return BatchedMatMul(node.name + 'bmm', location=node.location)

    if len(size_a) == 2 and len(size_b) == 1:
        # Matrix and vector -> GEMV
        from dace.libraries.blas.nodes.gemv import Gemv
        # Rename inputs to match dot naming
        a[0].dst_conn = "_A"
        b[0].dst_conn = "_x"
        c[0].src_conn = "_y"
        return Gemv(node.name + 'gemv', location=node.location)

    if len(size_a) == 1 and len(size_b) == 2:
        # Vector and matrix -> GEMV with transposed matrix
        from dace.libraries.blas.nodes.gemv import Gemv
        # Rename inputs to match dot naming
        a[0].dst_conn = "_x"
        b[0].dst_conn = "_A"
        c[0].src_conn = "_y"
        return Gemv(node.name + 'gemvt',
                    location=node.location,
                    transA=True)

    if len(size_a) == 1 and len(size_b) == 1:
        # Vector and vector -> dot product
        from dace.libraries.blas.nodes.dot import Dot
        # Rename inputs to match dot naming
        a[0].dst_conn = "_x"
        b[0].dst_conn = "_y"
        c[0].src_conn = "_result"
        return Dot(node.name + 'dot', location=node.location)

    raise NotImplementedError("Matrix multiplication not implemented "
                              "for shapes: {} and {}".format(size_a, size_b))
def apply(self, sdfg):
    """Apply AccumulateTransient: insert a local (transient) accumulation
    buffer between the inner and outer map exits via LocalStorage, and
    zero-initialize it when the WCR is a summation.

    Fix: the fallback warning previously concatenated to
    "initializenewly-created" (missing space between the adjacent string
    literals).
    """
    graph = sdfg.node(self.state_id)

    # Avoid import loop
    from dace.transformation.dataflow.local_storage import LocalStorage

    local_storage_subgraph = {
        LocalStorage._node_a:
        self.subgraph[AccumulateTransient._map_exit],
        LocalStorage._node_b:
        self.subgraph[AccumulateTransient._outer_map_exit]
    }
    sdfg_id = sdfg.sdfg_list.index(sdfg)
    in_local_storage = LocalStorage(sdfg_id, self.state_id,
                                    local_storage_subgraph, self.expr_index)
    in_local_storage.array = self.array
    in_local_storage.apply(sdfg)

    # Initialize transient to zero in case of summation
    # TODO: Initialize transient in other WCR types
    memlet = graph.in_edges(in_local_storage._data_node)[0].data
    if detect_reduction_type(memlet.wcr) == dtypes.ReductionType.Sum:
        in_local_storage._data_node.setzero = True
    else:
        warnings.warn('AccumulateTransient did not properly initialize '
                      'newly-created transient!')
def can_be_applied(graph: SDFGState,
                   candidate,
                   expr_index,
                   sdfg,
                   strict=False):
    """Check whether the GPUMultiTransformMap transformation can be applied
    to the candidate map: multiple GPUs available, default schedules only,
    no dynamic inputs, no library nodes, no custom WCR, and all neighboring
    access nodes in an allowed storage location."""
    map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]

    # Check if there is more than one GPU available:
    if Config.get("compiler", "cuda", "max_number_gpus") < 2:
        return False

    # Dynamic map ranges not supported
    if has_dynamic_map_inputs(graph, map_entry):
        return False

    # Only accept maps with a default schedule, all the way up the scope tree
    allowed_schedules = [dtypes.ScheduleType.Default]
    scopes = graph.scope_dict()
    ancestor = scopes[map_entry]
    while ancestor is not None:
        if ancestor.map.schedule not in allowed_schedules:
            return False
        ancestor = scopes[ancestor]

    # Library nodes inside the scope are not supported
    if any(
            isinstance(n, nodes.LibraryNode)
            for n in graph.scope_subgraph(map_entry).nodes()):
        return False

    # Custom reductions can not have an accumulate transient, as the
    # reduction would have to be split up for the ingoing memlet of the
    # accumulate transient and the outgoing memlet. Not using GPU local
    # accumulate transient only works for a small volume of data.
    map_exit = graph.exit_node(map_entry)
    for edge in graph.out_edges(map_exit):
        wcr = edge.data.wcr
        if wcr is not None and operations.detect_reduction_type(
                wcr) == dtypes.ReductionType.Custom:
            return False

    allowed_storage = [
        dtypes.StorageType.Default,
        dtypes.StorageType.CPU_Pinned,
        dtypes.StorageType.CPU_Heap,
        dtypes.StorageType.GPU_Global,
    ]

    # All direct predecessors/successors must be access nodes in allowed
    # storage locations
    for neighbors in (graph.predecessors(map_entry),
                      graph.successors(map_exit)):
        for node in neighbors:
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in allowed_storage:
                return False

    return True
def __label__(self, sdfg, state):
    """Build a human-readable label of the form 'Op: <op>\\nAxes: <axes>'."""
    # Autodetect reduction type
    redtype = detect_reduction_type(self.wcr)
    if redtype == types.ReductionType.Custom:
        # Custom: show the unparsed lambda body
        opname = unparse(ast.parse(self.wcr).body[0].value.body)
    else:
        # Known type: strip the leading "ReductionType." prefix
        opname = str(redtype)
        opname = opname[opname.find('.') + 1:]

    axes_str = 'all' if self.axes is None else str(self.axes)
    return f'Op: {opname}\nAxes: {axes_str}'
def write_and_resolve_expr(self,
                           sdfg,
                           memlet,
                           nc,
                           outname,
                           inname,
                           indices=None,
                           dtype=None):
    """ Emits a conflict resolution call from a memlet. """
    redtype = operations.detect_reduction_type(memlet.wcr, openmp=True)
    defined_type, _ = self._dispatcher.defined_vars.get(memlet.data)

    # Resolve the destination pointer expression, optionally offset by an
    # index expression given as a string.
    if isinstance(indices, str):
        base = cpp.cpp_ptr_expr(sdfg, memlet, defined_type, is_write=True)
        ptr = '%s + %s' % (base, indices)
    else:
        ptr = cpp.cpp_ptr_expr(sdfg,
                               memlet,
                               defined_type,
                               indices=indices,
                               is_write=True)

    # Pointers resolve to their pointee type
    if isinstance(dtype, dtypes.pointer):
        dtype = dtype.base_type

    # General (custom) reduction
    if redtype == dtypes.ReductionType.Custom:
        raise NotImplementedError('General reductions not yet implemented')

    # Subtraction is rewritten as a sum of the negated value
    if redtype == dace.dtypes.ReductionType.Sub:
        credtype = "dace::ReductionType::Sum"
        sign = '-'
    else:
        enum_name = str(redtype)
        credtype = "dace::ReductionType::" + enum_name[enum_name.find(".") +
                                                       1:]
        sign = ''

    if isinstance(dtype, dtypes.vector):
        return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                f'{ptr}, {sign}{inname})')
    return (f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
            f'{ptr}, {sign}{inname})')
def _label(self, shape):
    """Compose a human-readable label for this memlet: data name, access
    count, subset, write-conflict resolution, and other subset."""
    label = self.data if self.data is not None else ''
    if self.subset is None:
        return label

    # Show the access count only when it differs from the subset's size
    num_elements = self.subset.num_elements()
    if self.num_accesses != num_elements:
        if self.num_accesses == -1:
            label += '(dyn) '
        else:
            label += '(%s) ' % SymbolicProperty.to_string(self.num_accesses)

    # Hide the subset when accessing the single zero element of a
    # one-element array
    show_subset = True
    try:
        if shape is not None and reduce(operator.mul, shape, 1) == 1:
            # Don't draw array if we're accessing a single element and it's zero
            if all(s == 0 for s in self.subset.min_element()):
                show_subset = False
    except TypeError:
        # Will fail if trying to check the truth value of a sympy expr
        pass
    if show_subset:
        label += '[%s]' % str(self.subset)

    if self.wcr is not None and str(self.wcr) != '':
        # Autodetect reduction type
        redtype = detect_reduction_type(self.wcr)
        if redtype == dtypes.ReductionType.Custom:
            wcrstr = unparse(ast.parse(self.wcr).body[0].value.body)
        else:
            wcrstr = str(redtype)
            wcrstr = wcrstr[wcrstr.find('.') + 1:]  # Skip "ReductionType."
        label += ' (CR: %s' % wcrstr
        if self.wcr_identity is not None:
            label += ', id: %s' % str(self.wcr_identity)
        label += ')'

    if self.other_subset is not None:
        label += ' -> [%s]' % str(self.other_subset)
    return label
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce node into a CUB block-wide reduction tasklet.

    Emits a ``cub::BlockReduce`` typedef + shared temp storage and a single
    ``Reduce`` call; custom reductions get a generated functor in global
    CUDA code.

    Fix: ``next(node.out_connectors.values())`` raised ``TypeError``
    (``dict_values`` is not an iterator) — wrapped in ``iter()``.

    :raises ValueError: If no identity is given, the reduction is not
        block-wide applicable, or inputs/outputs are not registers.
    """
    node.validate(sdfg, state)
    input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(input_edge.data.subset)
    input_data = sdfg.arrays[input_edge.data.data]
    output_data = sdfg.arrays[output_edge.data.data]

    # Setup all locations in which code will be written
    cuda_globalcode = CodeIOStream()
    localcode = CodeIOStream()

    # Try to autodetect reduction type
    redtype = detect_reduction_type(node.wcr)

    node_id = state.node_id(node)
    state_id = sdfg.node_id(state)
    idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                           state=state_id,
                                           node=node_id)

    # Obtain some SDFG-related information
    input_memlet = input_edge.data
    output_memlet = output_edge.data

    if node.out_connectors:
        # FIX: dict_values is not an iterator; next() requires iter()
        dtype = next(iter(node.out_connectors.values()))
    else:
        dtype = sdfg.arrays[output_memlet.data].dtype
    output_type = dtype.ctype

    if node.identity is None:
        raise ValueError('For device reduce nodes, initial value must be '
                         'specified')

    # Create a functor or use an existing one for reduction
    if redtype == dtypes.ReductionType.Custom:
        body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
        cuda_globalcode.write(
            """
struct __reduce_{id} {{
    template <typename T>
    DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
        {contents}
    }}
}};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
            state_id, node_id)
        reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
    elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
        # NOTE(review): this consults the *device* expansion's special
        # types while the error check below uses the block expansion's —
        # presumably they hold the same entries; verify.
        reduce_op = ''
    else:
        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                      (credtype, output_type)) + ', ' +
                     symstr(node.identity))

    # Try to obtain the number of threads in the block, or use the default
    # configuration
    block_threads = devicelevel_block_size(sdfg, state, node)
    if block_threads is not None:
        block_threads = functools.reduce(lambda a, b: a * b, block_threads,
                                         1)

    # Checks
    if block_threads is None:
        raise ValueError('Block-wide GPU reduction must occur within'
                         ' a GPU kernel')
    if issymbolic(block_threads, sdfg.constants):
        raise ValueError('Block size has to be constant for block-wide '
                         'reduction (got %s)' % str(block_threads))
    if (node.axes is not None and len(node.axes) < input_dims):
        raise ValueError(
            'Only full reduction is supported for block-wide reduce,'
            ' please use the pure expansion')
    if (input_data.storage != dtypes.StorageType.Register
            or output_data.storage != dtypes.StorageType.Register):
        raise ValueError(
            'Block-wise reduction only supports GPU register inputs '
            'and outputs')
    if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES:
        raise ValueError('%s block reduction not supported' % redtype)

    credtype = 'dace::ReductionType::' + str(
        redtype)[str(redtype).find('.') + 1:]
    if redtype == dtypes.ReductionType.Custom:
        redop = '__reduce_%s()' % idstr
    else:
        redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)

    # Allocate shared memory for block reduce
    localcode.write("""
typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id};
__shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id};
""".format(id=idstr,
           type=output_data.dtype.ctype,
           numthreads=block_threads))

    # Offset pointer expressions (avoid shadowing the 'input' builtin)
    input_expr = (input_memlet.data + ' + ' +
                  cpp_array_expr(sdfg, input_memlet, with_brackets=False))
    output_expr = cpp_array_expr(sdfg, output_memlet)

    # NOTE(review): the Reduce call is formatted with the raw array name
    # (input_memlet.data), not the offset expression computed above —
    # looks intentional in the original; confirm before changing.
    localcode.write("""
{output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop});
""".format(id=idstr,
           redop=redop,
           input=input_memlet.data,
           output=output_expr))

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce',
                               {'_in': dace.pointer(input_data.dtype)},
                               {'_out': dace.pointer(output_data.dtype)},
                               localcode.getvalue(),
                               language=dace.Language.CPP)

    # Add the rest of the code
    sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')

    # Rename outer connectors and add to node
    input_edge._dst_conn = '_in'
    output_edge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce node into a CUB device-wide reduction.

    Generates host/device code that calls ``cub::DeviceReduce`` (full
    reduction) or ``cub::DeviceSegmentedReduce`` (reduction over the last
    axes), including temporary-storage allocation in init/exit code, and
    returns a tasklet that invokes the generated kernel wrapper.

    Fixes: ``next(node.out_connectors.values())`` raised ``TypeError``
    (``dict_values`` is not an iterator) — wrapped in ``iter()``; the
    storage-check error messages concatenated to a double space
    ("reside  in").

    :raises ValueError: If no identity is given or data is not in GPU
        global / pinned CPU memory.
    :raises NotImplementedError: For reductions over non-trailing axes.
    """
    node.validate(sdfg, state)
    input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(input_edge.data.subset)
    output_dims = len(output_edge.data.subset)
    input_data = sdfg.arrays[input_edge.data.data]
    output_data = sdfg.arrays[output_edge.data.data]

    # Setup all locations in which code will be written
    cuda_globalcode = CodeIOStream()
    cuda_initcode = CodeIOStream()
    cuda_exitcode = CodeIOStream()
    host_globalcode = CodeIOStream()
    host_localcode = CodeIOStream()
    output_memlet = output_edge.data

    # Try to autodetect reduction type
    redtype = detect_reduction_type(node.wcr)

    node_id = state.node_id(node)
    state_id = sdfg.node_id(state)
    idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                           state=state_id,
                                           node=node_id)

    if node.out_connectors:
        # FIX: dict_values is not an iterator; next() requires iter()
        dtype = next(iter(node.out_connectors.values()))
    else:
        dtype = sdfg.arrays[output_memlet.data].dtype
    output_type = dtype.ctype

    if node.identity is None:
        raise ValueError('For device reduce nodes, initial value must be '
                         'specified')

    # Create a functor or use an existing one for reduction
    if redtype == dtypes.ReductionType.Custom:
        body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
        cuda_globalcode.write(
            """
struct __reduce_{id} {{
    template <typename T>
    DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
        {contents}
    }}
}};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
            state_id, node_id)
        reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
    elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
        # Special types (e.g. min/max/sum) use dedicated CUB entry points
        reduce_op = ''
    else:
        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                      (credtype, output_type)) + ', ' +
                     symstr(node.identity))

    # Obtain some SDFG-related information
    input_memlet = input_edge.data
    reduce_shape = input_memlet.subset.bounding_box_size()
    num_items = ' * '.join(symstr(s) for s in reduce_shape)
    # Offset pointer expressions (renamed to avoid shadowing builtins)
    input_expr = (input_memlet.data + ' + ' +
                  cpp_array_expr(sdfg, input_memlet, with_brackets=False))
    output_expr = (output_memlet.data + ' + ' +
                   cpp_array_expr(sdfg, output_memlet, with_brackets=False))

    input_dims = input_memlet.subset.dims()
    output_dims = output_memlet.subset.data_dims()

    reduce_all_axes = (node.axes is None or len(node.axes) == input_dims)
    if reduce_all_axes:
        reduce_last_axes = False
    else:
        reduce_last_axes = sorted(node.axes) == list(
            range(input_dims - len(node.axes), input_dims))

    if (not reduce_all_axes) and (not reduce_last_axes):
        raise NotImplementedError(
            'Multiple axis reductions not supported on GPUs. Please use '
            'the pure expansion or make reduce axes the last in the array.')

    # Verify that data is on the GPU
    if input_data.storage not in [
            dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
    ]:
        raise ValueError('Input of GPU reduction must either reside '
                         'in global GPU memory or pinned CPU memory')
    if output_data.storage not in [
            dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
    ]:
        raise ValueError('Output of GPU reduction must either reside '
                         'in global GPU memory or pinned CPU memory')

    # Determine reduction type (CUB kernel name)
    kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype]
             if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else
             'Reduce')

    # Create temp memory for this GPU
    cuda_globalcode.write(
        """
void *__cub_storage_{sdfg}_{state}_{node} = NULL;
size_t __cub_ssize_{sdfg}_{state}_{node} = 0;
""".format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
        node)

    if reduce_all_axes:
        reduce_type = 'DeviceReduce'
        reduce_range = num_items
        reduce_range_def = 'size_t num_items'
        reduce_range_use = 'num_items'
        reduce_range_call = num_items
    elif reduce_last_axes:
        num_reduce_axes = len(node.axes)
        not_reduce_axes = reduce_shape[:-num_reduce_axes]
        reduce_axes = reduce_shape[-num_reduce_axes:]

        num_segments = ' * '.join([symstr(s) for s in not_reduce_axes])
        segment_size = ' * '.join([symstr(s) for s in reduce_axes])

        reduce_type = 'DeviceSegmentedReduce'
        iterator = 'dace::stridedIterator({size})'.format(size=segment_size)
        reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments,
                                                      it=iterator)
        reduce_range_def = 'size_t num_segments, size_t segment_size'
        iterator_use = 'dace::stridedIterator(segment_size)'
        reduce_range_use = 'num_segments, {it}, {it} + 1'.format(
            it=iterator_use)
        reduce_range_call = '%s, %s' % (num_segments, segment_size)

    # Call CUB to get the storage size, allocate and free it
    cuda_initcode.write(
        """
cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node}, ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop});
cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node});
""".format(sdfg=sdfg.name,
           state=state_id,
           node=node_id,
           reduce_type=reduce_type,
           reduce_range=reduce_range,
           redop=reduce_op,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           kname=kname), sdfg, state_id, node)

    cuda_exitcode.write(
        'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format(
            sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
        node)

    # Write reduction function definition
    cuda_globalcode.write("""
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream)
{{
cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id}, input, output, {reduce_range_use}{redop}, stream);
}}
""".format(id=idstr,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           reduce_type=reduce_type,
           reduce_range_def=reduce_range_def,
           reduce_range_use=reduce_range_use,
           kname=kname,
           redop=reduce_op))

    # Write reduction function definition in caller file
    host_globalcode.write(
        """
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
""".format(id=idstr,
           reduce_range_def=reduce_range_def,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype), sdfg, state_id, node)

    # Call reduction function where necessary
    host_localcode.write(
        '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);'
        .format(id=idstr,
                input=input_expr,
                output=output_expr,
                reduce_range_call=reduce_range_call))

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce',
                               {'_in': dace.pointer(input_data.dtype)},
                               {'_out': dace.pointer(output_data.dtype)},
                               host_localcode.getvalue(),
                               language=dace.Language.CPP)

    # Add the rest of the code
    sdfg.append_global_code(host_globalcode.getvalue())
    sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')
    sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda')
    sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda')

    # Rename outer connectors and add to node
    input_edge._dst_conn = '_in'
    output_edge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce node into an OpenMP-reduction C++ tasklet.

    Builds a string of nested C loops: optional outer loops over the
    non-reduced (output) dimensions, then inner loops over the reduced
    axes annotated with an OpenMP ``reduction`` clause, accumulating into
    ``_out`` from ``_in``.

    :raises ValueError: If the WCR is not a reduction type that maps to an
        OpenMP reduction operator.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Get reduction type for OpenMP
    redtype = detect_reduction_type(node.wcr, openmp=True)
    if redtype not in ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    omptype, expr = ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP[redtype]

    # Standardize axes (None means reduce over all input dimensions)
    axes = node.axes if node.axes else [i for i in range(input_dims)]
    # Outer loops are needed when only a subset of axes is reduced
    outer_loops = len(axes) != input_dims

    # Create OpenMP clause
    if outer_loops:
        code = '#pragma omp parallel for collapse({cdim})\n'.format(
            cdim=output_dims)
    else:
        code = ''

    from dace.codegen.targets.cpp import sym2cpp

    # Output loops: one '_o<i>' loop per output dimension; also collect the
    # per-dimension stride terms of the output offset expression
    out_offset = []
    if outer_loops:
        for i, sz in enumerate(outedge.data.subset.size()):
            code += 'for (int _o{i} = 0; _o{i} < {sz}; ++_o{i}) {{\n'.format(
                i=i, sz=sym2cpp(sz))
            out_offset.append('_o%d * %s' %
                              (i, sym2cpp(output_data.strides[i])))
    else:
        out_offset.append('0')

    outexpr = '_out[%s]' % ' + '.join(out_offset)

    # Write identity value first
    if node.identity is not None:
        code += '%s = %s;\n' % (outexpr, node.identity)

    # Reduction OpenMP clause
    code += '#pragma omp parallel for collapse({cdim}) ' \
        'reduction({rtype}: {oexpr})\n'.format(cdim=len(axes),
                                               rtype=omptype,
                                               oexpr=outexpr)

    # Reduction loops: one '_i<i>' loop per reduced axis (sorted order)
    for i, axis in enumerate(sorted(axes)):
        sz = sym2cpp(inedge.data.subset.size()[axis])
        code += 'for (int _i{i} = 0; _i{i} < {sz}; ++_i{i}) {{\n'.format(
            i=i, sz=sz)

    # Prepare input offset expression: reduced dims use '_i*' counters,
    # non-reduced dims use the enclosing '_o*' counters
    in_offset = []
    ictr, octr = 0, 0
    for i in range(input_dims):
        if i in axes:
            result = '_i%d' % ictr
            ictr += 1
        else:
            result = '_o%d' % octr
            octr += 1
        in_offset.append('%s * %s' % (result, sym2cpp(input_data.strides[i])))
    in_offset = ' + '.join(in_offset)

    # Reduction expression
    code += expr.format(i='_in[%s]' % in_offset, o=outexpr)
    code += '\n'

    # Closing braces
    code += '}\n' * len(axes)
    if outer_loops:
        code += '}\n' * output_dims

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce',
                               {'_in': dace.pointer(input_data.dtype)},
                               {'_out': dace.pointer(output_data.dtype)},
                               code,
                               language=dace.Language.CPP)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG, partial_width=16):
    '''
    Expand a Reduce node into a nested SDFG implementing an FPGA partial
    reduction: values are first accumulated round-robin into a small local
    buffer of ``partial_width`` partial results, which is then collapsed by
    an unrolled final reduction.

    :param node: the node to expand
    :param state: the state in which the node is in
    :param sdfg: the SDFG in which the node is in
    :param partial_width: Width of the inner reduction buffer. Must be
        larger than the latency of the reduction operation on the
        given data type
    '''
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes (None means reduce over all input dimensions)
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    if input_data.dtype.veclen > 1:
        raise NotImplementedError(
            'Vectorization currently not implemented for FPGA expansion of Reduce.'
        )

    nstate = nsdfg.add_state()

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        all_axis = False
        # Interleave input and output axes to match input memlet:
        # reduced dims use '_i*' indices, kept dims use '_o*' indices
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append(f'_i{ictr}')
                ictr += 1
            else:
                input_subset.append(f'_o{octr}')
                octr += 1

        output_size = outedge.data.subset.size()

        ome, omx = nstate.add_map(
            'reduce_output', {
                f'_o{i}': f'0:{symstr(sz)}'
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
        outm = dace.Memlet(f'_out[{outm_idx}]')
        inm_idx = ','.join(input_subset)
        inmm = dace.Memlet(f'_in[{inm_idx}]')
    else:
        all_axis = True
        ome, omx = None, None
        outm = dace.Memlet('_out[0]')
        inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
        inmm = dace.Memlet(f'_in[{inm_idx}]')

    # Add inner map, which corresponds to the range to reduce
    r = nstate.add_read('_in')
    w = nstate.add_read('_out')

    # TODO support vectorization
    buffer_name = 'partial_results'
    nsdfg.add_array(buffer_name, (partial_width, ),
                    input_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    buffer = nstate.add_access(buffer_name)
    buffer_write = nstate.add_write(buffer_name)

    # Initialize explicitly partial results, as the inner map could run for a number of iteration < partial_width
    init_me, init_mx = nstate.add_map(
        'partial_results_init', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)
    init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                      f'pr_out = {node.identity}')
    nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
    nstate.add_memlet_path(init_tasklet,
                           init_mx,
                           buffer,
                           src_conn='pr_out',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))
    if not all_axis:
        nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

    ime, imx = nstate.add_map(
        'reduce_values', {
            f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
            for i, axis in enumerate(sorted(axes))
        })

    # Accumulate over partial results
    redtype = detect_reduction_type(node.wcr)
    if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    else:
        reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[
            redtype]

    # generate flatten index considering inner map: will be used for indexing into partial results
    ranges_size = ime.range.size()
    inner_index = '+'.join(
        [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
    inner_op = ' + ' if len(axes) > 1 else ''
    inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'

    # Tasklet that folds one input value into its round-robin buffer slot
    partial_reduce_tasklet = nstate.add_tasklet(
        'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

    if not all_axis:
        # Connect input and partial sums
        nstate.add_memlet_path(r,
                               ome,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    else:
        nstate.add_memlet_path(r,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)

    nstate.add_memlet_path(
        buffer,
        ime,
        partial_reduce_tasklet,
        dst_conn='buffer_in',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))
    nstate.add_memlet_path(
        partial_reduce_tasklet,
        imx,
        buffer_write,
        src_conn='buffer_out',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))

    # Then perform reduction on partial results (unrolled over the buffer)
    reduce_entry, reduce_exit = nstate.add_map(
        'reduce', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)

    # First iteration seeds the accumulator with the identity value
    reduce_tasklet = nstate.add_tasklet(
        'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
    nstate.add_memlet_path(buffer_write,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='data_in',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))

    reduce_name = 'reduce_result'
    nsdfg.add_array(reduce_name, (1, ),
                    output_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    reduce_read = nstate.add_access(reduce_name)
    reduce_access = nstate.add_access(reduce_name)

    if not all_axis:
        nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

    nstate.add_memlet_path(reduce_read,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='reduce_in',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))
    nstate.add_memlet_path(reduce_tasklet,
                           reduce_exit,
                           reduce_access,
                           src_conn='reduce_out',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))

    if not all_axis:
        # Write out the result
        nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
    else:
        nstate.add_memlet_path(reduce_access, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    nsdfg.validate()
    return nsdfg
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        :param sdfg: The SDFG containing the reduce node.
        :param graph: The state in which the reduce node resides.
        :param reduce_node: The library reduce node to expand.
        :raises NotImplementedError: For warp-based implementations and
            custom reductions without an update tasklet code template.
        :raises ValueError: If the reduction identity cannot be inferred.
    """
    out_storage_node = graph.out_edges(reduce_node)[0].dst
    in_storage_node = graph.in_edges(reduce_node)[0].src
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    schedule = reduce_node.schedule
    implementation = reduce_node.implementation
    if implementation and 'warp' in implementation:
        raise NotImplementedError(
            "WIP: Warp Reductions are not Implemented yet.")

    # remove the reduce identity
    # we will reassign it later after expanding
    reduce_node.identity = None
    # expand the reduce node
    in_edge = graph.in_edges(reduce_node)[0]
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node
        if isinstance(node, nodes.Tasklet):
            tasklet_node = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which
    # we perform the reduction:
    # do BFS, best complexity O(V+E)
    queue = [nsdfg]
    array_closest_ancestor = None
    while len(queue) > 0:
        current = queue.pop(0)
        if isinstance(current, nodes.AccessNode):
            if current.data == out_storage_node.data:
                # it suffices to find the first node
                # no matter what access (ReadWrite or Read)
                array_closest_ancestor = current
                break
        queue.extend([in_edge.src for in_edge in graph.in_edges(current)])

    # if ancestor doesn't exist:
    #   if non-transient: create data node accessing it
    #   if transient: ancestor_node = none, set_zero on outer node
    shortcut = False
    if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
            or identity is not None:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map")
        # we are lucky: no update tasklet is needed, just clear the WCR
        shortcut = True
        nstate.out_edges(outer_exit)[0].data.wcr = None
    else:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map "
                  "and introducing update Tasklet, "
                  "connecting with ancestor.")
        if not array_closest_ancestor:
            array_closest_ancestor = nodes.AccessNode(
                out_storage_node.data, access=dtypes.AccessType.ReadOnly)
            graph.add_node(array_closest_ancestor)
            # array_closest_ancestor now points to the node we want to connect
            # to the map entry

        # always have to create out transient in this case
        self.create_out_transient = True

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register
        # NOTE: the original code repeated this block twice; clearing WCR
        # and setting volume are idempotent, so once suffices. It is also
        # only valid here, where out_transient_node_inner exists.
        if shortcut:
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to shared memory / default
        nsdfg.sdfg.data(in_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

    # first, inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    if not shortcut:
        # introduce an update tasklet that combines the partial result
        # with the previous value of the output array
        reduction_type = detect_reduction_type(wcr)
        try:
            code = ReduceExpansion.reduction_type_update[reduction_type]
        except KeyError:
            raise NotImplementedError(
                "Not yet implemented for custom reduction")
        new_tasklet = graph.add_tasklet(
            name="reduction_transient_update",
            inputs={"reduction_in", "array_in"},
            outputs={"out"},
            code=code)

        edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
            if self.create_out_transient \
            else graph.out_edges(inner_exit)[0]

        new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                        volume=1,
                                        subset=edge_to_remove.data.subset)
        new_memlet_array_outer = Memlet(
            data=array_closest_ancestor.data,
            volume=graph.in_edges(outer_entry)[0].data.volume,
            subset=subsets.Range.from_array(
                sdfg.data(out_storage_node.data)))
        new_memlet_reduction = Memlet(
            data=graph.out_edges(inner_exit)[0].data.data,
            volume=1,
            subset=graph.out_edges(inner_exit)[0].data.subset)
        new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                      volume=1,
                                      subset=edge_to_remove.data.subset)
        new_memlet_out_outer = dcpy(new_memlet_array_outer)

        # remove old edges
        outer_edge_to_remove = None
        for edge in graph.out_edges(outer_exit):
            if edge.src == edge_to_remove.dst:
                outer_edge_to_remove = edge

        graph.remove_edge_and_connectors(edge_to_remove)
        graph.remove_edge_and_connectors(outer_edge_to_remove)

        # rewire the graph through the new update tasklet
        graph.add_edge(out_transient_node_inner if self.create_out_transient \
                       else inner_exit,
                       None, new_tasklet, "reduction_in",
                       new_memlet_reduction)
        graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                       new_memlet_array_inner)
        graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                       new_memlet_array_outer)
        graph.add_edge(new_tasklet, "out", outer_exit, None,
                       new_memlet_out_inner)
        graph.add_edge(outer_exit, None, out_storage_node, None,
                       new_memlet_out_outer)

        # fill map scope connectors
        graph.fill_scope_connectors()
        graph._clear_scopedict_cache()
        # wcr is already removed

    # FORNOW: choose default schedule and implementation
    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation

    new_axes = dcpy(reduce_node.axes)
    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)
    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._new_reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # FIX: original concatenated an f-string with plain
                # literals, so "{reduce_node_new}" was never interpolated
                raise ValueError("Cannot infer reduction identity. "
                                 "Please specify the identity of node "
                                 f"{reduce_node_new}")
    return
def tile_wcrs(graph_or_subgraph: GraphViewType,
              validate_all: bool,
              prefer_partial_parallelism: bool = None) -> None:
    """
    Tiles parallel write-conflict resolution maps in an SDFG, state,
    or subgraphs thereof. Reduces the number of atomic operations by
    tiling and introducing transient arrays to accumulate atomics on.

    :param graph_or_subgraph: The SDFG/state/subgraph to optimize within.
    :param validate_all: If True, runs SDFG validation after every tiling.
    :param prefer_partial_parallelism: If set, prefers extracting
        non-conflicted map dimensions over tiling WCR map (may not perform
        well if parallel dimensions are small).
    :raises TypeError: If the input is not an SDFG, state, or subgraph.
    :note: This function operates in-place.
    """
    # Avoid import loops
    from dace.codegen.targets import cpp
    from dace.frontend import operations
    from dace.transformation import dataflow, helpers as xfh

    # Determine on which nodes to run the operation. An SDFG is handled by
    # recursing into each of its states.
    graph = graph_or_subgraph
    if isinstance(graph_or_subgraph, gr.SubgraphView):
        graph = graph_or_subgraph.graph
    if isinstance(graph, SDFG):
        for state in graph_or_subgraph.nodes():
            tile_wcrs(state, validate_all)
        return
    if not isinstance(graph, SDFGState):
        raise TypeError(
            'Graph must be a state, an SDFG, or a subgraph of either')
    sdfg = graph.parent

    # Collect (WCR edge, conflicting map entry) pairs worth transforming
    edges_to_consider: Set[Tuple[gr.MultiConnectorEdge[Memlet],
                                 nodes.MapEntry]] = set()
    for edge in graph_or_subgraph.edges():
        if edge.data.wcr is not None:
            if (isinstance(edge.src, (nodes.MapExit, nodes.NestedSDFG))
                    or isinstance(edge.dst, nodes.MapEntry)):
                # Do not consider intermediate edges
                continue
            reason = cpp.is_write_conflicted_with_reason(graph, edge)
            if reason is None or not isinstance(reason, nodes.MapEntry):
                # Do not consider edges that will not generate atomics or
                # atomics we cannot transform
                continue
            if reason not in graph_or_subgraph.nodes():
                # Skip if conflict exists outside of nested SDFG
                continue

            # Check if identity value can be inferred
            redtype = operations.detect_reduction_type(edge.data.wcr)
            dtype = sdfg.arrays[edge.data.data].dtype
            identity = dtypes.reduction_identity(dtype, redtype)
            if identity is None:
                # Cannot infer identity value
                continue

            edges_to_consider.add((edge, reason))

    tile_size = config.Config.get('optimizer', 'autotile_size')
    debugprint = config.Config.get_bool('debugprint')
    if prefer_partial_parallelism is None:
        prefer_partial_parallelism = config.Config.get_bool(
            'optimizer', 'autotile_partial_parallelism')

    maps_to_consider: Set[nodes.MapEntry] = set(
        me for _, me in edges_to_consider)

    transformed: Set[nodes.MapEntry] = set()

    # Heuristic: If the map is only partially conflicted, extract
    # parallel dimensions instead of tiling
    if prefer_partial_parallelism:
        for mapentry in maps_to_consider:
            # Check the write-conflicts of all WCR edges in map
            conflicts: Set[str] = set()
            for edge, me in edges_to_consider:
                if me is not mapentry:
                    continue
                conflicts |= set(
                    cpp.write_conflicted_map_params(mapentry, edge))

            nonconflicted_dims = set(mapentry.params) - conflicts
            if nonconflicted_dims:
                dims = [
                    i for i, p in enumerate(mapentry.params)
                    if p in nonconflicted_dims
                ]
                # "(x < y) == True" is deliberate: the sizes are symbolic
                # (SymPy) and we only act when the comparison is
                # definitely True
                if ((dt._prod(s for i, s in enumerate(mapentry.range.size())
                              if i in dims) < tile_size) == True):
                    # Map has a small range, extracting parallelism may not be
                    # beneficial
                    continue
                xfh.extract_map_dims(sdfg, mapentry, dims)
                transformed.add(mapentry)

    # Tile and accumulate other not-transformed maps
    for edge, mapentry in edges_to_consider:
        if mapentry in transformed:
            continue
        transformed.add(mapentry)

        # NOTE: The test "(x < y) == True" below is crafted for SymPy
        # to be "definitely True"
        if all((s < tile_size) == True for s in mapentry.map.range.size()):
            # If smaller than tile size, don't transform and instead
            # make map sequential
            if debugprint:
                print(f'Making map "{mapentry}" sequential due to being '
                      'smaller than tile size')
            mapentry.map.schedule = dtypes.ScheduleType.Sequential
            continue

        # MapTiling -> AccumulateTransient / AccumulateStream
        outer_mapentry = dataflow.MapTiling.apply_to(
            sdfg, dict(tile_sizes=(tile_size, )), map_entry=mapentry)

        # Transform all outgoing WCR and stream edges
        mapexit = graph.exit_node(mapentry)
        outer_mapexit = graph.exit_node(outer_mapentry)

        # Tuple of (transformation type, options, pattern); stays None if
        # there is more than one candidate edge (transient only pays off
        # for a single WCR/stream output)
        to_apply: Tuple[Union[dataflow.StreamTransient,
                              dataflow.AccumulateTransient],
                        Dict[str, Any], Dict[str, nodes.Node]] = None
        for e in graph.out_edges(mapexit):
            if isinstance(sdfg.arrays[e.data.data], dt.Stream):
                mpath = graph.memlet_path(e)
                tasklet = mpath[0].src
                if not isinstance(tasklet, nodes.Tasklet) or len(mpath) != 3:
                    # TODO(later): Implement StreamTransient independently of tasklet
                    continue

                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.StreamTransient, {},
                            dict(tasklet=tasklet,
                                 map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
            else:
                if (e.data.is_empty() or e.data.wcr is None
                        or e.data.wcr_nonatomic
                        or (e.data.dst_subset is not None
                            and e.data.dst_subset.num_elements() > 0
                            and e.data.dynamic)):
                    continue

                dtype = sdfg.arrays[e.data.data].dtype
                redtype = operations.detect_reduction_type(e.data.wcr)
                identity = dtypes.reduction_identity(dtype, redtype)
                if identity is None:
                    # Cannot infer identity value
                    continue
                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.AccumulateTransient,
                            dict(identity=identity, array=e.data.data),
                            dict(map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
        if to_apply is not None:
            xform, opts, pattern = to_apply
            xform.apply_to(sdfg, options=opts, **pattern)

    if debugprint and len(transformed) > 0:
        print(f'Optimized {len(transformed)} write-conflicted maps')
def vector_reduction_expr(self, edge, dtype, rhs):
    """Emit C++ code that reduces an SVE vector expression into the
    destination of a WCR (write-conflict resolution) memlet.

    :param edge: The dataflow edge carrying the WCR memlet.
    :param dtype: The scalar data type of the reduced values.
    :param rhs: The right-hand-side expression to reduce (dispatched as
        an SVE vector).
    :raises NotSupportedError: If the reduction type has no SVE
        equivalent.
    """
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    nc = not is_write_conflicted(self.dfg, edge)
    if not nc or not isinstance(edge.src.out_connectors[edge.src_conn],
                                (dtypes.pointer, dtypes.vector)):
        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality

        dst_node = self.dfg.memlet_path(edge)[-1].dst
        if (isinstance(dst_node, nodes.AccessNode) and dst_node.desc(
                self.sdfg).storage == dtypes.StorageType.SVE_Register):
            # Destination lives in an SVE register; nothing to emit here
            return

        # '@' is used as a placeholder in the WCR expression; the SVE
        # reduction call is spliced in at its position below
        wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg,
                                                      edge.data,
                                                      not nc,
                                                      None,
                                                      '@',
                                                      dtype=dtype)
        self.fill(wcr[:wcr.find('@')])
        self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type])
        self.write('(')
        self.write(self.pred_name)
        self.write(', ')
        self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
        self.write(')')
        self.write(wcr[wcr.find('@') + 1:])
        self.write(';')
    else:
        ######################
        # Horizontal non-atomic reduction
        stride = edge.data.get_stride(self.sdfg, self.map)

        # long long fix: cast the pointer so the SVE intrinsics see the
        # fixed-width integer types they expect
        ptr_cast = ''
        src_type = edge.src.out_connectors[edge.src_conn]
        if src_type.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif src_type.type == np.uint64:
            ptr_cast = '(uint64_t*) '

        store_args = '{}, {}'.format(
            self.pred_name,
            ptr_cast +
            cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
        )

        # Use the predicated '_x' variant of the SVE reduction op
        red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'
        if stride == 1:
            # Contiguous: plain load/store
            self.write(
                f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), '
            )
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write('));')
        else:
            # Strided: gather/scatter with an index vector
            store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
            self.write(
                f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), '
            )
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write('));')
def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView,
               state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
               edge: graph.MultiConnectorEdge,
               function_stream: CodeIOStream,
               callsite_stream: CodeIOStream):
    """Emit C++ code that writes an SVE register (or scalar) back to its
    destination, handling both plain stores and WCR reductions.

    :param sdfg: The SDFG being generated.
    :param dfg: The state subgraph containing the edge.
    :param state_id: Index of the state within the SDFG.
    :param src_node: The node producing the value (holds the out connector).
    :param dst_node: The node receiving the value.
    :param edge: The edge whose memlet describes the write.
    :param function_stream: Stream for function-level code (unused here).
    :param callsite_stream: Stream the store/WCR code is written to.
    :raises NotImplementedError: Outside an SVE scope, for unsupported
        data types, or for non-vector writeback.
    :raises NotSupportedError: For reductions without an SVE equivalent or
        WCR memlets that use the innermost SVE loop parameter.
    """
    scope = util.get_sve_scope(sdfg, dfg, src_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    out_conn = src_node.out_connectors[edge.src_conn]
    if out_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(
            f'Data type {out_conn.type} not supported')

    if edge.data.wcr is None:
        # No WCR required

        if isinstance(dst_node, dace.nodes.Tasklet):
            # Writeback into a tasklet is just writing into the shared register
            callsite_stream.write(f'{edge.data.data} = {edge.src_conn};')
            return

        if isinstance(out_conn, dtypes.vector):
            # If no WCR, we can directly store the vector (SVE register) in memory
            # Determine the stride of the store and use a scatter load if applicable

            stride = self.get_load_stride(sdfg, dfg, src_node, edge.data)

            # Cast to fixed-width pointer types expected by SVE intrinsics
            ptr_cast = ''
            if out_conn.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif out_conn.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            store_args = '{}, {}'.format(
                util.get_loop_predicate(sdfg, dfg, src_node),
                ptr_cast +
                cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer),
            )

            if stride == 1:
                # Contiguous destination: plain predicated store
                callsite_stream.write(
                    f'svst1({store_args}, {edge.src_conn});')
            else:
                # Strided destination: scatter store with an index vector
                callsite_stream.write(
                    f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(out_conn).bytes * 8}(0, {sym2cpp(stride)}), {edge.src_conn});'
                )
        else:
            raise NotImplementedError('Writeback into non-vector')
    else:
        # TODO: Check what are we WCR'ing in?

        # Since we have WCR, we must determine a suitable SVE reduce instruction
        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        # If the memlet contains the innermost SVE param, we have a problem, because
        # SVE doesn't support WCR stores. This would require unrolling the loop.
        if scope.params[-1] in edge.data.free_symbols:
            raise util.NotSupportedError(
                'SVE loop param used in WCR memlet')

        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality

        sve_reduction = '{}({}, {})'.format(
            util.REDUCTION_TYPE_TO_SVE[reduction_type],
            util.get_loop_predicate(sdfg, dfg, src_node), edge.src_conn)

        ptr_cast = ''
        if out_conn.type == np.int64:
            ptr_cast = '(long long*) '
        elif out_conn.type == np.uint64:
            ptr_cast = '(unsigned long long*) '

        wcr_expr = self.cpu_codegen.write_and_resolve_expr(
            sdfg,
            edge.data,
            edge.data.wcr_nonatomic,
            None,
            ptr_cast + sve_reduction,
            dtype=out_conn.vtype)

        callsite_stream.write(wcr_expr + ';')
def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry:
    """Tile the matched map with a warp-sized inner map, stride internal
    maps across the warp, and convert single-element WCR outputs into
    warp-collaborative reductions.

    :param graph: The state containing the map to tile.
    :param sdfg: The SDFG being transformed.
    :return: The newly created warp-tile map entry.
    :raises NotImplementedError: For custom WCR reductions.
    """
    me = self.mapentry

    # Add new map within map
    mx = graph.exit_node(me)
    new_me, new_mx = graph.add_map('warp_tile',
                                   dict(__tid=f'0:{self.warp_size}'),
                                   dtypes.ScheduleType.GPU_ThreadBlock)
    __tid = symbolic.pystr_to_symbolic('__tid')
    for e in graph.out_edges(me):
        xfh.reconnect_edge_through_map(graph, e, new_me, True)
    for e in graph.in_edges(mx):
        xfh.reconnect_edge_through_map(graph, e, new_mx, False)

    # Stride and offset all internal maps
    maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True)
    for nstate, nmap in maps_to_stride:
        nsdfg = nstate.parent
        nsdfg_node = nsdfg.parent_nsdfg_node

        # Map cannot be partitioned across a warp
        # ("(x < y) == True" forces a definite SymPy comparison)
        if (nmap.range.size()[-1] < self.warp_size) == True:
            continue

        # Make __tid visible inside the nested SDFG, if any
        if nsdfg is not sdfg and nsdfg_node is not None:
            nsdfg_node.symbol_mapping['__tid'] = __tid
            if '__tid' not in nsdfg.symbols:
                nsdfg.add_symbol('__tid', dtypes.int32)
        # Stride the last dimension by the warp size, offset by __tid
        nmap.range[-1] = (nmap.range[-1][0], nmap.range[-1][1] - __tid,
                          nmap.range[-1][2] * self.warp_size)
        subgraph = nstate.scope_subgraph(nmap)
        subgraph.replace(nmap.params[-1], f'{nmap.params[-1]} + __tid')
        inner_map_exit = nstate.exit_node(nmap)

        # If requested, replicate maps with multiple dependent maps
        if self.replicate_maps:
            destinations = [
                nstate.memlet_path(edge)[-1].dst
                for edge in nstate.out_edges(inner_map_exit)
            ]

            for dst in destinations:
                # Transformation will not replicate map with more than one
                # output
                if len(destinations) != 1:
                    break
                if not isinstance(dst, nodes.AccessNode):
                    continue  # Not leading to access node
                if not xfh.contained_in(nstate, dst, new_me):
                    continue  # Memlet path goes out of map
                if not nsdfg.arrays[dst.data].transient:
                    continue  # Cannot modify non-transients
                for edge in nstate.out_edges(dst)[1:]:
                    rep_subgraph = xfh.replicate_scope(
                        nsdfg, nstate, subgraph)
                    rep_edge = nstate.out_edges(
                        rep_subgraph.sink_nodes()[0])[0]
                    # Add copy of data
                    newdesc = copy.deepcopy(sdfg.arrays[dst.data])
                    newname = nsdfg.add_datadesc(dst.data,
                                                 newdesc,
                                                 find_new_name=True)
                    newaccess = nstate.add_access(newname)
                    # Redirect edges
                    xfh.redirect_edge(nstate,
                                      rep_edge,
                                      new_dst=newaccess,
                                      new_data=newname)
                    xfh.redirect_edge(nstate,
                                      edge,
                                      new_src=newaccess,
                                      new_data=newname)

        # If has WCR, add warp-collaborative reduction on outputs
        for out_edge in nstate.out_edges(inner_map_exit):
            dst = nstate.memlet_path(out_edge)[-1].dst
            if not xfh.contained_in(nstate, dst, new_me):
                # Skip edges going out of map
                continue
            # NOTE(review): the test skips *GPU_Global* storage, but the
            # original comment said "Skip shared memory" — confirm intent
            if dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global:
                continue
            if out_edge.data.wcr is not None:
                ctype = nsdfg.arrays[out_edge.data.data].dtype.ctype
                redtype = detect_reduction_type(out_edge.data.wcr)
                if redtype == dtypes.ReductionType.Custom:
                    raise NotImplementedError
                credtype = ('dace::ReductionType::' +
                            str(redtype)[str(redtype).find('.') + 1:])

                # One element: tasklet
                if out_edge.data.subset.num_elements() == 1:
                    # Add local access between thread-local and warp reduction
                    name = nsdfg._find_new_name(out_edge.data.data)
                    nsdfg.add_scalar(
                        name,
                        nsdfg.arrays[out_edge.data.data].dtype,
                        transient=True)

                    # Initialize thread-local to global value
                    read = nstate.add_read(out_edge.data.data)
                    write = nstate.add_write(name)
                    edge = nstate.add_nedge(read, write,
                                            copy.deepcopy(out_edge.data))
                    edge.data.wcr = None
                    xfh.state_fission(nsdfg,
                                      SubgraphView(nstate, [read, write]))

                    # Reroute the WCR output through the new thread-local
                    # scalar before the warp reduction
                    newnode = nstate.add_access(name)
                    nstate.remove_edge(out_edge)
                    edge = nstate.add_edge(out_edge.src, out_edge.src_conn,
                                           newnode, None,
                                           copy.deepcopy(out_edge.data))
                    for e in nstate.memlet_path(edge):
                        e.data.data = name
                        e.data.subset = subsets.Range([(0, 0, 1)])

                    wrt = nstate.add_tasklet(
                        'warpreduce', {'__a'}, {'__out'},
                        f'__out = dace::warpReduce<{credtype}, {ctype}>::reduce(__a);',
                        dtypes.Language.CPP)
                    nstate.add_edge(newnode, None, wrt, '__a', Memlet(name))
                    out_edge.data.wcr = None
                    nstate.add_edge(wrt, '__out', out_edge.dst, None,
                                    out_edge.data)
                else:
                    # More than one element: mapped tasklet
                    # Could be a parallel summation
                    # TODO(later): Check if reduction
                    continue
        # End of WCR to warp reduction

    # Make nested SDFG out of new scope
    xfh.nest_state_subgraph(sdfg, graph,
                            graph.scope_subgraph(new_me, False, False))

    return new_me
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        :param sdfg: The SDFG containing the reduce node.
        :param graph: The state in which the reduce node resides.
        :param reduce_node: The library reduce node to expand.
        :raises ValueError: If the reduction identity cannot be inferred.
    """

    # get out storage node, might be hidden behind view node
    out_data = graph.out_edges(reduce_node)[0].data
    out_storage_node = reduce_node
    while not isinstance(out_storage_node, nodes.AccessNode):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst

    if isinstance(sdfg.data(out_storage_node.data), View):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

    # get other useful quantities from the original reduce node
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation

    # remove the reduce identity, will get reassigned after expansion
    reduce_node.identity = None
    # expand the reduce node
    in_edge = graph.in_edges(reduce_node)[0]
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node
        if isinstance(node, nodes.Tasklet):
            tasklet_node = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which
    # the reduction is performed: BFS
    if self.create_out_transient:
        queue = [nsdfg]
        enqueued = set()
        array_closest_ancestor = None

        while len(queue) > 0:
            current = queue.pop()
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            for in_edge in graph.in_edges(current):
                if in_edge.src not in enqueued:
                    queue.append(in_edge.src)
                    enqueued.add(in_edge.src)

        if self.debug and array_closest_ancestor:
            print(
                f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
            )
        elif self.debug:
            print("ReduceExpansion::No closest ancestor found")

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

        # remove WCRs from all edges where possible if there is no
        # prior occurrence
        if array_closest_ancestor is None:
            nstate.out_edges(outer_exit)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
    else:
        # remove WCR from outer exit
        nstate.out_edges(outer_exit)[0].data.wcr = None

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a:
            nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b:
            nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(in_transient_node_inner.data
                        ).storage = dtypes.StorageType.Register

    # inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation

    new_axes = dcpy(reduce_node.axes)
    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace inner map with new reduction node
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)
    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        if self.debug:
            print(
                "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
            )
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # FIX: original concatenated an f-string with plain
                # literals, so "{reduce_node_new}" was never interpolated
                raise ValueError("Cannot infer reduction identity. "
                                 "Please specify the identity of node "
                                 f"{reduce_node_new}")
    return
def apply(self, sdfg: SDFG) -> None:
    """Distribute the matched map across multiple GPUs.

    Strip-mines the matched map into an outer map with one tile per GPU
    (scheduled as ``GPU_Multidevice``) and an inner ``GPU_Device`` map,
    then localizes the data: input transients are created in front of the
    inner map, output transients behind it, and write-conflicting outputs
    are accumulated through an ``AccumulateTransient`` nested SDFG. All
    created transients are pinned to the symbolic GPU id of the outer map.

    :param sdfg: The SDFG containing the matched map (state/entry are
                 taken from ``self.state_id`` / ``self.subgraph``).
    :raises ValueError: If more GPUs are requested than the dace config
                        allows (``compiler.cuda.max_number_gpus``).
    """
    graph: SDFGState = sdfg.nodes()[self.state_id]
    inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
        GPUMultiTransformMap._map_entry]]

    # Default the GPU count to the configured maximum; reject requests
    # beyond it.
    number_of_gpus = self.number_of_gpus
    ngpus = Config.get("compiler", "cuda", "max_number_gpus")
    if number_of_gpus is None:  # was `== None`; identity check per PEP 8
        number_of_gpus = ngpus
    if number_of_gpus > ngpus:
        raise ValueError(
            'Requesting more gpus than specified in the dace config')

    # Avoiding import loops
    from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                              OutLocalStorage,
                                              AccumulateTransient)

    # The user has responsibility for the implementation of a Library node.
    scope_subgraph = graph.scope_subgraph(inner_map_entry)
    for node in scope_subgraph.nodes():
        if isinstance(node, nodes.LibraryNode):
            warnings.warn(
                'Node %s is a library node, make sure to manually set the '
                'implementation to a GPU compliant specialization.' % node)

    # Tile map into number_of_gpus tiles
    outer_map: nodes.Map = StripMining.apply_to(
        sdfg,
        dict(dim_idx=-1,
             new_dim_prefix=self.new_dim_prefix,
             tile_size=number_of_gpus,
             tiling_type=dtypes.TilingType.NumberOfTiles),
        _map_entry=inner_map_entry)

    outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
    inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
    outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

    # Change map schedules: outer map distributes over devices, inner map
    # runs on each device.
    inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
    outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

    # The outer-map parameter identifies the GPU a tile runs on.
    symbolic_gpu_id = outer_map.params[0]

    # Propagate the inner map's free symbols into any nested SDFG that was
    # not yet mapping them.
    for node in graph.successors(inner_map_entry):
        if isinstance(node, nodes.NestedSDFG):
            map_syms = inner_map_entry.range.free_symbols
            for sym in map_syms:
                symname = str(sym)
                if symname not in node.symbol_mapping:
                    node.symbol_mapping[symname] = sym
                    node.sdfg.symbols[symname] = graph.symbols_defined_at(
                        node)[symname]

    # Add transient Data leading to the inner map
    prefix = self.new_transient_prefix
    for node in graph.predecessors(outer_map_entry):
        # Only AccessNodes are relevant
        if (isinstance(node, nodes.AccessNode)
                and not (self.skip_scalar
                         and isinstance(node.desc(sdfg), Scalar))):
            # With peer-to-peer access, data already on the GPU needs no
            # local copy.
            if self.use_p2p and node.desc(
                    sdfg).storage is dtypes.StorageType.GPU_Global:
                continue
            in_data_node = InLocalStorage.apply_to(sdfg,
                                                   dict(array=node.data,
                                                        prefix=prefix),
                                                   verify=False,
                                                   save=False,
                                                   node_a=outer_map_entry,
                                                   node_b=inner_map_entry)
            in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
            in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    wcr_data: Dict[str, Any] = {}
    # Add transient Data leading to the outer map
    for edge in graph.in_edges(outer_map_exit):
        node = graph.memlet_path(edge)[-1].dst
        if isinstance(node, nodes.AccessNode):
            data_name = node.data
            # Transients with write-conflict resolution need to be
            # collected first as AccumulateTransient creates a nestedSDFG
            if edge.data.wcr is not None:
                dtype = sdfg.arrays[data_name].dtype
                redtype = operations.detect_reduction_type(edge.data.wcr)
                # Custom reduction can not have an accumulate transient,
                # as the accumulation from the transient to the outer
                # storage is not defined.
                if redtype == dtypes.ReductionType.Custom:
                    warnings.warn(
                        'Using custom reductions in a GPUMultitransformed '
                        'Map only works for a small data volume. For large '
                        'volume there is no guarantee.')
                    continue
                identity = dtypes.reduction_identity(dtype, redtype)
                wcr_data[data_name] = identity
            elif (not isinstance(node.desc(sdfg), Scalar)
                  or not self.skip_scalar):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue
                # Transients without write-conflict resolution; only create
                # the backing array if it does not exist yet.
                create_array = (prefix + '_' + data_name) not in sdfg.arrays
                out_data_node = OutLocalStorage.apply_to(
                    sdfg,
                    dict(array=data_name,
                         prefix=prefix,
                         create_array=create_array),
                    verify=False,
                    save=False,
                    node_a=inner_map_exit,
                    node_b=outer_map_exit)
                out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                out_data_node.desc(
                    sdfg).storage = dtypes.StorageType.GPU_Global

    # Add Transients for write-conflict resolution
    if wcr_data:
        nsdfg = AccumulateTransient.apply_to(
            sdfg,
            options=dict(array_identity_dict=wcr_data, prefix=prefix),
            map_exit=inner_map_exit,
            outer_map_exit=outer_map_exit)
        nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
        nsdfg.location['gpu'] = symbolic_gpu_id
        # Pin the accumulation transients (outside and inside the nested
        # SDFG) to the GPU that owns the tile.
        for transient_node in graph.successors(nsdfg):
            if isinstance(transient_node, nodes.AccessNode):
                transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                transient_node.desc(
                    sdfg).storage = dtypes.StorageType.GPU_Global
                nsdfg.sdfg.arrays[
                    transient_node.label].location['gpu'] = symbolic_gpu_id
                nsdfg.sdfg.arrays[
                    transient_node.
                    label].storage = dtypes.StorageType.GPU_Global
        infer_types.set_default_schedule_storage_types_and_location(
            nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
            symbolic_gpu_id)

    # Remove the parameter of the outer_map from the sdfg symbols,
    # as it got added as a symbol in StripMining.
    if outer_map.params[0] in sdfg.free_symbols:
        sdfg.remove_symbol(outer_map.params[0])