def unparse_tasklet(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node,
                    function_stream: CodeIOStream, callsite_stream: CodeIOStream):
    """
    Emits the code of a tasklet by running the SVE unparser over each of its statements.

    The output is framed by marker comments and written to ``callsite_stream``.

    :param sdfg: SDFG whose state number ``state_id`` contains the tasklet.
    :param dfg: Subgraph view forwarded to the unparser and to the loop predicate lookup.
    :param state_id: Index of the state in ``sdfg.nodes()``.
    :param node: The tasklet; its statements are read from ``node.code.code``.
    :param function_stream: Not used by this method.
    :param callsite_stream: Stream that receives the generated code.
    """
    parent_state: SDFGState = sdfg.nodes()[state_id]

    callsite_stream.write('\n///////////////////')
    callsite_stream.write(f'// Tasklet code ({node.label})')

    # Collect every symbol (with its type) handed to the unparser for inference,
    # starting with the symbols defined at the tasklet
    defined_symbols = parent_state.symbols_defined_at(node)

    # SDFG constants: take their own dtype when present, otherwise derive one from the Python type
    constant_types = {}
    for const_name, const_value in sdfg.constants.items():
        if hasattr(const_value, 'dtype'):
            constant_types[const_name] = const_value.dtype
        else:
            constant_types[const_name] = dtypes.typeclass(type(const_value))
    defined_symbols.update(constant_types)

    # Connectors of the tasklet that appear on its edges, typed by their connector entry
    for src, src_conn, dst, dst_conn, _ in parent_state.all_edges(node):
        if src == node and src_conn in src.out_connectors:
            defined_symbols[src_conn] = src.out_connectors[src_conn]
        elif dst == node and dst_conn in dst.in_connectors:
            defined_symbols[dst_conn] = dst.in_connectors[dst_conn]

    # Nothing is collected into this mapping here; the unparser receives it empty
    memlets = {}

    statements = node.code.code
    for original_stmt in statements:
        # The unparser is given a deep copy of the statement and a fresh output buffer
        stmt_copy = copy.deepcopy(original_stmt)
        buffer = StringIO()
        dace.codegen.targets.sve.unparse.SVEUnparser(sdfg, dfg, self.current_map, self.cpu_codegen, stmt_copy,
                                                     buffer, statements, memlets,
                                                     util.get_loop_predicate(sdfg, dfg, node), self.counter_type,
                                                     defined_symbols, self.stream_associations,
                                                     self.wcr_associations)
        callsite_stream.write(buffer.getvalue(), sdfg, state_id, node)

    callsite_stream.write('///////////////////\n\n')
def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node,
               dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: CodeIOStream,
               callsite_stream: CodeIOStream):
    """
    Generates the code that writes the value of the out connector ``edge.src_conn`` of ``src_node``
    to the destination described by the memlet ``edge.data``.

    Three cases are handled:

    * memlet with WCR: the SVE register is reduced to a scalar, which is then resolved into
      memory through the CPU code generator;
    * no WCR, destination is a tasklet: plain assignment to the variable named by the memlet;
    * no WCR, vector connector: ``svst1`` for a stride of one, ``svst1_scatter_index`` otherwise.

    :param sdfg: The SDFG being generated.
    :param dfg: Subgraph view containing ``src_node``.
    :param state_id: Not used by this method.
    :param src_node: Node owning the out connector.
    :param dst_node: Destination node of the write.
    :param edge: Edge carrying the memlet.
    :param function_stream: Not used by this method.
    :param callsite_stream: Stream that receives the generated code.
    :raises NotImplementedError: If ``src_node`` is not in an SVE scope, the connector base type has
                                 no SVE counterpart, or a non-vector connector is written to a
                                 non-tasklet destination without WCR.
    :raises util.NotSupportedError: If the WCR reduction has no SVE counterpart or the memlet
                                    uses the innermost SVE loop parameter.
    """
    sve_scope = util.get_sve_scope(sdfg, dfg, src_node)
    if sve_scope is None:
        raise NotImplementedError('Not in an SVE scope')

    out_conn = src_node.out_connectors[edge.src_conn]
    if out_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(f'Data type {out_conn.type} not supported')

    memlet = edge.data

    if memlet.wcr is not None:
        # TODO: Check what are we WCR'ing in?

        # With WCR, a suitable SVE reduce instruction is required, so only
        # reductions with a known SVE counterpart are accepted
        reduction_type = detect_reduction_type(memlet.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        # If the memlet contains the innermost SVE param, we have a problem, because
        # SVE doesn't support WCR stores. This would require unrolling the loop.
        if sve_scope.params[-1] in memlet.free_symbols:
            raise util.NotSupportedError('SVE loop param used in WCR memlet')

        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality
        reduce_intrinsic = util.REDUCTION_TYPE_TO_SVE[reduction_type]
        predicate = util.get_loop_predicate(sdfg, dfg, src_node)
        sve_reduction = f'{reduce_intrinsic}({predicate}, {edge.src_conn})'

        if out_conn.type == np.int64:
            reduction_cast = '(long long*) '
        elif out_conn.type == np.uint64:
            reduction_cast = '(unsigned long long*) '
        else:
            reduction_cast = ''

        wcr_expr = self.cpu_codegen.write_and_resolve_expr(sdfg,
                                                           memlet,
                                                           memlet.wcr_nonatomic,
                                                           None,
                                                           reduction_cast + sve_reduction,
                                                           dtype=out_conn.vtype)
        callsite_stream.write(wcr_expr + ';')
        return

    # No WCR required from here on

    if isinstance(dst_node, dace.nodes.Tasklet):
        # Writeback into a tasklet is just writing into the shared register
        callsite_stream.write(f'{memlet.data} = {edge.src_conn};')
        return

    if not isinstance(out_conn, dtypes.vector):
        raise NotImplementedError('Writeback into non-vector')

    # Without WCR the vector (SVE register) is stored directly in memory;
    # the stride decides between a contiguous store and a scatter
    stride = self.get_load_stride(sdfg, dfg, src_node, memlet)

    if out_conn.type == np.int64:
        pointer_cast = '(int64_t*) '
    elif out_conn.type == np.uint64:
        pointer_cast = '(uint64_t*) '
    else:
        pointer_cast = ''

    predicate = util.get_loop_predicate(sdfg, dfg, src_node)
    target = pointer_cast + cpp.cpp_ptr_expr(sdfg, memlet, DefinedType.Pointer)
    store_args = f'{predicate}, {target}'

    if stride == 1:
        callsite_stream.write(f'svst1({store_args}, {edge.src_conn});')
    else:
        index_bits = util.get_base_type(out_conn).bytes * 8
        callsite_stream.write(
            f'svst1_scatter_index({store_args}, svindex_s{index_bits}(0, {sym2cpp(stride)}), {edge.src_conn});')
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                callsite_stream: CodeIOStream):
    """
    Generates the code that brings the data described by ``edge.data`` into the in connector
    ``edge.dst_conn`` of ``dst_node``.

    * Source is a tasklet: the connector is declared and initialized from the variable named by
      the memlet, using the C++ type recorded in ``self.dispatcher.defined_vars``.
    * Source is an access node and the connector is a vector: the connector is registered in
      ``defined_vars`` and loaded with ``svld1`` (stride of one) or ``svld1_gather_index``.
    * Source is an access node and the connector is not a vector: the copy is delegated to
      ``self.cpu_codegen.copy_memory``.

    :param sdfg: The SDFG being generated.
    :param dfg: State (or subgraph) containing the nodes.
    :param state_id: Forwarded to the CPU code generator when the copy is delegated.
    :param src_node: Source node of the copy.
    :param dst_node: Node owning the in connector.
    :param edge: Edge carrying the memlet.
    :param function_stream: Forwarded to the CPU code generator when the copy is delegated.
    :param callsite_stream: Stream that receives the generated code.
    :raises NotImplementedError: If ``dst_node`` is not in an SVE scope, the source is a stream,
                                 or the vector base type has no SVE counterpart.
    :raises util.NotSupportedError: If the source is neither a tasklet nor an access node.
    """
    # We should always be in an SVE scope
    scope = util.get_sve_scope(sdfg, dfg, dst_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    in_conn = dst_node.in_connectors[edge.dst_conn]

    if isinstance(src_node, dace.nodes.Tasklet):
        # Copy from tasklet is just copying the shared register
        # Use defined_vars to get the C++ type of the shared register
        callsite_stream.write(
            f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};')
        return

    if not isinstance(src_node, dace.nodes.AccessNode):
        raise util.NotSupportedError('Copy neither from Tasklet nor AccessNode')

    src_desc = src_node.desc(sdfg)
    if isinstance(src_desc, dace.data.Stream):
        # A copy from a stream would trigger a vector pop, which is not implemented.
        # FIXME: Issue when we can pop different amounts of data!
        # If we limit to the smallest amount, certain data will be lost (never processed).
        # The approach sketched so far: pop_try into a temporary buffer, narrow the loop
        # predicate to the number of popped elements (svwhilelt), then svld1 from the buffer.
        raise NotImplementedError('Vector pop from a Stream is not implemented')

    if isinstance(in_conn, dtypes.vector):
        # Copy from vector, so we can use svld
        if in_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(f'Data type {in_conn.type} not supported')

        self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector, in_conn.ctype)

        # Determine the stride of the load and use a gather if applicable
        stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

        # First part of the declaration is `type name`
        load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type], edge.dst_conn)

        # 64-bit integer pointers are cast explicitly to the fixed-width pointer types
        ptr_cast = ''
        if in_conn.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif in_conn.type == np.uint64:
            ptr_cast = '(uint64_t*) '

        # Regular load and gather share the first arguments
        load_args = '{}, {}'.format(util.get_loop_predicate(sdfg, dfg, dst_node),
                                    ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer))

        if stride == 1:
            callsite_stream.write('{} = svld1({});'.format(load_lhs, load_args))
        else:
            callsite_stream.write('{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                load_lhs, load_args,
                util.get_base_type(in_conn).bytes * 8, sym2cpp(stride)))
    else:
        # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
        self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream,
                                     callsite_stream)
def generate_writeback(self, sdfg: SDFG, state: SDFGState, map: nodes.Map,
                       edge: graph.MultiConnectorEdge[mm.Memlet], code: CodeIOStream):
    """
    Responsible for generating code for a writeback in a Tasklet, given the outgoing edge.
    This is mainly taking the temporary register and writing it back.

    The destination is the last node on the memlet path of ``edge``. Nothing is generated when
    the edge has no source connector, or when the destination is an access node whose
    descriptor is neither an Array nor a Scalar.

    :param sdfg: The SDFG being generated.
    :param state: State containing the edge.
    :param map: Map passed to ``Memlet.get_stride`` to determine the store stride.
    :param edge: Outgoing edge of the tasklet.
    :param code: Stream that receives the generated code.
    :raises util.NotSupportedError: For unsupported combinations of source and destination
                                    types, or a destination that is neither a Tasklet nor
                                    an AccessNode.
    """
    if edge.src_conn is None:
        return

    dst_node = state.memlet_path(edge)[-1].dst

    src_type = edge.src.out_connectors[edge.src_conn]
    src_name = edge.src_conn

    if isinstance(dst_node, nodes.Tasklet):
        ##################
        # Code->Code edges
        dst_type = edge.dst.in_connectors[edge.dst_conn]

        if (util.is_vector(src_type) and util.is_vector(dst_type)) or (util.is_scalar(src_type)
                                                                         and util.is_scalar(dst_type)):
            # Simply write back to shared register
            code.write(f'{edge.data.data} = {src_name};')
        elif util.is_scalar(src_type) and util.is_vector(dst_type):
            # Scalar broadcast to shared vector register
            code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({src_name});')
        else:
            raise util.NotSupportedError('Unsupported Code->Code edge')
    elif isinstance(dst_node, nodes.AccessNode):
        ##################
        # Write to AccessNode
        desc = dst_node.desc(sdfg)
        if isinstance(desc, data.Array):
            ##################
            # Write into Array
            if util.is_pointer(src_type):
                raise util.NotSupportedError('Unsupported writeback')
            elif util.is_vector(src_type):
                ##################
                # Scatter vector store into array
                stride = edge.data.get_stride(sdfg, map)

                # long long fix
                ptr_cast = ''
                if src_type.type == np.int64:
                    ptr_cast = '(int64_t*) '
                elif src_type.type == np.uint64:
                    ptr_cast = '(uint64_t*) '

                store_args = '{}, {}'.format(
                    util.get_loop_predicate(sdfg, state, edge.src),
                    ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame),
                )

                if stride == 1:
                    code.write(f'svst1({store_args}, {src_name});')
                else:
                    code.write(
                        f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)}), {src_name});'
                    )
            else:
                ##################
                # Scalar write into array
                code.write(f'{cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)} = {src_name};')
        elif isinstance(desc, data.Scalar):
            ##################
            # Write into Scalar
            if util.is_pointer(src_type):
                raise util.NotSupportedError('Unsupported writeback')
            elif util.is_vector(src_type):
                if util.is_vector(desc.dtype):
                    ##################
                    # Vector write into vector Scalar access node
                    code.write(f'{edge.data.data} = {src_name};')
                else:
                    raise util.NotSupportedError('Unsupported writeback')
            else:
                if util.is_vector(desc.dtype):
                    ##################
                    # Broadcast into scalar AccessNode
                    # The suffix table is indexed by the base type (`.type`), exactly like the
                    # Code->Code broadcast above; indexing it with the typeclass object itself
                    # is inconsistent with every other lookup in this code generator.
                    code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[src_type.type]}({src_name});')
                else:
                    ##################
                    # Scalar write into scalar AccessNode
                    code.write(f'{edge.data.data} = {src_name};')
    else:
        raise util.NotSupportedError('Only writeback to Tasklets and AccessNodes is supported')
def generate_read(self, sdfg: SDFG, state: SDFGState, map: nodes.Map, edge: graph.MultiConnectorEdge[mm.Memlet],
                  code: CodeIOStream):
    """
    Generates the declaration and initialization of a Tasklet in connector from its ingoing edge.

    The source is the first node on the memlet path of ``edge``. Nothing is generated when the
    edge has no destination connector, or when the source is an access node whose descriptor
    is neither an Array nor a Scalar.

    :param sdfg: The SDFG being generated.
    :param state: State containing the edge.
    :param map: Map passed to ``Memlet.get_stride`` to determine the load stride.
    :param edge: Ingoing edge of the tasklet.
    :param code: Stream that receives the generated code.
    :raises util.NotSupportedError: For unsupported combinations of source and destination
                                    types, or a source that is neither a Tasklet nor an
                                    AccessNode.
    """
    if edge.dst_conn is None:
        return

    src_node = state.memlet_path(edge)[0].src
    dst_type = edge.dst.in_connectors[edge.dst_conn]
    dst_name = edge.dst_conn

    def read_shared_variable(src_type, unsupported_message):
        # Initializes the connector from the shared variable named by the memlet
        if util.is_vector(src_type) and util.is_vector(dst_type):
            # Vector to vector: direct read
            code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_scalar(dst_type):
            # Scalar to scalar: direct read
            code.write(f'{dst_type} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_vector(dst_type):
            # Scalar to vector: broadcast
            code.write(
                f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});'
            )
        else:
            raise util.NotSupportedError(unsupported_message)

    ##################
    # Code->Code edges
    if isinstance(src_node, nodes.Tasklet):
        read_shared_variable(edge.src.out_connectors[edge.src_conn], 'Unsupported Code->Code edge')
        return

    if not isinstance(src_node, nodes.AccessNode):
        raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported')

    ##################
    # Read from AccessNode
    desc = src_node.desc(sdfg)

    if isinstance(desc, data.Array):
        if util.is_pointer(dst_type):
            # Pointer reference into the array
            code.write(f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};')
        elif util.is_vector(dst_type):
            # Vector load: contiguous for a stride of one, gather otherwise
            stride = edge.data.get_stride(sdfg, map)

            # The declaration starts with `type name`
            declaration = f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name}'

            # 64-bit integer pointers are cast to the fixed-width pointer types (long long issue)
            if dst_type.type == np.int64:
                pointer_cast = '(int64_t*) '
            elif dst_type.type == np.uint64:
                pointer_cast = '(uint64_t*) '
            else:
                pointer_cast = ''

            # Predicate and pointer are shared by the regular load and the gather
            predicate = util.get_loop_predicate(sdfg, state, edge.dst)
            pointer = pointer_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame)
            load_args = f'{predicate}, {pointer}'

            if stride == 1:
                code.write(f'{declaration} = svld1({load_args});')
            else:
                index_bits = util.get_base_type(dst_type).bytes * 8
                code.write(
                    f'{declaration} = svld1_gather_index({load_args}, svindex_s{index_bits}(0, {sym2cpp(stride)}));')
        else:
            # Scalar read from array
            code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};')
    elif isinstance(desc, data.Scalar):
        # Refer to shared variable
        read_shared_variable(desc.dtype, 'Unsupported Scalar->Code edge')