def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                   node: nodes.Node, nodedesc: data.Data,
                   global_stream: CodeIOStream,
                   declaration_stream: CodeIOStream,
                   allocation_stream: CodeIOStream) -> None:
    """Allocate (or merely register) the data container behind ``node``.

    Three cases are handled, in this order:

    1. ``nodedesc.storage`` is ``SVE_Register``: the name is recorded in
       ``defined_vars`` with its SVE type and nothing is written to any
       stream.
    2. The node lies in an SVE scope and ``nodedesc`` is a ``data.Scalar``
       with a ``dtypes.vector`` dtype: if ``defined_vars`` already has the
       name, it is (re-)added with the SVE type of the vector's base type
       and a declaration is written to ``declaration_stream``.
    3. Everything else is forwarded unchanged to the CPU code generator.

    :param sdfg: SDFG that owns the node.
    :param dfg: State containing the node.
    :param state_id: Identifier of the state (only forwarded).
    :param node: Node whose ``data`` name is being allocated.
    :param nodedesc: Data descriptor of the node.
    :param global_stream: Only forwarded to the CPU code generator.
    :param declaration_stream: Receives the SVE register declaration.
    :param allocation_stream: Only forwarded to the CPU code generator.
    """
    if nodedesc.storage == dtypes.StorageType.SVE_Register:
        register_type = util.TYPE_TO_SVE[nodedesc.dtype]
        self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                         register_type)
        return

    # Vector-typed scalar (Code->Code register) inside an SVE scope:
    # an SVE register is declared instead of a dace::vec<>.
    is_sve_vector_scalar = (
        util.get_sve_scope(sdfg, dfg, node) is not None
        and isinstance(nodedesc, data.Scalar)
        and isinstance(nodedesc.dtype, dtypes.vector))

    if not is_sve_vector_scalar:
        # Regular container: let the CPU backend take care of it
        self.cpu_codegen.allocate_array(sdfg, dfg, state_id, node, nodedesc,
                                        global_stream, declaration_stream,
                                        allocation_stream)
        return

    # NOTE(review): on this path the CPU allocator is never reached, even
    # when the name is not in defined_vars - confirm this is intended.
    if self.dispatcher.defined_vars.has(node.data):
        register_type = util.TYPE_TO_SVE[nodedesc.dtype.vtype]
        self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                         register_type)
        declaration_stream.write(f'{register_type} {node.data};')
def get_load_stride(self, sdfg: SDFG, state: SDFGState, node: nodes.Node,
                    memlet: dace.Memlet) -> symbolic.SymExpr:
    """Compute the element stride of a load/store inside an SVE scope.

    The stride is derived from the memlet subset, the strides of the
    accessed array and the step of the innermost SVE map dimension: it is
    the difference between the flattened offset after one loop step and
    the flattened offset before it.

    :param sdfg: SDFG holding the array referenced by ``memlet``.
    :param state: State containing ``node``.
    :param node: Node used to look up the surrounding SVE scope.
    :param memlet: Memlet describing the access.
    :return: The simplified symbolic stride.
    :raises NotImplementedError: If ``node`` is not inside an SVE scope.
    :raises util.NotSupportedError: If the innermost map parameter does
        not occur in the flattened subset offset.
    """
    sve_scope = util.get_sve_scope(sdfg, state, node)
    if sve_scope is None:
        raise NotImplementedError('Not in an SVE scope')

    # Innermost map dimension is the one driven by the SVE loop
    innermost_param = sve_scope.map.params[-1]
    innermost_range = sve_scope.map.range[-1]
    loop_symbol = dace.symbolic.symbol(innermost_param)

    accessed_array = sdfg.arrays[memlet.data]

    # Step 1: collapse the subset into a single linear offset
    origin = [0] * len(accessed_array.strides)
    base_offset = memlet.subset.at(origin, accessed_array.strides)

    if not base_offset.has(loop_symbol):
        raise util.NotSupportedError("SVE param does not occur in subset")

    # Step 2: the same offset one loop step later (index 2 is the step)
    next_offset = base_offset.subs(loop_symbol,
                                   loop_symbol + innermost_range[2])

    # Step 3: stride = distance between two consecutive accesses
    return (next_offset - base_offset).simplify()
def define_out_memlet(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                      state_id: int, src_node: nodes.Node,
                      dst_node: nodes.Node,
                      edge: graph.MultiConnectorEdge,
                      function_stream: CodeIOStream,
                      callsite_stream: CodeIOStream):
    """Emit an empty definition for the output connector feeding ``edge``.

    :param sdfg: SDFG containing the nodes.
    :param dfg: Subgraph view used to look up the SVE scope.
    :param state_id: Unused here.
    :param src_node: Node whose output connector is defined.
    :param dst_node: Unused here.
    :param edge: Edge leaving ``src_node``; its ``src_conn`` selects the
        connector.
    :param function_stream: Unused here.
    :param callsite_stream: Stream handed to ``create_empty_definition``.
    :raises NotImplementedError: If ``src_node`` is not in an SVE scope.
    """
    if util.get_sve_scope(sdfg, dfg, src_node) is None:
        raise NotImplementedError('Not in an SVE scope')

    connector = src_node.out_connectors[edge.src_conn]
    self.create_empty_definition(connector, edge, callsite_stream)
def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int,
                  node: nodes.Node, function_stream: CodeIOStream,
                  callsite_stream: CodeIOStream):
    """Generate the code for a single node; only tasklets produce output.

    ``add_header`` is always invoked on ``function_stream``. For tasklets
    a braced block is written to ``callsite_stream`` containing, in order:
    the reads of all incoming edges, temporary output registers for the
    outgoing edges, the unparsed tasklet body, and the writebacks for
    those outgoing edges whose register generation requested one.

    :param sdfg: SDFG containing the node.
    :param state: State containing the node.
    :param state_id: Identifier of the state (forwarded to the unparser).
    :param node: Node to generate; non-tasklets are skipped.
    :param function_stream: Stream receiving the header and forwarded to
        the tasklet unparser.
    :param callsite_stream: Stream receiving the generated block.
    """
    self.add_header(function_stream)

    if not isinstance(node, nodes.Tasklet):
        return

    # NOTE(review): sve_scope is dereferenced below without a None check.
    sve_scope = util.get_sve_scope(sdfg, state, node)

    # Start every tasklet with fresh stream/WCR variable mappings
    self.stream_associations = {}
    self.wcr_associations = {}

    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(node)

    # --- Inputs ---
    for in_edge in state.in_edges(node):
        self.generate_read(sdfg, state, sve_scope.map, in_edge,
                           callsite_stream)

    # --- Temporary output registers ---
    # Remember the edges that have to be written back after the tasklet
    pending_writebacks = [
        out_edge for out_edge in state.out_edges(node)
        if self.generate_out_register(sdfg, state, out_edge,
                                      callsite_stream)
    ]

    # --- Tasklet body ---
    self.unparse_tasklet(sdfg, state, state_id, node, function_stream,
                         callsite_stream)

    # --- Temporary registers -> memory ---
    for out_edge in pending_writebacks:
        self.generate_writeback(sdfg, state, sve_scope, out_edge,
                                callsite_stream)

    self.dispatcher.defined_vars.exit_scope(node)
    callsite_stream.write('}')
def vectorize(sdfg: dace.SDFG, par: str, ignored_conns: list = None):
    """Mark maps over ``par`` as SVE maps and vectorize their connectors.

    The function
    1. requires that all arrays of the SDFG share one element bit width,
    2. applies the strict transformations,
    3. sets ``util.SVE_LEN`` for a hard-coded 512-bit vector length,
    4. schedules every map whose innermost parameter equals ``par`` as
       ``SVE_Map`` and vectorizes its output connectors and the input
       connectors they lead to,
    5. vectorizes the source connector of every edge whose source lies in
       an SVE scope, and
    6. runs ``infer_connector_types`` on the SDFG.

    :param sdfg: SDFG to modify in place.
    :param par: Name of the map parameter to vectorize over.
    :param ignored_conns: Currently unused. Defaults to ``None`` instead of
        a shared mutable list.
    :return: The same (modified) ``sdfg``.
    :raises NotImplementedError: If the arrays have different element
        sizes.
    """
    input_bits = {sdfg.arrays[a].dtype.bytes * 8 for a in sdfg.arrays}
    if len(input_bits) > 1:
        raise NotImplementedError('Different data type sizes as inputs')
    input_bit_width = list(input_bits)[0]

    sdfg.apply_strict_transformations()

    # FIXME: Hardcoded for the demo machine (512 bits)
    # NOTE(review): true division, so SVE_LEN receives a float - confirm
    # whether an integer (512 // width) is expected.
    util.SVE_LEN.set(512 / input_bit_width)

    for node, dfg in sdfg.all_nodes_recursive():
        if not isinstance(node, dace.nodes.MapEntry):
            continue
        if node.params[-1] != par:
            continue

        node.schedule = dace.ScheduleType.SVE_Map
        for c in node.out_connectors:
            # Collect the edges before the connector itself is vectorized
            edges = get_connector_edges(dfg, node, c, False)
            vectorize_connector(sdfg, dfg, node, par, c, False)
            for e in edges:
                vectorize_connector(sdfg, dfg, e.dst, par, e.dst_conn, True)

    for edge, dfg in sdfg.all_edges_recursive():
        if not isinstance(dfg, dace.SDFGState):
            continue

        # Force every output connector within the graph to be a vector
        # (a restriction to WCR edges only is deliberately not applied).
        scope = util.get_sve_scope(sdfg, dfg, edge.src)
        if scope is not None:
            vectorize_connector(sdfg, dfg, edge.src, par, edge.src_conn,
                                False)

    # Then use a tweaked (but incorrect) version of infer_connector_types
    infer_connector_types(sdfg)

    return sdfg
def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: int,
               src_node: nodes.Node, dst_node: nodes.Node,
               edge: graph.MultiConnectorEdge,
               function_stream: CodeIOStream,
               callsite_stream: CodeIOStream):
    """Emit the code that writes an output connector of ``src_node`` back.

    Without WCR on the memlet:
      * a tasklet destination gets a plain assignment to the shared
        register;
      * a vector connector is stored with ``svst1`` (stride 1) or
        ``svst1_scatter_index`` (any other stride);
      * anything else is rejected.

    With WCR, a reduction call over the connector is built and handed to
    the CPU code generator's ``write_and_resolve_expr``.

    :param sdfg: SDFG containing the nodes.
    :param dfg: Subgraph view used for scope and predicate lookup.
    :param state_id: Unused here.
    :param src_node: Node producing the value.
    :param dst_node: Node receiving the value.
    :param edge: Edge carrying the memlet.
    :param function_stream: Unused here.
    :param callsite_stream: Stream receiving the generated statement.
    :raises NotImplementedError: If not in an SVE scope, if the connector
        type has no SVE equivalent, or for a non-WCR write of a
        non-vector connector to a non-tasklet.
    :raises util.NotSupportedError: If the reduction is unknown to SVE or
        the WCR memlet depends on the innermost SVE map parameter.
    """
    sve_scope = util.get_sve_scope(sdfg, dfg, src_node)
    if sve_scope is None:
        raise NotImplementedError('Not in an SVE scope')

    out_conn = src_node.out_connectors[edge.src_conn]
    if out_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(
            f'Data type {out_conn.type} not supported')

    memlet = edge.data

    if memlet.wcr is None:
        # ---- Plain write (no conflict resolution) ----
        if isinstance(dst_node, dace.nodes.Tasklet):
            # Tasklet -> tasklet: just assign to the shared register
            callsite_stream.write(f'{memlet.data} = {edge.src_conn};')
            return

        if not isinstance(out_conn, dtypes.vector):
            raise NotImplementedError('Writeback into non-vector')

        # The SVE register can be stored directly; the access stride
        # decides between a contiguous store and a scatter store.
        stride = self.get_load_stride(sdfg, dfg, src_node, memlet)

        if out_conn.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif out_conn.type == np.uint64:
            ptr_cast = '(uint64_t*) '
        else:
            ptr_cast = ''

        predicate = util.get_loop_predicate(sdfg, dfg, src_node)
        target = ptr_cast + cpp.cpp_ptr_expr(sdfg, memlet,
                                             DefinedType.Pointer)
        store_args = f'{predicate}, {target}'

        if stride == 1:
            statement = f'svst1({store_args}, {edge.src_conn});'
        else:
            index_bits = util.get_base_type(out_conn).bytes * 8
            statement = (
                f'svst1_scatter_index({store_args}, '
                f'svindex_s{index_bits}(0, {sym2cpp(stride)}), '
                f'{edge.src_conn});')
        callsite_stream.write(statement)
        return

    # ---- Write with conflict resolution ----
    # TODO: Check what are we WCR'ing in?
    # Only reductions that map onto an SVE reduce instruction are possible
    reduction_type = detect_reduction_type(memlet.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    # SVE has no WCR stores, so a memlet that depends on the innermost
    # SVE parameter cannot be handled (it would need loop unrolling).
    if sve_scope.params[-1] in memlet.free_symbols:
        raise util.NotSupportedError('SVE loop param used in WCR memlet')

    # Two steps: (1) reduce the SVE register to a scalar with an SVE
    # instruction, (2) let the CPU backend WCR that scalar into memory.
    reduce_instr = util.REDUCTION_TYPE_TO_SVE[reduction_type]
    predicate = util.get_loop_predicate(sdfg, dfg, src_node)
    sve_reduction = f'{reduce_instr}({predicate}, {edge.src_conn})'

    # NOTE(review): this pointer cast is prepended to the reduction
    # expression itself - confirm that this is the intended operand.
    if out_conn.type == np.int64:
        ptr_cast = '(long long*) '
    elif out_conn.type == np.uint64:
        ptr_cast = '(unsigned long long*) '
    else:
        ptr_cast = ''

    wcr_expr = self.cpu_codegen.write_and_resolve_expr(
        sdfg,
        memlet,
        memlet.wcr_nonatomic,
        None,
        ptr_cast + sve_reduction,
        dtype=out_conn.vtype)

    callsite_stream.write(wcr_expr + ';')
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet],
                function_stream: CodeIOStream,
                callsite_stream: CodeIOStream):
    """Emit the code that brings the data of ``edge`` into ``dst_node``.

    * Tasklet source: the shared register is copied into a new variable
      named after the destination connector.
    * Access node with a vector destination connector: the data is loaded
      with ``svld1`` (stride 1) or ``svld1_gather_index`` (other strides).
    * Access node with any other connector: delegated to the CPU code
      generator.
    * Stream sources are not implemented.

    :param sdfg: SDFG containing the nodes.
    :param dfg: State used for scope and predicate lookup.
    :param state_id: Only forwarded to the CPU code generator.
    :param src_node: Node the data is copied from.
    :param dst_node: Node the data is copied to.
    :param edge: Edge carrying the memlet.
    :param function_stream: Only forwarded to the CPU code generator.
    :param callsite_stream: Stream receiving the generated statement.
    :raises NotImplementedError: If not in an SVE scope, if the source is
        a stream, or if the connector type has no SVE equivalent.
    :raises util.NotSupportedError: If the source is neither a tasklet
        nor an access node.
    """
    # We should always be in an SVE scope
    scope = util.get_sve_scope(sdfg, dfg, dst_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    in_conn = dst_node.in_connectors[edge.dst_conn]

    if isinstance(src_node, dace.nodes.Tasklet):
        # Copy from tasklet is just copying the shared register
        # Use defined_vars to get the C++ type of the shared register
        callsite_stream.write(
            f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};'
        )
        return

    if not isinstance(src_node, dace.nodes.AccessNode):
        raise util.NotSupportedError(
            'Copy neither from Tasklet nor AccessNode')

    src_desc = src_node.desc(sdfg)
    if isinstance(src_desc, dace.data.Stream):
        # A copy from a stream would trigger a vector pop.
        # FIXME: Issue when we can pop different amounts of data!
        # If we limit to the smallest amount, certain data will be lost
        # (never processed).
        # Sketch of the disabled approach: pop up to one register's worth
        # of elements into a local buffer via pop_try, narrow the loop
        # predicate to the popped count (svand_z with svwhilelt), then
        # svld1 from the buffer into the destination register.
        raise NotImplementedError(
            'Vector pop from a stream is not implemented in SVE')

    if isinstance(in_conn, dtypes.vector):
        # Copy from vector, so we can use svld
        if in_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(
                f'Data type {in_conn.type} not supported')

        self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector,
                                         in_conn.ctype)

        # Determine the stride of the load and use a gather if applicable
        stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

        # First part of the declaration is `type name`
        load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type],
                                  edge.dst_conn)

        ptr_cast = ''
        if in_conn.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif in_conn.type == np.uint64:
            ptr_cast = '(uint64_t*) '

        # Regular load and gather share the first arguments
        load_args = '{}, {}'.format(
            util.get_loop_predicate(sdfg, dfg, dst_node),
            ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data,
                                        DefinedType.Pointer))

        if stride == 1:
            callsite_stream.write('{} = svld1({});'.format(
                load_lhs, load_args))
        else:
            callsite_stream.write(
                '{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                    load_lhs, load_args,
                    util.get_base_type(in_conn).bytes * 8,
                    sym2cpp(stride)))
    else:
        # Any other copy (e.g. pointer or scalar) is handled by the
        # default CPU codegen
        self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node,
                                     dst_node, edge, function_stream,
                                     callsite_stream)