def assert_type_compatibility(defined_symbols: collections.OrderedDict, types: tuple):
    """
    Verify that SVE can operate on the given combination of types.

    The rules are sometimes more, sometimes less restrictive than the C
    standard.

    :param defined_symbols: Mapping of the defined symbols (not consulted by
                            the checks in this function).
    :param types: The types taking part in a single operation.
    :raises IncompatibleTypeError: If an entry is ``None``, if a base type has
                                   no entry in ``util.TYPE_TO_SVE``, if more
                                   than one distinct vector type occurs, or if
                                   pointers are mixed with vectors/scalars.
    """
    # Sanity check: a `None` entry indicates a failed inference
    if None in types:
        raise IncompatibleTypeError('`None` was given', types)

    # Bucket the distinct types into vectors, pointers and everything else
    # TODO: Better way to determine uniqueness
    vectors = {t for t in types if isinstance(t, dtypes.vector)}
    pointers = {t for t in types if isinstance(t, dtypes.pointer)}
    scalars = {t for t in types if not isinstance(t, (dtypes.vector, dtypes.pointer))}

    # Every base type must be representable in SVE
    if any(util.get_base_type(t).type not in util.TYPE_TO_SVE for t in types):
        raise IncompatibleTypeError('Not available in SVE', types)

    # Different vector types would require casting (not implemented yet)
    if len(vectors) > 1:
        raise IncompatibleTypeError('Vectors of different type', types)

    # Pointers must never be mixed with vectors/scalars (totally incompatible)
    if pointers and (vectors or scalars):
        raise IncompatibleTypeError('Vectors/scalars are incompatible with pointers', types)
def visit_BinOp(self, t):
    """
    Replace a multiply-add / multiply-subtract pattern by a fused call.

    Both operands are visited first. If only scalars are involved, or no
    fusable pattern is found, the node is handed to ``generic_visit``.
    Otherwise an ``ast.Call`` to the matching internal fused function
    (name prefix plus type suffix) is returned in place of ``t``.

    :param t: The ``ast.BinOp`` node to process.
    :return: The result of ``generic_visit(t)`` or the fused ``ast.Call``.
    """
    self.visit(t.left)
    self.visit(t.right)

    if util.only_scalars_involed(self.defined_symbols, t.left, t.right):
        return self.generic_visit(t)

    # Operator classes of the node itself and of its (binary) operands
    outer_cls = t.op.__class__
    lhs_cls = t.left.op.__class__ if isinstance(t.left, ast.BinOp) else None
    rhs_cls = t.right.op.__class__ if isinstance(t.right, ast.BinOp) else None

    # Fused operations, as (product is the left operand, product is the right operand):
    # MAD: multiply the first two floating-point inputs and add the result to the third input.
    # MLA: multiply the second and third floating-point inputs and add the result to the first input.
    # MSB: multiply the first two floating-point inputs and subtract the result from the third input.
    # MLS: multiply the second and third floating-point inputs and subtract the result from the first input.
    fused_names = {
        ast.Add: ('__svmad_', '__svmla_'),
        ast.Sub: ('__svmsb_', '__svmls_'),
    }

    intrinsic = None
    operands = []
    if outer_cls in fused_names:
        product_first, product_last = fused_names[outer_cls]
        # A product on the left takes precedence over one on the right
        if lhs_cls == ast.Mult:
            intrinsic = product_first
            operands = [t.left.left, t.left.right, t.right]
        elif rhs_cls == ast.Mult:
            intrinsic = product_last
            operands = [t.left, t.right.left, t.right.right]

    if not intrinsic:
        return self.generic_visit(t)

    # Fused ops need at least two of three arguments to be a vector
    inferred = util.infer_ast(self.defined_symbols, *operands)
    scalar_count = sum(util.is_scalar(tp) for tp in inferred)
    if scalar_count > 1:
        return self.generic_visit(t)

    # Add the type suffix for internal representation
    intrinsic += util.TYPE_TO_SVE_SUFFIX[util.get_base_type(dace.dtypes.result_type_of(*inferred))]
    fused_call = ast.Call(func=ast.Name(intrinsic, ast.Load()), args=operands, keywords=[])
    return ast.copy_location(fused_call, t)
def can_be_applied(cls, state: SDFGState, candidate, expr_index, sdfg: SDFG, strict=False) -> bool:
    """
    Check whether the matched map can be vectorized for SVE.

    The map is rejected if it is already scheduled as an SVE map, if it
    contains anything other than Tasklets and AccessNodes, if a connector
    type is missing from ``sve.util.TYPE_TO_SVE`` or the connectors differ in
    base type size, if a memlet stride depends on the innermost map
    parameter, if an unsupported write-conflict resolution is used, if a
    Tasklet reads from a stream, or if the vector inference fails.

    :param state: State that contains the candidate map.
    :param candidate: Match dictionary; ``candidate[cls.map_entry]`` is the
                      id of the map entry node within ``state``.
    :param expr_index: Not used by this check.
    :param sdfg: SDFG that contains ``state``.
    :param strict: Not used by this check.
    :return: True if every check passes, False otherwise.
    """
    map_entry = state.node(candidate[cls.map_entry])
    current_map = map_entry.map
    subgraph = state.scope_subgraph(map_entry)
    subgraph_contents = state.scope_subgraph(map_entry, include_entry=False, include_exit=False)

    # Prevent infinite repeats
    if current_map.schedule == dace.dtypes.ScheduleType.SVE_Map:
        return False

    # Infer all connector types for later checks (without modifying the graph)
    inferred = infer_types.infer_connector_types(sdfg, state, subgraph)

    ########################
    # Ensure only Tasklets and AccessNodes are within the map
    for node, _ in subgraph_contents.all_nodes_recursive():
        if not isinstance(node, (nodes.Tasklet, nodes.AccessNode)):
            return False

    ########################
    # Check for unsupported datatypes on the connectors (including on the Map itself)
    base_type_sizes = set()
    for node, _ in subgraph.all_nodes_recursive():
        # `inferred` is keyed by (node, connector name, is-input flag)
        for connectors, is_input in ((node.in_connectors, True), (node.out_connectors, False)):
            for conn in connectors:
                t = inferred[(node, conn, is_input)]
                base_type_sizes.add(util.get_base_type(t).bytes)
                if t.type not in sve.util.TYPE_TO_SVE:
                    return False

    # Multiple different bit widths occurring (messes up the predicates)
    if len(base_type_sizes) > 1:
        return False

    ########################
    # Check for unsupported memlets
    param_name = current_map.params[-1]
    param_sym = symbolic.symbol(param_name)
    for e, _ in subgraph.all_edges_recursive():
        # Check for unsupported strides
        # The only unsupported strides are the ones containing the innermost
        # loop param because they are not constant during a vector step
        if param_sym in e.data.get_stride(sdfg, map_entry.map).free_symbols:
            return False

        # Check for unsupported WCR
        if e.data.wcr is None:
            continue

        # Unsupported reduction type
        reduction_type = dace.frontend.operations.detect_reduction_type(e.data.wcr)
        if reduction_type not in sve.util.REDUCTION_TYPE_TO_SVE:
            return False

        # Param in memlet during WCR is not supported
        if param_name in e.data.subset.free_symbols and e.data.wcr_nonatomic:
            return False

        # vreduce is not supported
        # The memlet path is a list of edges, so the final destination node
        # (and the connector it is entered through) belong to the last edge
        last_edge = state.memlet_path(e)[-1]
        dst_node = last_edge.dst
        if isinstance(dst_node, nodes.Tasklet):
            if isinstance(dst_node.in_connectors.get(last_edge.dst_conn), dtypes.vector):
                return False
        elif isinstance(dst_node, nodes.AccessNode):
            desc = dst_node.desc(sdfg)
            if isinstance(desc, data.Scalar) and isinstance(desc.dtype, dtypes.vector):
                return False

    ########################
    # Check for invalid copies in the subgraph
    for node, _ in subgraph.all_nodes_recursive():
        if not isinstance(node, nodes.Tasklet):
            continue

        for e in state.in_edges(node):
            # Check for valid copies from other tasklets and/or streams
            if e.data.data is None:
                continue

            src_node = state.memlet_path(e)[0].src
            if not isinstance(src_node, (nodes.Tasklet, nodes.AccessNode)):
                # Make sure we only have Code->Code copies and from arrays
                return False

            if isinstance(src_node, nodes.AccessNode):
                src_desc = src_node.desc(sdfg)
                if isinstance(src_desc, dace.data.Stream):
                    # Stream pops are not implemented
                    return False

    # Run the vector inference algorithm to check if vectorization is feasible
    # (only the success of the run matters, its result is not needed here)
    try:
        vector_inference.infer_vectors(sdfg,
                                       state,
                                       map_entry,
                                       util.SVE_LEN,
                                       flags=vector_inference.VectorInferenceFlags.Allow_Stride,
                                       apply=False)
    except vector_inference.VectorInferenceException as ex:
        print(f'UserWarning: Vector inference failed! {ex}')
        return False

    return True
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                callsite_stream: CodeIOStream):
    """
    Emit the code that copies data along ``edge`` into a connector of ``dst_node``.

    A copy from a Tasklet re-declares the shared register under the connector
    name. A copy from an AccessNode into a vector connector is emitted as
    ``svld1`` (stride 1) or ``svld1_gather_index`` (any other stride). All
    other copies from an AccessNode are delegated to the CPU code generator.

    :param sdfg: SDFG that contains the copy.
    :param dfg: State that contains the copy.
    :param state_id: Id of the state, forwarded to the CPU code generator.
    :param src_node: Node the data is copied from.
    :param dst_node: Node the data is copied to.
    :param edge: Edge that describes the copy.
    :param function_stream: Stream forwarded to the CPU code generator.
    :param callsite_stream: Stream that the copy code is written to.
    :raises NotImplementedError: If ``dst_node`` is not inside an SVE scope,
                                 if the source is a stream, or if the vector
                                 element type is missing from
                                 ``util.TYPE_TO_SVE``.
    :raises util.NotSupportedError: If the source is neither a Tasklet nor an
                                    AccessNode.
    """
    # We should always be in an SVE scope
    scope = util.get_sve_scope(sdfg, dfg, dst_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    in_conn = dst_node.in_connectors[edge.dst_conn]

    if isinstance(src_node, dace.nodes.Tasklet):
        # Copy from tasklet is just copying the shared register
        # Use defined_vars to get the C++ type of the shared register
        callsite_stream.write(
            f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};')
        return

    if not isinstance(src_node, dace.nodes.AccessNode):
        raise util.NotSupportedError('Copy neither from Tasklet nor AccessNode')

    src_desc = src_node.desc(sdfg)
    if isinstance(src_desc, dace.data.Stream):
        # A copy from a stream would trigger a vector pop, which is not implemented.
        # FIXME: Issue when we can pop different amounts of data!
        # If we limit to the smallest amount, certain data will be lost (never processed)
        raise NotImplementedError('Stream pops are not implemented')

    if not isinstance(in_conn, dtypes.vector):
        # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
        self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream,
                                     callsite_stream)
        return

    # Copy into a vector, so we can use svld
    if in_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(f'Data type {in_conn.type} not supported')

    self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector, in_conn.ctype)

    # Determine the stride of the load and use a gather if applicable
    stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

    # First part of the declaration is `type name`
    load_lhs = f'{util.TYPE_TO_SVE[in_conn.type]} {edge.dst_conn}'

    # 64-bit integer pointers are cast explicitly to the fixed-width C types
    ptr_cast = ''
    if in_conn.type == np.int64:
        ptr_cast = '(int64_t*) '
    elif in_conn.type == np.uint64:
        ptr_cast = '(uint64_t*) '

    # Regular load and gather share the first arguments
    loop_pred = util.get_loop_predicate(sdfg, dfg, dst_node)
    src_ptr = ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer)
    load_args = f'{loop_pred}, {src_ptr}'

    if stride == 1:
        callsite_stream.write(f'{load_lhs} = svld1({load_args});')
    else:
        index_bits = util.get_base_type(in_conn).bytes * 8
        callsite_stream.write(
            f'{load_lhs} = svld1_gather_index({load_args}, svindex_s{index_bits}(0, {sym2cpp(stride)}));')
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """
    Emit the predicated SVE loop for a one-dimensional map scope.

    The generated code has the following shape (``i`` being the map
    parameter and ``N`` the bit width of the chosen base type)::

        {
            <dynamic map inputs>
            <counter type> i = <begin>;
            <counter type> __i_to = <end>;
            svbool_t __pg_i = svwhilele_bN(i, __i_to);
            while(svptest_any(svptrue_bN(), __pg_i)) {
                <scope contents>
                i += svcnt<b|h|w|d>() * <stride>;
                __pg_i = svwhilele_bN(i, __i_to);
            }
        }

    As side effects, ``self.current_map`` and ``self.counter_type`` are set.

    :param sdfg: SDFG that contains the scope.
    :param scope: Scope subgraph of the map; its first source node is taken
                  as the map entry.
    :param state_id: Id of the state that contains the scope.
    :param function_stream: Stream forwarded to the subgraph dispatcher.
    :param callsite_stream: Stream that the loop is written to.
    :raises util.NotSupportedError: If the map has more than one parameter.
    """
    entry_node = scope.source_nodes()[0]
    sve_map = entry_node.map
    self.current_map = sve_map

    if len(sve_map.params) > 1:
        raise util.NotSupportedError('SVE map must be one dimensional')

    # Distinct base types of all arrays in the SDFG; the first one decides the element size
    base_types = list({util.get_base_type(sdfg.arrays[name].dtype) for name in sdfg.arrays})

    # Edge case if no arrays are used
    elem_type = base_types[0] if base_types else dace.int64
    elem_bytes = elem_type.bytes

    # The 8-byte counter is a copy of int64 whose C type is spelled `int64_t`
    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    counter_by_size = {1: dace.int8, 2: dace.int16, 4: dace.int32, 8: long_type}
    self.counter_type = counter_by_size[elem_bytes]

    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(scope)

    # Define all dynamic input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node):
        if e.data.data == e.dst_conn:
            continue
        definition = self.cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn,
                                                        e.dst.in_connectors[e.dst_conn])
        callsite_stream.write(definition, sdfg, state_id, entry_node)

    param = sve_map.params[0]
    begin, end, stride = (sym2cpp(r) for r in sve_map.range[0])

    # Generate the SVE loop header

    # The name of our loop predicate is always __pg_{param}
    self.dispatcher.defined_vars.add('__pg_' + param, DefinedType.Scalar, 'svbool_t')

    # Declare our counting variable (e.g. i) and precompute the loop predicate for our range
    elem_bits = elem_bytes * 8
    end_param = f'__{param}_to'
    callsite_stream.write(f'{self.counter_type} {param} = {begin};')
    callsite_stream.write(f'{self.counter_type} {end_param} = {end};')
    callsite_stream.write(f'svbool_t __pg_{param} = svwhilele_b{elem_bits}({param}, {end_param});')

    # Test for the predicate
    callsite_stream.write(f'while(svptest_any(svptrue_b{elem_bits}(), __pg_{param})) {{')

    # Allocate scope related memory
    for node, _ in scope.all_nodes_recursive():
        if not isinstance(node, nodes.Tasklet):
            continue
        # Create empty shared registers for outputs into other tasklets
        for edge in state_dfg.out_edges(node):
            if isinstance(edge.dst, dace.nodes.Tasklet):
                self.generate_out_register(sdfg, state_dfg, edge, callsite_stream, True)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Increase the counting variable (according to the number of processed elements)
    size_letter = {1: 'b', 2: 'h', 4: 'w', 8: 'd'}[elem_bytes]
    callsite_stream.write(f'{param} += svcnt{size_letter}() * {stride};')

    # Then recompute the loop predicate
    callsite_stream.write(f'__pg_{param} = svwhilele_b{elem_bits}({param}, {end_param});')

    callsite_stream.write('}')

    self.dispatcher.defined_vars.exit_scope(scope)
    callsite_stream.write('}')
def generate_read(self, sdfg: SDFG, state: SDFGState, map: nodes.Map, edge: graph.MultiConnectorEdge[mm.Memlet],
                  code: CodeIOStream):
    """
    Generate the code for a read into a Tasklet, given the ingoing edge.

    The emitted statement declares a variable named after the destination
    connector. Depending on the source (first node of the memlet path) it is
    initialized from a shared register, a pointer expression, a vector load
    (``svld1`` or ``svld1_gather_index``) or a scalar array access. Edges
    without a destination connector emit nothing.

    :param sdfg: SDFG that contains the edge.
    :param state: State that contains the edge.
    :param map: Map used to determine the stride of a vector load.
    :param edge: Ingoing edge of the Tasklet.
    :param code: Stream that the read is written to.
    :raises util.NotSupportedError: If the source is neither a Tasklet nor an
                                    AccessNode, or if the combination of
                                    source and destination type is not
                                    supported.
    """
    if edge.dst_conn is None:
        return

    src_node = state.memlet_path(edge)[0].src
    dst_type = edge.dst.in_connectors[edge.dst_conn]
    dst_name = edge.dst_conn

    def read_shared_register(src_type, unsupported_msg):
        # Emits the read of a shared register of type `src_type` into the connector variable
        if util.is_vector(src_type) and util.is_vector(dst_type):
            # Directly read from shared vector register
            code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_scalar(dst_type):
            # Directly read from shared scalar register
            code.write(f'{dst_type} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_vector(dst_type):
            # Scalar broadcast from shared scalar register
            code.write(
                f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});'
            )
        else:
            raise util.NotSupportedError(unsupported_msg)

    if isinstance(src_node, nodes.Tasklet):
        ##################
        # Code->Code edges
        read_shared_register(edge.src.out_connectors[edge.src_conn], 'Unsupported Code->Code edge')
    elif isinstance(src_node, nodes.AccessNode):
        ##################
        # Read from AccessNode
        desc = src_node.desc(sdfg)
        if isinstance(desc, data.Array):
            # Copy from array
            if util.is_pointer(dst_type):
                ##################
                # Pointer reference
                code.write(
                    f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};')
            elif util.is_vector(dst_type):
                ##################
                # Vector load
                stride = edge.data.get_stride(sdfg, map)

                # First part of the declaration is `type name`
                load_lhs = f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name}'

                # long long issue casting
                ptr_cast = ''
                if dst_type.type == np.int64:
                    ptr_cast = '(int64_t*) '
                elif dst_type.type == np.uint64:
                    ptr_cast = '(uint64_t*) '

                # Regular load and gather share the first arguments
                loop_pred = util.get_loop_predicate(sdfg, state, edge.dst)
                src_ptr = ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame)
                load_args = f'{loop_pred}, {src_ptr}'

                if stride == 1:
                    code.write(f'{load_lhs} = svld1({load_args});')
                else:
                    index_bits = util.get_base_type(dst_type).bytes * 8
                    code.write(
                        f'{load_lhs} = svld1_gather_index({load_args}, svindex_s{index_bits}(0, {sym2cpp(stride)}));')
            else:
                ##################
                # Scalar read from array
                code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};')
        elif isinstance(desc, data.Scalar):
            # Refer to shared variable
            read_shared_register(desc.dtype, 'Unsupported Scalar->Code edge')
    else:
        raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported')