Example #1
    def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                       node: nodes.Node, nodedesc: data.Data,
                       global_stream: CodeIOStream,
                       declaration_stream: CodeIOStream,
                       allocation_stream: CodeIOStream) -> None:
        if nodedesc.storage == dtypes.StorageType.SVE_Register:
            sve_type = util.TYPE_TO_SVE[nodedesc.dtype]
            self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                             sve_type)
            return

        if util.get_sve_scope(sdfg, dfg, node) is not None and isinstance(
                nodedesc, data.Scalar) and isinstance(nodedesc.dtype,
                                                      dtypes.vector):
            # Special allocation for a vector Code->Code register in an SVE scope
            # We prevent dace::vec<>'s and allocate SVE registers instead
            if not self.dispatcher.defined_vars.has(node.data):
                sve_type = util.TYPE_TO_SVE[nodedesc.dtype.vtype]
                self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                                 sve_type)
                declaration_stream.write(f'{sve_type} {node.data};')
            return

        self.cpu_codegen.allocate_array(sdfg, dfg, state_id, node, nodedesc,
                                        global_stream, declaration_stream,
                                        allocation_stream)
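
A lookup table from element types to SVE ACLE register types drives the allocation above. The real util.TYPE_TO_SVE is not shown here; the following is a minimal sketch of what such a mapping could look like, assuming NumPy-keyed entries (the keys, coverage and constant name are assumptions, not the actual util module):

import numpy as np

# Hypothetical TYPE_TO_SVE-style lookup: element type -> SVE ACLE register type
# name, used in declarations such as 'svfloat32_t myvar;'
TYPE_TO_SVE_SKETCH = {
    np.int8: 'svint8_t',
    np.int16: 'svint16_t',
    np.int32: 'svint32_t',
    np.int64: 'svint64_t',
    np.uint8: 'svuint8_t',
    np.uint16: 'svuint16_t',
    np.uint32: 'svuint32_t',
    np.uint64: 'svuint64_t',
    np.float16: 'svfloat16_t',
    np.float32: 'svfloat32_t',
    np.float64: 'svfloat64_t',
}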
Example #2
    def get_load_stride(self, sdfg: SDFG, state: SDFGState, node: nodes.Node,
                        memlet: dace.Memlet) -> symbolic.SymExpr:
        """Determines the stride of a load/store based on:
            - The memlet subset
            - The array strides
            - The involved SVE loop stride"""

        scope = util.get_sve_scope(sdfg, state, node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        sve_param = scope.map.params[-1]
        sve_range = scope.map.range[-1]
        sve_sym = dace.symbolic.symbol(sve_param)

        array = sdfg.arrays[memlet.data]

        # 1. Flatten the subset to a 1D-offset (using the array strides)
        offset_1 = memlet.subset.at([0] * len(array.strides), array.strides)

        if not offset_1.has(sve_sym):
            raise util.NotSupportedError("SVE param does not occur in subset")

        # 2. Replace the SVE loop param with its next (possibly strided) value
        offset_2 = offset_1.subs(sve_sym, sve_sym + sve_range[2])

        # 3. The load stride is the difference between both
        stride = (offset_2 - offset_1).simplify()

        return stride
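
The stride computation is purely symbolic: flatten the subset to a one-dimensional offset, substitute the next value of the SVE loop parameter, and take the difference. A minimal standalone sketch of the same three steps using sympy (the access pattern, strides and parameter names are made-up examples):

import sympy

i, j = sympy.symbols('i j')   # i is the innermost (SVE) loop parameter
strides = (1024, 1)           # row-major strides of a hypothetical 1024x1024 array

# 1. Flatten the access A[j, 2*i] to a 1D offset using the array strides
offset_1 = j * strides[0] + 2 * i * strides[1]

# 2. Replace the SVE loop param with its next (unit-strided) value
offset_2 = offset_1.subs(i, i + 1)

# 3. The load stride is the difference between both offsets
print(sympy.simplify(offset_2 - offset_1))   # -> 2, so a gather/scatter is needed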
Example #3
    def define_out_memlet(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                          state_id: int, src_node: nodes.Node,
                          dst_node: nodes.Node, edge: graph.MultiConnectorEdge,
                          function_stream: CodeIOStream,
                          callsite_stream: CodeIOStream):
        scope = util.get_sve_scope(sdfg, dfg, src_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        self.create_empty_definition(src_node.out_connectors[edge.src_conn],
                                     edge, callsite_stream)
Example #4
    def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int,
                      node: nodes.Node, function_stream: CodeIOStream,
                      callsite_stream: CodeIOStream):
        self.add_header(function_stream)

        if not isinstance(node, nodes.Tasklet):
            return

        scope = util.get_sve_scope(sdfg, state, node)

        # Reset the stream variable mappings
        self.stream_associations = dict()
        self.wcr_associations = dict()

        callsite_stream.write('{')
        self.dispatcher.defined_vars.enter_scope(node)

        ##################
        # Generate tasklet

        # Inputs
        for edge in state.in_edges(node):
            self.generate_read(sdfg, state, scope.map, edge, callsite_stream)

        requires_wb = []

        # Temporary output registers
        for edge in state.out_edges(node):
            if self.generate_out_register(sdfg, state, edge, callsite_stream):
                requires_wb.append(edge)

        # Tasklet code
        self.unparse_tasklet(sdfg, state, state_id, node, function_stream,
                             callsite_stream)

        # Writeback from temporary registers to memory
        for edge in requires_wb:
            self.generate_writeback(sdfg, state, scope, edge, callsite_stream)

        self.dispatcher.defined_vars.exit_scope(node)
        callsite_stream.write('}')
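
Putting the pieces together, generate_node emits one brace-delimited block per tasklet: the input reads, the temporary output registers, the unparsed tasklet body, and finally the writebacks. For a tasklet computing b = a * 2 inside an SVE map, the emitted code could roughly take the following shape (the variable, predicate and intrinsic names below are illustrative assumptions, not verbatim output):

expected_shape = """
{
    // generate_read: load input 'a' into an SVE register
    svfloat32_t a = svld1(__pg, &A[i]);
    // generate_out_register: temporary register for output 'b'
    svfloat32_t b;
    // unparse_tasklet: vectorized tasklet body
    b = svmul_x(__pg, a, 2.0f);
    // generate_writeback: store the temporary register back to memory
    svst1(__pg, &B[i], b);
}
"""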
Example #5
def vectorize(sdfg: dace.SDFG, par: str, ignored_conns: list = []):
    input_bits = set([sdfg.arrays[a].dtype.bytes * 8 for a in sdfg.arrays])
    if len(input_bits) > 1:
        raise NotImplementedError('Different data type sizes as inputs')
    input_bit_width = list(input_bits)[0]

    sdfg.apply_strict_transformations()

    # FIXME: Hardcoded for the demo machine (512 bits)
    util.SVE_LEN.set(512 // input_bit_width)

    for node, dfg in sdfg.all_nodes_recursive():
        if isinstance(node, dace.nodes.MapEntry):
            if node.params[-1] == par:
                node.schedule = dace.ScheduleType.SVE_Map
                for c in node.out_connectors:
                    edges = get_connector_edges(dfg, node, c, False)
                    vectorize_connector(sdfg, dfg, node, par, c, False)
                    for e in edges:
                        vectorize_connector(sdfg, dfg, e.dst, par, e.dst_conn,
                                            True)

    for edge, dfg in sdfg.all_edges_recursive():
        if not isinstance(dfg, dace.SDFGState):
            continue
        # Force every output connector within the graph to be a vector
        #if edge.data.wcr is None:
        #    continue
        scope = util.get_sve_scope(sdfg, dfg, edge.src)
        if scope is not None:
            vectorize_connector(sdfg, dfg, edge.src, par, edge.src_conn, False)

    # Then use a tweaked (but incorrect) version of infer_connector_types
    infer_connector_types(sdfg)

    return sdfg
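
A hypothetical usage sketch, assuming a simple element-wise DaCe program whose innermost map parameter is named i (the program and all names are assumptions):

import dace

@dace.program
def scale(A: dace.float32[1024], B: dace.float32[1024]):
    for i in dace.map[0:1024]:
        with dace.tasklet:
            a << A[i]
            b >> B[i]
            b = a * 2

sdfg = scale.to_sdfg()
# Marks the map over 'i' as an SVE_Map and retypes its connectors to vectors
vectorize(sdfg, 'i')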
Example #6
    def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                   state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                   edge: graph.MultiConnectorEdge,
                   function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
        scope = util.get_sve_scope(sdfg, dfg, src_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        out_conn = src_node.out_connectors[edge.src_conn]
        if out_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(
                f'Data type {out_conn.type} not supported')

        if edge.data.wcr is None:
            # No WCR required

            if isinstance(dst_node, dace.nodes.Tasklet):
                # Writeback into a tasklet is just writing into the shared register
                callsite_stream.write(f'{edge.data.data} = {edge.src_conn};')
                return

            if isinstance(out_conn, dtypes.vector):
                # If no WCR, we can directly store the vector (SVE register) in memory
                # Determine the stride of the store and use a scatter load if applicable

                stride = self.get_load_stride(sdfg, dfg, src_node, edge.data)

                ptr_cast = ''
                if out_conn.type == np.int64:
                    ptr_cast = '(int64_t*) '
                elif out_conn.type == np.uint64:
                    ptr_cast = '(uint64_t*) '

                store_args = '{}, {}'.format(
                    util.get_loop_predicate(sdfg, dfg, src_node),
                    ptr_cast +
                    cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer),
                )

                if stride == 1:
                    callsite_stream.write(
                        f'svst1({store_args}, {edge.src_conn});')
                else:
                    callsite_stream.write(
                        f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(out_conn).bytes * 8}(0, {sym2cpp(stride)}), {edge.src_conn});'
                    )
            else:
                raise NotImplementedError('Writeback into non-vector')
        else:
            # TODO: Check what we are WCR'ing into

            # Since we have WCR, we must determine a suitable SVE reduce instruction
            # Check whether it is a known reduction that is possible in SVE
            reduction_type = detect_reduction_type(edge.data.wcr)
            if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
                raise util.NotSupportedError('Unsupported reduction in SVE')

            # If the memlet contains the innermost SVE param, we have a problem, because
            # SVE doesn't support WCR stores. This would require unrolling the loop.
            if scope.params[-1] in edge.data.free_symbols:
                raise util.NotSupportedError(
                    'SVE loop param used in WCR memlet')

            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality

            sve_reduction = '{}({}, {})'.format(
                util.REDUCTION_TYPE_TO_SVE[reduction_type],
                util.get_loop_predicate(sdfg, dfg, src_node), edge.src_conn)

            ptr_cast = ''
            if out_conn.type == np.int64:
                ptr_cast = '(long long*) '
            elif out_conn.type == np.uint64:
                ptr_cast = '(unsigned long long*) '

            wcr_expr = self.cpu_codegen.write_and_resolve_expr(
                sdfg,
                edge.data,
                edge.data.wcr_nonatomic,
                None,
                ptr_cast + sve_reduction,
                dtype=out_conn.vtype)

            callsite_stream.write(wcr_expr + ';')
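
In the WCR branch, the two steps translate into a horizontal SVE reduction followed by a scalar WCR handled by the CPU codegen. Illustratively, for a Sum reduction on float32, REDUCTION_TYPE_TO_SVE would select a horizontal reduction such as svaddv, and its result becomes the scalar value that write_and_resolve_expr stores back (the predicate and register names are assumptions):

wcr_shape = """
// Step 1: reduce the SVE register into a scalar with a horizontal reduction
float partial = svaddv(__pg, b_reg);
// Step 2: the scalar is WCR'd into memory by the code produced by
//         write_and_resolve_expr(), e.g. an atomic add for Sum
"""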
Example #7
    def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                    src_node: nodes.Node, dst_node: nodes.Node,
                    edge: gr.MultiConnectorEdge[mm.Memlet],
                    function_stream: CodeIOStream,
                    callsite_stream: CodeIOStream):
        # We should always be in an SVE scope
        scope = util.get_sve_scope(sdfg, dfg, dst_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        in_conn = dst_node.in_connectors[edge.dst_conn]

        if isinstance(src_node, dace.nodes.Tasklet):
            # Copy from tasklet is just copying the shared register
            # Use defined_vars to get the C++ type of the shared register
            callsite_stream.write(
                f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};'
            )
            return

        if not isinstance(src_node, dace.nodes.AccessNode):
            raise util.NotSupportedError(
                'Copy neither from Tasklet nor AccessNode')

        src_desc = src_node.desc(sdfg)

        if isinstance(src_desc, dace.data.Stream):
            # A copy from a stream will trigger a vector pop
            raise NotImplementedError()

            # FIXME: Issue when we can pop different amounts of data!
            # If we limit to the smallest amount, certain data will be lost (never processed)
            """
            # SVE register where the stream will be popped to
            self.create_empty_definition(in_conn, edge, callsite_stream, output=True)

            var_name = edge.dst_conn

            callsite_stream.write(
                f'{util.TYPE_TO_SVE[in_conn.type]} {var_name};')

            callsite_stream.write('{')
            callsite_stream.write('// Stream pop')

            # Pop into local buffer
            # 256 // in_conn.vtype.bytes
            n_vec = f'{util.REGISTER_BYTE_SIZE} / {in_conn.vtype.bytes}'
            callsite_stream.write(f'{in_conn.vtype.ctype} __tmp[{n_vec}];')
            callsite_stream.write(
                f'size_t __cnt = {edge.data.data}.pop_try(__tmp, {n_vec});')

            # Limit the loop predicate
            loop_pred = util.get_loop_predicate(sdfg, dfg, dst_node)
            callsite_stream.write(
                f'{loop_pred} = svand_z({loop_pred}, {loop_pred}, svwhilelt_b{in_conn.vtype.bytes * 8}(0ll, __cnt));')

            # Transfer to register
            callsite_stream.write(f'{var_name} = svld1({loop_pred}, __tmp);')

            callsite_stream.write('}')
            """
            return

        if isinstance(in_conn, dtypes.vector):
            # Copy from vector, so we can use svld

            if in_conn.type not in util.TYPE_TO_SVE:
                raise NotImplementedError(
                    f'Data type {in_conn.type} not supported')

            self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector,
                                             in_conn.ctype)

            # Determine the stride of the load and use a gather if applicable
            stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

            # First part of the declaration is `type name`
            load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type],
                                      edge.dst_conn)

            ptr_cast = ''
            if in_conn.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif in_conn.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            # Regular load and gather share the first arguments
            load_args = '{}, {}'.format(
                util.get_loop_predicate(sdfg, dfg, dst_node), ptr_cast +
                cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer))

            if stride == 1:
                callsite_stream.write('{} = svld1({});'.format(
                    load_lhs, load_args))
            else:
                callsite_stream.write(
                    '{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                        load_lhs, load_args,
                        util.get_base_type(in_conn).bytes * 8, sym2cpp(stride)))
        else:
            # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
            self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node,
                                         dst_node, edge, function_stream,
                                         callsite_stream)
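
The choice between a contiguous load and a gather mirrors the store logic in write_back: a stride of 1 maps to svld1, anything else to svld1_gather_index with the index vector 0, stride, 2*stride, ... A minimal sketch of that decision as a string-building helper (the helper itself is hypothetical and assumes a 64-bit element type by default):

def sve_load_expr(pred: str, ptr: str, stride: int, bits: int = 64) -> str:
    """Hypothetical helper mirroring the svld1 / svld1_gather_index choice above."""
    if stride == 1:
        # Contiguous load
        return f'svld1({pred}, {ptr})'
    # Strided load via a gather with the index vector 0, stride, 2*stride, ...
    return f'svld1_gather_index({pred}, {ptr}, svindex_s{bits}(0, {stride}))'

print(sve_load_expr('__pg', '&A[0]', 1))   # svld1(__pg, &A[0])
print(sve_load_expr('__pg', '&A[0]', 3))   # svld1_gather_index(..., svindex_s64(0, 3))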