Exemplo n.º 1
0
Arquivo: cpp.py Projeto: orausch/dace
def write_and_resolve_expr(sdfg, memlet, nc, outname, inname, indices=None):
    """ Helper function that emits a write_and_resolve call from a memlet. """

    redtype = operations.detect_reduction_type(memlet.wcr)

    nc = "_nc" if nc else ""
    indstr = (", " + indices) if indices is not None else ""

    reduction_tmpl = ""
    custom_reduction = ""

    # Special call for detected reduction types
    if redtype != dtypes.ReductionType.Custom:
        credtype = "dace::ReductionType::" + str(
            redtype)[str(redtype).find(".") + 1:]
        reduction_tmpl = "<%s>" % credtype
    else:
        custom_reduction = ', %s' % unparse_cr(sdfg, memlet.wcr)

    return "{oname}.write_and_resolve{nc}{tmpl}({iname}{wcr}{ind});".format(
        oname=outname,
        nc=nc,
        tmpl=reduction_tmpl,
        iname=inname,
        wcr=custom_reduction,
        ind=indstr,
    )
Exemplo n.º 2
0
    def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                    edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                    callsite_stream: CodeIOStream) -> None:

        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        nc = not is_write_conflicted(dfg, edge)
        desc = edge.src.desc(sdfg)
        if not nc or not isinstance(desc.dtype, (dtypes.pointer, dtypes.vector)):
            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality
            wcr = self.cpu_codegen.write_and_resolve_expr(sdfg, edge.data, not nc, None, '@', dtype=desc.dtype)
            callsite_stream.write(wcr[:wcr.find('@')] + util.REDUCTION_TYPE_TO_SVE[reduction_type] +
                                  f'(svptrue_{util.TYPE_TO_SVE_SUFFIX[desc.dtype]}(), ' + src_node.label +
                                  wcr[wcr.find('@') + 1:] + ');')
            return
        else:
            ######################
            # Horizontal non-atomic reduction
            raise NotImplementedError()

        return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream)
Exemplo n.º 3
0
 def reduction_type(self):
     # Autodetect reduction type
     redtype = detect_reduction_type(self.wcr)
     if redtype not in nutil.NCCL_SUPPORTED_OPERATIONS:
         raise ValueError(
             'NCCL only supports sum, product, min and max operations.')
     return redtype
Exemplo n.º 4
0
    def write_and_resolve_expr(self,
                               sdfg,
                               memlet,
                               nc,
                               outname,
                               inname,
                               indices=None,
                               dtype=None):
        """
        Emits a conflict resolution call from a memlet.
        """
        redtype = operations.detect_reduction_type(memlet.wcr)
        if isinstance(indices, str):
            ptr = '%s + %s' % (cpp.cpp_ptr_expr(sdfg, memlet), indices)
        else:
            ptr = cpp.cpp_ptr_expr(sdfg, memlet, indices=indices)

        if isinstance(dtype, dtypes.pointer):
            dtype = dtype.base_type

        # Special call for detected reduction types
        if redtype != dtypes.ReductionType.Custom:
            credtype = "dace::ReductionType::" + str(
                redtype)[str(redtype).find(".") + 1:]
            if isinstance(dtype, dtypes.vector):
                return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                        f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                        f'{ptr}, {inname})')
            return (
                f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
                f'{ptr}, {inname})')

        # General reduction
        raise NotImplementedError('General reductions not yet implemented')
Exemplo n.º 5
0
 def expansion(node, state, sdfg):
     a, b, c = _get_matmul_operands(node, state, sdfg)
     size_a = a[2]
     size_b = b[2]
     if len(size_a) == 2 and len(size_b) == 2:
         # Matrix and matrix -> GEMM
         from dace.libraries.blas.nodes.gemm import Gemm
         beta = 0.0
         if c[0].data.wcr:
             from dace.frontend import operations
             redtype = operations.detect_reduction_type(c[0].data.wcr)
             if redtype == dace.dtypes.ReductionType.Sum:
                 beta = 1.0
             else:
                 warnings.warn("Unsupported WCR in output of MatMul "
                               "library node: {}".format(c[0].data.wcr))
         gemm = Gemm(node.name + 'gemm',
                     location=node.location,
                     alpha=1.0,
                     beta=beta)
         return gemm
     elif len(size_b) == 3 and (len(size_a) in [2, 3]):
         # Batched matrix and matrix -> batched matrix multiplication
         from dace.libraries.blas.nodes.batched_matmul import BatchedMatMul
         batched = BatchedMatMul(node.name + 'bmm',
                                 location=node.location)
         return batched
     elif len(size_a) == 2 and len(size_b) == 1:
         # Matrix and vector -> GEMV
         from dace.libraries.blas.nodes.gemv import Gemv
         # Rename inputs to match dot naming
         a[0].dst_conn = "_A"
         b[0].dst_conn = "_x"
         c[0].src_conn = "_y"
         gemv = Gemv(node.name + 'gemv', location=node.location)
         return gemv
     elif len(size_a) == 1 and len(size_b) == 2:
         # Vector and matrix -> GEMV with transposed matrix
         from dace.libraries.blas.nodes.gemv import Gemv
         # Rename inputs to match dot naming
         a[0].dst_conn = "_x"
         b[0].dst_conn = "_A"
         c[0].src_conn = "_y"
         gemv = Gemv(node.name + 'gemvt',
                     location=node.location,
                     transA=True)
         return gemv
     elif len(size_a) == 1 and len(size_b) == 1:
         # Vector and vector -> dot product
         from dace.libraries.blas.nodes.dot import Dot
         # Rename inputs to match dot naming
         a[0].dst_conn = "_x"
         b[0].dst_conn = "_y"
         c[0].src_conn = "_result"
         dot = Dot(node.name + 'dot', location=node.location)
         return dot
     else:
         raise NotImplementedError("Matrix multiplication not implemented "
                                   "for shapes: {} and {}".format(
                                       size_a, size_b))
Exemplo n.º 6
0
    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)

        # Avoid import loop
        from dace.transformation.dataflow.local_storage import LocalStorage

        local_storage_subgraph = {
            LocalStorage._node_a:
            self.subgraph[AccumulateTransient._map_exit],
            LocalStorage._node_b:
            self.subgraph[AccumulateTransient._outer_map_exit]
        }
        sdfg_id = sdfg.sdfg_list.index(sdfg)
        in_local_storage = LocalStorage(
            sdfg_id, self.state_id, local_storage_subgraph, self.expr_index)
        in_local_storage.array = self.array
        in_local_storage.apply(sdfg)

        # Initialize transient to zero in case of summation
        # TODO: Initialize transient in other WCR types
        memlet = graph.in_edges(in_local_storage._data_node)[0].data
        if detect_reduction_type(memlet.wcr) == dtypes.ReductionType.Sum:
            in_local_storage._data_node.setzero = True
        else:
            warnings.warn('AccumulateTransient did not properly initialize'
                          'newly-created transient!')
Exemplo n.º 7
0
    def can_be_applied(graph: SDFGState,
                       candidate,
                       expr_index,
                       sdfg,
                       strict=False):
        map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]

        # Check if there is more than one GPU available:
        if (Config.get("compiler", "cuda", "max_number_gpus") < 2):
            return False

        # Dynamic map ranges not supported
        if has_dynamic_map_inputs(graph, map_entry):
            return False

        # Only accept maps with a default schedule
        schedule_whitelist = [dtypes.ScheduleType.Default]
        sdict = graph.scope_dict()
        parent = sdict[map_entry]
        while parent is not None:
            if parent.map.schedule not in schedule_whitelist:
                return False
            parent = sdict[parent]

        # Library nodes inside the scope are not supported
        scope_subgraph = graph.scope_subgraph(map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                return False

        # Custom reductions can not have an accumulate transient, as the
        # reduction would have to be split up for the ingoing memlet of the
        # accumulate transient and the outgoing memlet. Not using GPU local
        # accumulate transient only works for a small volume of data.
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            if edge.data.wcr is not None and operations.detect_reduction_type(
                    edge.data.wcr) == dtypes.ReductionType.Custom:
                return False

        storage_whitelist = [
            dtypes.StorageType.Default,
            dtypes.StorageType.CPU_Pinned,
            dtypes.StorageType.CPU_Heap,
            dtypes.StorageType.GPU_Global,
        ]
        for node in graph.predecessors(map_entry):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        for node in graph.successors(map_exit):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        return True
Exemplo n.º 8
0
    def __label__(self, sdfg, state):
        # Autodetect reduction type
        redtype = detect_reduction_type(self.wcr)
        if redtype == types.ReductionType.Custom:
            wcrstr = unparse(ast.parse(self.wcr).body[0].value.body)
        else:
            wcrstr = str(redtype)
            wcrstr = wcrstr[wcrstr.find('.') + 1:]  # Skip "ReductionType."

        return 'Op: {op}\nAxes: {axes}'.format(
            axes=('all' if self.axes is None else str(self.axes)), op=wcrstr)
Exemplo n.º 9
0
    def write_and_resolve_expr(self,
                               sdfg,
                               memlet,
                               nc,
                               outname,
                               inname,
                               indices=None,
                               dtype=None):
        """
        Emits a conflict resolution call from a memlet.
        """
        redtype = operations.detect_reduction_type(memlet.wcr, openmp=True)
        defined_type, _ = self._dispatcher.defined_vars.get(memlet.data)
        if isinstance(indices, str):
            ptr = '%s + %s' % (cpp.cpp_ptr_expr(
                sdfg, memlet, defined_type, is_write=True), indices)
        else:
            ptr = cpp.cpp_ptr_expr(sdfg,
                                   memlet,
                                   defined_type,
                                   indices=indices,
                                   is_write=True)

        if isinstance(dtype, dtypes.pointer):
            dtype = dtype.base_type

        # Special call for detected reduction types
        if redtype != dtypes.ReductionType.Custom:
            if redtype == dace.dtypes.ReductionType.Sub:
                # write this as an addition
                credtype = "dace::ReductionType::Sum"
                is_sub = True
            else:
                credtype = "dace::ReductionType::" + str(
                    redtype)[str(redtype).find(".") + 1:]
                is_sub = False
            if isinstance(dtype, dtypes.vector):
                return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                        f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                        f'{ptr}, {"-" if is_sub else ""}{inname})')
            return (
                f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
                f'{ptr}, {"-" if is_sub else ""}{inname})')

        # General reduction
        raise NotImplementedError('General reductions not yet implemented')
Exemplo n.º 10
0
    def _label(self, shape):
        result = ''
        if self.data is not None:
            result = self.data

        if self.subset is None:
            return result

        num_elements = self.subset.num_elements()
        if self.num_accesses != num_elements:
            if self.num_accesses == -1:
                result += '(dyn) '
            else:
                result += '(%s) ' % SymbolicProperty.to_string(
                    self.num_accesses)
        arrayNotation = True
        try:
            if shape is not None and reduce(operator.mul, shape, 1) == 1:
                # Don't draw array if we're accessing a single element and it's zero
                if all(s == 0 for s in self.subset.min_element()):
                    arrayNotation = False
        except TypeError:
            # Will fail if trying to check the truth value of a sympy expr
            pass
        if arrayNotation:
            result += '[%s]' % str(self.subset)
        if self.wcr is not None and str(self.wcr) != '':
            # Autodetect reduction type
            redtype = detect_reduction_type(self.wcr)
            if redtype == dtypes.ReductionType.Custom:
                wcrstr = unparse(ast.parse(self.wcr).body[0].value.body)
            else:
                wcrstr = str(redtype)
                wcrstr = wcrstr[wcrstr.find('.') + 1:]  # Skip "ReductionType."

            result += ' (CR: %s' % wcrstr
            if self.wcr_identity is not None:
                result += ', id: %s' % str(self.wcr_identity)
            result += ')'

        if self.other_subset is not None:
            result += ' -> [%s]' % str(self.other_subset)
        return result
Exemplo n.º 11
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        localcode = CodeIOStream()

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        output_memlet = output_edge.data

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype
        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Try to obtain the number of threads in the block, or use the default
        # configuration
        block_threads = devicelevel_block_size(sdfg, state, node)
        if block_threads is not None:
            block_threads = functools.reduce(lambda a, b: a * b, block_threads,
                                             1)

        # Checks
        if block_threads is None:
            raise ValueError('Block-wide GPU reduction must occur within'
                             ' a GPU kernel')
        if issymbolic(block_threads, sdfg.constants):
            raise ValueError('Block size has to be constant for block-wide '
                             'reduction (got %s)' % str(block_threads))
        if (node.axes is not None and len(node.axes) < input_dims):
            raise ValueError(
                'Only full reduction is supported for block-wide reduce,'
                ' please use the pure expansion')
        if (input_data.storage != dtypes.StorageType.Register
                or output_data.storage != dtypes.StorageType.Register):
            raise ValueError(
                'Block-wise reduction only supports GPU register inputs '
                'and outputs')
        if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES:
            raise ValueError('%s block reduction not supported' % redtype)

        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        if redtype == dtypes.ReductionType.Custom:
            redop = '__reduce_%s()' % idstr
        else:
            redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)

        # Allocate shared memory for block reduce
        localcode.write("""
        typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id};
        __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id};
            """.format(id=idstr,
                       type=output_data.dtype.ctype,
                       numthreads=block_threads))

        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = cpp_array_expr(sdfg, output_memlet)
        localcode.write("""
            {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop});
            """.format(id=idstr,
                       redop=redop,
                       input=input_memlet.data,
                       output=output))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Exemplo n.º 12
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        output_dims = len(output_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        cuda_initcode = CodeIOStream()
        cuda_exitcode = CodeIOStream()
        host_globalcode = CodeIOStream()
        host_localcode = CodeIOStream()
        output_memlet = output_edge.data

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype

        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        reduce_shape = input_memlet.subset.bounding_box_size()
        num_items = ' * '.join(symstr(s) for s in reduce_shape)
        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = (output_memlet.data + ' + ' +
                  cpp_array_expr(sdfg, output_memlet, with_brackets=False))

        input_dims = input_memlet.subset.dims()
        output_dims = output_memlet.subset.data_dims()

        reduce_all_axes = (node.axes is None or len(node.axes) == input_dims)
        if reduce_all_axes:
            reduce_last_axes = False
        else:
            reduce_last_axes = sorted(node.axes) == list(
                range(input_dims - len(node.axes), input_dims))

        if (not reduce_all_axes) and (not reduce_last_axes):
            raise NotImplementedError(
                'Multiple axis reductions not supported on GPUs. Please use '
                'the pure expansion or make reduce axes the last in the array.'
            )

        # Verify that data is on the GPU
        if input_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Input of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')
        if output_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Output of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')

        # Determine reduction type
        kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype
                 in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce')

        # Create temp memory for this GPU
        cuda_globalcode.write(
            """
            void *__cub_storage_{sdfg}_{state}_{node} = NULL;
            size_t __cub_ssize_{sdfg}_{state}_{node} = 0;
        """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg,
            state_id, node)

        if reduce_all_axes:
            reduce_type = 'DeviceReduce'
            reduce_range = num_items
            reduce_range_def = 'size_t num_items'
            reduce_range_use = 'num_items'
            reduce_range_call = num_items
        elif reduce_last_axes:
            num_reduce_axes = len(node.axes)
            not_reduce_axes = reduce_shape[:-num_reduce_axes]
            reduce_axes = reduce_shape[-num_reduce_axes:]

            num_segments = ' * '.join([symstr(s) for s in not_reduce_axes])
            segment_size = ' * '.join([symstr(s) for s in reduce_axes])

            reduce_type = 'DeviceSegmentedReduce'
            iterator = 'dace::stridedIterator({size})'.format(
                size=segment_size)
            reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments,
                                                          it=iterator)
            reduce_range_def = 'size_t num_segments, size_t segment_size'
            iterator_use = 'dace::stridedIterator(segment_size)'
            reduce_range_use = 'num_segments, {it}, {it} + 1'.format(
                it=iterator_use)
            reduce_range_call = '%s, %s' % (num_segments, segment_size)

        # Call CUB to get the storage size, allocate and free it
        cuda_initcode.write(
            """
            cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node},
                                        ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop});
            cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node});
""".format(sdfg=sdfg.name,
           state=state_id,
           node=node_id,
           reduce_type=reduce_type,
           reduce_range=reduce_range,
           redop=reduce_op,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           kname=kname), sdfg, state_id, node)

        cuda_exitcode.write(
            'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format(
                sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
            node)

        # Write reduction function definition
        cuda_globalcode.write("""
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream)
{{
cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id},
                            input, output, {reduce_range_use}{redop}, stream);
}}
        """.format(id=idstr,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype,
                   reduce_type=reduce_type,
                   reduce_range_def=reduce_range_def,
                   reduce_range_use=reduce_range_use,
                   kname=kname,
                   redop=reduce_op))

        # Write reduction function definition in caller file
        host_globalcode.write(
            """
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
        """.format(id=idstr,
                   reduce_range_def=reduce_range_def,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype), sdfg, state_id, node)

        # Call reduction function where necessary
        host_localcode.write(
            '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);'
            .format(id=idstr,
                    input=input,
                    output=output,
                    reduce_range_call=reduce_range_call))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   host_localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(host_globalcode.getvalue())
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')
        sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda')
        sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Exemplo n.º 13
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Get reduction type for OpenMP
        redtype = detect_reduction_type(node.wcr, openmp=True)
        if redtype not in ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP:
            raise ValueError('Reduction type not supported for "%s"' %
                             node.wcr)
        omptype, expr = ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP[redtype]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        outer_loops = len(axes) != input_dims

        # Create OpenMP clause
        if outer_loops:
            code = '#pragma omp parallel for collapse({cdim})\n'.format(
                cdim=output_dims)
        else:
            code = ''

        from dace.codegen.targets.cpp import sym2cpp

        # Output loops
        out_offset = []
        if outer_loops:
            for i, sz in enumerate(outedge.data.subset.size()):
                code += 'for (int _o{i} = 0; _o{i} < {sz}; ++_o{i}) {{\n'.format(
                    i=i, sz=sym2cpp(sz))
                out_offset.append('_o%d * %s' %
                                  (i, sym2cpp(output_data.strides[i])))
        else:
            out_offset.append('0')

        outexpr = '_out[%s]' % ' + '.join(out_offset)

        # Write identity value first
        if node.identity is not None:
            code += '%s = %s;\n' % (outexpr, node.identity)

        # Reduction OpenMP clause
        code += '#pragma omp parallel for collapse({cdim}) ' \
          'reduction({rtype}: {oexpr})\n'.format(cdim=len(axes), rtype=omptype,
            oexpr=outexpr)

        # Reduction loops
        for i, axis in enumerate(sorted(axes)):
            sz = sym2cpp(inedge.data.subset.size()[axis])
            code += 'for (int _i{i} = 0; _i{i} < {sz}; ++_i{i}) {{\n'.format(
                i=i, sz=sz)

        # Prepare input offset expression
        in_offset = []
        ictr, octr = 0, 0
        for i in range(input_dims):
            if i in axes:
                result = '_i%d' % ictr
                ictr += 1
            else:
                result = '_o%d' % octr
                octr += 1
            in_offset.append('%s * %s' %
                             (result, sym2cpp(input_data.strides[i])))
        in_offset = ' + '.join(in_offset)

        # Reduction expression
        code += expr.format(i='_in[%s]' % in_offset, o=outexpr)
        code += '\n'

        # Closing braces
        code += '}\n' * len(axes)
        if outer_loops:
            code += '}\n' * output_dims

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   code,
                                   language=dace.Language.CPP)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Exemplo n.º 14
0
    def expansion(node: 'Reduce',
                  state: SDFGState,
                  sdfg: SDFG,
                  partial_width=16):
        '''

        :param node: the node to expand
        :param state: the state in which the node is in
        :param sdfg: the SDFG in which the node is in
        :param partial_width: Width of the inner reduction buffer. Must be
                              larger than the latency of the reduction operation on the given
                              data type
        '''
        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)
        if input_data.dtype.veclen > 1:
            raise NotImplementedError(
                'Vectorization currently not implemented for FPGA expansion of Reduce.'
            )

        nstate = nsdfg.add_state()

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            all_axis = False
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append(f'_i{ictr}')
                    ictr += 1
                else:
                    input_subset.append(f'_o{octr}')
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    f'_o{i}': f'0:{symstr(sz)}'
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
            outm = dace.Memlet(f'_out[{outm_idx}]')
            inm_idx = ','.join(input_subset)
            inmm = dace.Memlet(f'_in[{inm_idx}]')
        else:
            all_axis = True
            ome, omx = None, None
            outm = dace.Memlet('_out[0]')
            inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
            inmm = dace.Memlet(f'_in[{inm_idx}]')

        # Add inner map, which corresponds to the range to reduce
        r = nstate.add_read('_in')
        w = nstate.add_read('_out')

        # TODO support vectorization
        buffer_name = 'partial_results'
        nsdfg.add_array(buffer_name, (partial_width, ),
                        input_data.dtype,
                        transient=True,
                        storage=dtypes.StorageType.FPGA_Local)
        buffer = nstate.add_access(buffer_name)
        buffer_write = nstate.add_write(buffer_name)

        # Initialize explicitly partial results, as the inner map could run for a number of iteration < partial_width
        init_me, init_mx = nstate.add_map(
            'partial_results_init', {'i': f'0:{partial_width}'},
            schedule=dtypes.ScheduleType.FPGA_Device,
            unroll=True)
        init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                          f'pr_out = {node.identity}')
        nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
        nstate.add_memlet_path(init_tasklet,
                               init_mx,
                               buffer,
                               src_conn='pr_out',
                               memlet=dace.Memlet(f'{buffer_name}[i]'))

        if not all_axis:
            nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

        ime, imx = nstate.add_map(
            'reduce_values', {
                f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
                for i, axis in enumerate(sorted(axes))
            })

        # Accumulate over partial results
        redtype = detect_reduction_type(node.wcr)
        if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
            raise ValueError('Reduction type not supported for "%s"' % node.wcr)
        else:
            reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[
                redtype]

        # generate flatten index considering inner map: will be used for indexing into partial results
        ranges_size = ime.range.size()
        inner_index = '+'.join(
            [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
        inner_op = ' + ' if len(axes) > 1 else ''
        inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'
        partial_reduce_tasklet = nstate.add_tasklet(
            'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

        if not all_axis:
            # Connect input and partial sums
            nstate.add_memlet_path(r,
                                   ome,
                                   ime,
                                   partial_reduce_tasklet,
                                   dst_conn='data_in',
                                   memlet=inmm)
        else:
            nstate.add_memlet_path(r,
                                   ime,
                                   partial_reduce_tasklet,
                                   dst_conn='data_in',
                                   memlet=inmm)
        nstate.add_memlet_path(
            buffer,
            ime,
            partial_reduce_tasklet,
            dst_conn='buffer_in',
            memlet=dace.Memlet(
                f'{buffer_name}[({inner_index})%{partial_width}]'))
        nstate.add_memlet_path(
            partial_reduce_tasklet,
            imx,
            buffer_write,
            src_conn='buffer_out',
            memlet=dace.Memlet(
                f'{buffer_name}[({inner_index})%{partial_width}]'))

        # Then perform reduction on partial results
        reduce_entry, reduce_exit = nstate.add_map(
            'reduce', {'i': f'0:{partial_width}'},
            schedule=dtypes.ScheduleType.FPGA_Device,
            unroll=True)

        reduce_tasklet = nstate.add_tasklet(
            'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
        nstate.add_memlet_path(buffer_write,
                               reduce_entry,
                               reduce_tasklet,
                               dst_conn='data_in',
                               memlet=dace.Memlet(f'{buffer_name}[i]'))

        reduce_name = 'reduce_result'
        nsdfg.add_array(reduce_name, (1, ),
                        output_data.dtype,
                        transient=True,
                        storage=dtypes.StorageType.FPGA_Local)
        reduce_read = nstate.add_access(reduce_name)
        reduce_access = nstate.add_access(reduce_name)

        if not all_axis:
            nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

        nstate.add_memlet_path(reduce_read,
                               reduce_entry,
                               reduce_tasklet,
                               dst_conn='reduce_in',
                               memlet=dace.Memlet(f'{reduce_name}[0]'))
        nstate.add_memlet_path(reduce_tasklet,
                               reduce_exit,
                               reduce_access,
                               src_conn='reduce_out',
                               memlet=dace.Memlet(f'{reduce_name}[0]'))

        if not all_axis:
            # Write out the result
            nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
        else:
            nstate.add_memlet_path(reduce_access, w, memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')
        nsdfg.validate()

        return nsdfg
Exemplo n.º 15
0
    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        out_storage_node = graph.out_edges(reduce_node)[0].dst
        in_storage_node = graph.in_edges(reduce_node)[0].src
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        schedule = reduce_node.schedule
        implementation = reduce_node.implementation
        if implementation and 'warp' in implementation:
            raise NotImplementedError(
                "WIP: Warp Reductions are not Implemented yet.")

        # remove the reduce identity
        # we will reassign it later after expanding
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which
        # we perform the reduction:
        # do BFS, best complexity O(V+E)

        queue = [nsdfg]
        array_closest_ancestor = None
        while len(queue) > 0:
            current = queue.pop(0)
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            queue.extend([in_edge.src for in_edge in graph.in_edges(current)])

        # if ancestor doesn't exist:
        #           if non-transient: create data node accessing it
        #           if transient: ancestor_node = none, set_zero on outer node

        shortcut = False
        if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
                                        or identity is not None:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map")
            # we are lucky
            shortcut = True
            nstate.out_edges(outer_exit)[0].data.wcr = None

        else:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map "
                      "and introducing update Tasklet, "
                      "connecting with ancestor.")
            if not array_closest_ancestor:
                array_closest_ancestor = nodes.AccessNode(
                    out_storage_node.data, access=dtypes.AccessType.ReadOnly)
                graph.add_node(array_closest_ancestor)
                # array_closest_ancestor now points to the node we want to connect
                # to the map entry

            # always have to create out transient in this case
            self.create_out_transient = True

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register
            if shortcut:
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

            if shortcut:
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to shared memory / default
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # first, inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)
        if not shortcut:
            reduction_type = detect_reduction_type(wcr)
            try:
                code = ReduceExpansion.reduction_type_update[reduction_type]
            except KeyError:
                raise NotImplementedError(
                    "Not yet implemented for custom reduction")

            new_tasklet = graph.add_tasklet(
                name="reduction_transient_update",
                inputs={"reduction_in", "array_in"},
                outputs={"out"},
                code=code)

            edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
                             if self.create_out_transient \
                             else graph.out_edges(inner_exit)[0]

            new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                            volume=1,
                                            subset=edge_to_remove.data.subset)
            new_memlet_array_outer = Memlet(
                data=array_closest_ancestor.data,
                volume=graph.in_edges(outer_entry)[0].data.volume,
                subset=subsets.Range.from_array(
                    sdfg.data(out_storage_node.data)))

            new_memlet_reduction = Memlet(
                data=graph.out_edges(inner_exit)[0].data.data,
                volume=1,
                subset=graph.out_edges(inner_exit)[0].data.subset)
            new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                          volume=1,
                                          subset=edge_to_remove.data.subset)
            new_memlet_out_outer = dcpy(new_memlet_array_outer)

            # remove old edges

            outer_edge_to_remove = None
            for edge in graph.out_edges(outer_exit):
                if edge.src == edge_to_remove.dst:
                    outer_edge_to_remove = edge

            graph.remove_edge_and_connectors(edge_to_remove)
            graph.remove_edge_and_connectors(outer_edge_to_remove)


            graph.add_edge(out_transient_node_inner if self.create_out_transient \
                                                    else inner_exit,
                           None,
                           new_tasklet,
                           "reduction_in",
                           new_memlet_reduction)

            graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                           new_memlet_array_inner)
            graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                           new_memlet_array_outer)
            graph.add_edge(new_tasklet, "out", outer_exit, None,
                           new_memlet_out_inner)
            graph.add_edge(outer_exit, None, out_storage_node, None,
                           new_memlet_out_outer)

            # fill map scope connectors
            graph.fill_scope_connectors()
            graph._clear_scopedict_cache()
            # wcr is already removed

        # FORNOW: choose default schedule and implementation
        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)
        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._new_reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deducted from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return
Exemplo n.º 16
0
def tile_wcrs(graph_or_subgraph: GraphViewType,
              validate_all: bool,
              prefer_partial_parallelism: bool = None) -> None:
    """
    Tiles parallel write-conflict resolution maps in an SDFG, state,
    or subgraphs thereof. Reduces the number of atomic operations by tiling
    and introducing transient arrays to accumulate atomics on.
    :param graph_or_subgraph: The SDFG/state/subgraph to optimize within.
    :param validate_all: If True, runs SDFG validation after every tiling.
    :param prefer_partial_parallelism: If set, prefers extracting non-conflicted
                                       map dimensions over tiling WCR map (may
                                       not perform well if parallel dimensions
                                       are small).
    :note: This function operates in-place.
    """
    # Avoid import loops
    from dace.codegen.targets import cpp
    from dace.frontend import operations
    from dace.transformation import dataflow, helpers as xfh

    # Determine on which nodes to run the operation
    graph = graph_or_subgraph
    if isinstance(graph_or_subgraph, gr.SubgraphView):
        graph = graph_or_subgraph.graph
    if isinstance(graph, SDFG):
        for state in graph_or_subgraph.nodes():
            tile_wcrs(state, validate_all)
        return
    if not isinstance(graph, SDFGState):
        raise TypeError(
            'Graph must be a state, an SDFG, or a subgraph of either')
    sdfg = graph.parent

    edges_to_consider: Set[Tuple[gr.MultiConnectorEdge[Memlet],
                                 nodes.MapEntry]] = set()
    for edge in graph_or_subgraph.edges():
        if edge.data.wcr is not None:
            if (isinstance(edge.src, (nodes.MapExit, nodes.NestedSDFG))
                    or isinstance(edge.dst, nodes.MapEntry)):
                # Do not consider intermediate edges
                continue
            reason = cpp.is_write_conflicted_with_reason(graph, edge)
            if reason is None or not isinstance(reason, nodes.MapEntry):
                # Do not consider edges that will not generate atomics or
                # atomics we cannot transform
                continue
            if reason not in graph_or_subgraph.nodes():
                # Skip if conflict exists outside of nested SDFG
                continue

            # Check if identity value can be inferred
            redtype = operations.detect_reduction_type(edge.data.wcr)
            dtype = sdfg.arrays[edge.data.data].dtype
            identity = dtypes.reduction_identity(dtype, redtype)
            if identity is None:  # Cannot infer identity value
                continue

            edges_to_consider.add((edge, reason))

    tile_size = config.Config.get('optimizer', 'autotile_size')
    debugprint = config.Config.get_bool('debugprint')
    if prefer_partial_parallelism is None:
        prefer_partial_parallelism = config.Config.get_bool(
            'optimizer', 'autotile_partial_parallelism')

    maps_to_consider: Set[nodes.MapEntry] = set(me
                                                for _, me in edges_to_consider)

    transformed: Set[nodes.MapEntry] = set()

    # Heuristic: If the map is only partially conflicted, extract
    # parallel dimensions instead of tiling
    if prefer_partial_parallelism:
        for mapentry in maps_to_consider:
            # Check the write-conflicts of all WCR edges in map
            conflicts: Set[str] = set()
            for edge, me in edges_to_consider:
                if me is not mapentry:
                    continue
                conflicts |= set(
                    cpp.write_conflicted_map_params(mapentry, edge))

            nonconflicted_dims = set(mapentry.params) - conflicts
            if nonconflicted_dims:
                dims = [
                    i for i, p in enumerate(mapentry.params)
                    if p in nonconflicted_dims
                ]
                if ((dt._prod(s for i, s in enumerate(mapentry.range.size())
                              if i in dims) < tile_size) == True):
                    # Map has a small range, extracting parallelism may not be
                    # beneficial
                    continue
                xfh.extract_map_dims(sdfg, mapentry, dims)
                transformed.add(mapentry)

    # Tile and accumulate other not-transformed maps
    for edge, mapentry in edges_to_consider:
        if mapentry in transformed:
            continue
        transformed.add(mapentry)

        # NOTE: The test "(x < y) == True" below is crafted for SymPy
        # to be "definitely True"
        if all((s < tile_size) == True for s in mapentry.map.range.size()):
            # If smaller than tile size, don't transform and instead
            # make map sequential
            if debugprint:
                print(f'Making map "{mapentry}" sequential due to being '
                      'smaller than tile size')
            mapentry.map.schedule = dtypes.ScheduleType.Sequential
            continue

        # MapTiling -> AccumulateTransient / AccumulateStream
        outer_mapentry = dataflow.MapTiling.apply_to(
            sdfg, dict(tile_sizes=(tile_size, )), map_entry=mapentry)

        # Transform all outgoing WCR and stream edges
        mapexit = graph.exit_node(mapentry)
        outer_mapexit = graph.exit_node(outer_mapentry)

        # Tuple of (transformation type, options, pattern)
        to_apply: Tuple[Union[dataflow.StreamTransient,
                              dataflow.AccumulateTransient], Dict[str, Any],
                        Dict[str, nodes.Node]] = None
        for e in graph.out_edges(mapexit):
            if isinstance(sdfg.arrays[e.data.data], dt.Stream):
                mpath = graph.memlet_path(e)
                tasklet = mpath[0].src
                if not isinstance(tasklet, nodes.Tasklet) or len(mpath) != 3:
                    # TODO(later): Implement StreamTransient independently of tasklet
                    continue

                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.StreamTransient, {},
                            dict(tasklet=tasklet,
                                 map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
            else:
                if (e.data.is_empty() or e.data.wcr is None
                        or e.data.wcr_nonatomic
                        or (e.data.dst_subset is not None
                            and e.data.dst_subset.num_elements() > 0
                            and e.data.dynamic)):
                    continue

                dtype = sdfg.arrays[e.data.data].dtype
                redtype = operations.detect_reduction_type(e.data.wcr)
                identity = dtypes.reduction_identity(dtype, redtype)
                if identity is None:  # Cannot infer identity value
                    continue
                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.AccumulateTransient,
                            dict(identity=identity, array=e.data.data),
                            dict(map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
        if to_apply is not None:
            xform, opts, pattern = to_apply
            xform.apply_to(sdfg, options=opts, **pattern)

    if debugprint and len(transformed) > 0:
        print(f'Optimized {len(transformed)} write-conflicted maps')
Exemplo n.º 17
0
    def vector_reduction_expr(self, edge, dtype, rhs):
        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        nc = not is_write_conflicted(self.dfg, edge)
        if not nc or not isinstance(edge.src.out_connectors[edge.src_conn],
                                    (dtypes.pointer, dtypes.vector)):
            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality
            dst_node = self.dfg.memlet_path(edge)[-1].dst
            if (isinstance(dst_node, nodes.AccessNode) and dst_node.desc(
                    self.sdfg).storage == dtypes.StorageType.SVE_Register):
                return

            wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg,
                                                          edge.data,
                                                          not nc,
                                                          None,
                                                          '@',
                                                          dtype=dtype)
            self.fill(wcr[:wcr.find('@')])
            self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type])
            self.write('(')
            self.write(self.pred_name)
            self.write(', ')
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write(')')
            self.write(wcr[wcr.find('@') + 1:])
            self.write(';')
        else:
            ######################
            # Horizontal non-atomic reduction

            stride = edge.data.get_stride(self.sdfg, self.map)

            # long long fix
            ptr_cast = ''
            src_type = edge.src.out_connectors[edge.src_conn]

            if src_type.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif src_type.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            store_args = '{}, {}'.format(
                self.pred_name,
                ptr_cast +
                cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
            )

            red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'
            if stride == 1:
                self.write(
                    f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
            else:
                store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
                self.write(
                    f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
Exemplo n.º 18
0
    def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                   state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                   edge: graph.MultiConnectorEdge,
                   function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
        scope = util.get_sve_scope(sdfg, dfg, src_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        out_conn = src_node.out_connectors[edge.src_conn]
        if out_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(
                f'Data type {out_conn.type} not supported')

        if edge.data.wcr is None:
            # No WCR required

            if isinstance(dst_node, dace.nodes.Tasklet):
                # Writeback into a tasklet is just writing into the shared register
                callsite_stream.write(f'{edge.data.data} = {edge.src_conn};')
                return

            if isinstance(out_conn, dtypes.vector):
                # If no WCR, we can directly store the vector (SVE register) in memory
                # Determine the stride of the store and use a scatter load if applicable

                stride = self.get_load_stride(sdfg, dfg, src_node, edge.data)

                ptr_cast = ''
                if out_conn.type == np.int64:
                    ptr_cast = '(int64_t*) '
                elif out_conn.type == np.uint64:
                    ptr_cast = '(uint64_t*) '

                store_args = '{}, {}'.format(
                    util.get_loop_predicate(sdfg, dfg, src_node),
                    ptr_cast +
                    cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer),
                )

                if stride == 1:
                    callsite_stream.write(
                        f'svst1({store_args}, {edge.src_conn});')
                else:
                    callsite_stream.write(
                        f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(out_conn).bytes * 8}(0, {sym2cpp(stride)}), {edge.src_conn});'
                    )
            else:
                raise NotImplementedError('Writeback into non-vector')
        else:
            # TODO: Check what are we WCR'ing in?

            # Since we have WCR, we must determine a suitable SVE reduce instruction
            # Check whether it is a known reduction that is possible in SVE
            reduction_type = detect_reduction_type(edge.data.wcr)
            if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
                raise util.NotSupportedError('Unsupported reduction in SVE')

            # If the memlet contains the innermost SVE param, we have a problem, because
            # SVE doesn't support WCR stores. This would require unrolling the loop.
            if scope.params[-1] in edge.data.free_symbols:
                raise util.NotSupportedError(
                    'SVE loop param used in WCR memlet')

            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality

            sve_reduction = '{}({}, {})'.format(
                util.REDUCTION_TYPE_TO_SVE[reduction_type],
                util.get_loop_predicate(sdfg, dfg, src_node), edge.src_conn)

            ptr_cast = ''
            if out_conn.type == np.int64:
                ptr_cast = '(long long*) '
            elif out_conn.type == np.uint64:
                ptr_cast = '(unsigned long long*) '

            wcr_expr = self.cpu_codegen.write_and_resolve_expr(
                sdfg,
                edge.data,
                edge.data.wcr_nonatomic,
                None,
                ptr_cast + sve_reduction,
                dtype=out_conn.vtype)

            callsite_stream.write(wcr_expr + ';')
Exemplo n.º 19
0
    def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry:
        me = self.mapentry

        # Add new map within map
        mx = graph.exit_node(me)
        new_me, new_mx = graph.add_map('warp_tile',
                                       dict(__tid=f'0:{self.warp_size}'),
                                       dtypes.ScheduleType.GPU_ThreadBlock)
        __tid = symbolic.pystr_to_symbolic('__tid')
        for e in graph.out_edges(me):
            xfh.reconnect_edge_through_map(graph, e, new_me, True)
        for e in graph.in_edges(mx):
            xfh.reconnect_edge_through_map(graph, e, new_mx, False)

        # Stride and offset all internal maps
        maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True)
        for nstate, nmap in maps_to_stride:
            nsdfg = nstate.parent
            nsdfg_node = nsdfg.parent_nsdfg_node

            # Map cannot be partitioned across a warp
            if (nmap.range.size()[-1] < self.warp_size) == True:
                continue

            if nsdfg is not sdfg and nsdfg_node is not None:
                nsdfg_node.symbol_mapping['__tid'] = __tid
                if '__tid' not in nsdfg.symbols:
                    nsdfg.add_symbol('__tid', dtypes.int32)
            nmap.range[-1] = (nmap.range[-1][0], nmap.range[-1][1] - __tid,
                              nmap.range[-1][2] * self.warp_size)
            subgraph = nstate.scope_subgraph(nmap)
            subgraph.replace(nmap.params[-1], f'{nmap.params[-1]} + __tid')
            inner_map_exit = nstate.exit_node(nmap)
            # If requested, replicate maps with multiple dependent maps
            if self.replicate_maps:
                destinations = [
                    nstate.memlet_path(edge)[-1].dst
                    for edge in nstate.out_edges(inner_map_exit)
                ]

                for dst in destinations:
                    # Transformation will not replicate map with more than one
                    # output
                    if len(destinations) != 1:
                        break
                    if not isinstance(dst, nodes.AccessNode):
                        continue  # Not leading to access node
                    if not xfh.contained_in(nstate, dst, new_me):
                        continue  # Memlet path goes out of map
                    if not nsdfg.arrays[dst.data].transient:
                        continue  # Cannot modify non-transients
                    for edge in nstate.out_edges(dst)[1:]:
                        rep_subgraph = xfh.replicate_scope(
                            nsdfg, nstate, subgraph)
                        rep_edge = nstate.out_edges(
                            rep_subgraph.sink_nodes()[0])[0]
                        # Add copy of data
                        newdesc = copy.deepcopy(sdfg.arrays[dst.data])
                        newname = nsdfg.add_datadesc(dst.data,
                                                     newdesc,
                                                     find_new_name=True)
                        newaccess = nstate.add_access(newname)
                        # Redirect edges
                        xfh.redirect_edge(nstate,
                                          rep_edge,
                                          new_dst=newaccess,
                                          new_data=newname)
                        xfh.redirect_edge(nstate,
                                          edge,
                                          new_src=newaccess,
                                          new_data=newname)

            # If has WCR, add warp-collaborative reduction on outputs
            for out_edge in nstate.out_edges(inner_map_exit):
                dst = nstate.memlet_path(out_edge)[-1].dst
                if not xfh.contained_in(nstate, dst, new_me):
                    # Skip edges going out of map
                    continue
                if dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global:
                    # Skip shared memory
                    continue
                if out_edge.data.wcr is not None:
                    ctype = nsdfg.arrays[out_edge.data.data].dtype.ctype
                    redtype = detect_reduction_type(out_edge.data.wcr)
                    if redtype == dtypes.ReductionType.Custom:
                        raise NotImplementedError
                    credtype = ('dace::ReductionType::' +
                                str(redtype)[str(redtype).find('.') + 1:])

                    # One element: tasklet
                    if out_edge.data.subset.num_elements() == 1:
                        # Add local access between thread-local and warp reduction
                        name = nsdfg._find_new_name(out_edge.data.data)
                        nsdfg.add_scalar(
                            name,
                            nsdfg.arrays[out_edge.data.data].dtype,
                            transient=True)

                        # Initialize thread-local to global value
                        read = nstate.add_read(out_edge.data.data)
                        write = nstate.add_write(name)
                        edge = nstate.add_nedge(read, write,
                                                copy.deepcopy(out_edge.data))
                        edge.data.wcr = None
                        xfh.state_fission(nsdfg,
                                          SubgraphView(nstate, [read, write]))

                        newnode = nstate.add_access(name)
                        nstate.remove_edge(out_edge)
                        edge = nstate.add_edge(out_edge.src, out_edge.src_conn,
                                               newnode, None,
                                               copy.deepcopy(out_edge.data))
                        for e in nstate.memlet_path(edge):
                            e.data.data = name
                            e.data.subset = subsets.Range([(0, 0, 1)])

                        wrt = nstate.add_tasklet(
                            'warpreduce', {'__a'}, {'__out'},
                            f'__out = dace::warpReduce<{credtype}, {ctype}>::reduce(__a);',
                            dtypes.Language.CPP)
                        nstate.add_edge(newnode, None, wrt, '__a',
                                        Memlet(name))
                        out_edge.data.wcr = None
                        nstate.add_edge(wrt, '__out', out_edge.dst, None,
                                        out_edge.data)
                    else:  # More than one element: mapped tasklet
                        # Could be a parallel summation
                        # TODO(later): Check if reduction
                        continue
            # End of WCR to warp reduction

        # Make nested SDFG out of new scope
        xfh.nest_state_subgraph(sdfg, graph,
                                graph.scope_subgraph(new_me, False, False))

        return new_me
Exemplo n.º 20
0
    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        # get out storage node, might be hidden behind view node
        out_data = graph.out_edges(reduce_node)[0].data
        out_storage_node = reduce_node
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

        if isinstance(sdfg.data(out_storage_node.data), View):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst
            while not isinstance(out_storage_node, nodes.AccessNode):
                out_storage_node = graph.out_edges(out_storage_node)[0].dst

        # get other useful quantities from the original reduce node
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        implementation = reduce_node.implementation

        # remove the reduce identity, will get reassigned after expansion
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which the reduction is performed: BFS

        if self.create_out_transient:
            queue = [nsdfg]
            enqueued = set()
            array_closest_ancestor = None

            while len(queue) > 0:
                current = queue.pop()
                if isinstance(current, nodes.AccessNode):
                    if current.data == out_storage_node.data:
                        # it suffices to find the first node
                        # no matter what access (ReadWrite or Read)
                        array_closest_ancestor = current
                        break
                for in_edge in graph.in_edges(current):
                    if in_edge.src not in enqueued:
                        queue.append(in_edge.src)
                        enqueued.add(in_edge.src)

            if self.debug and array_closest_ancestor:
                print(
                    f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
                )
            elif self.debug:
                print("ReduceExpansion::No closest ancestor found")

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

            # remove WCRs from all edges where possible if there is no
            # prior occurrence
            if array_closest_ancestor is None:
                nstate.out_edges(outer_exit)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
        else:

            # remove WCR from outer exit
            nstate.out_edges(outer_exit)[0].data.wcr = None

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)

        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        # replace inner map with new reduction node
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)

        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            if self.debug:
                print(
                    "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
                )
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deducted from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return
Exemplo n.º 21
0
    def apply(self, sdfg: SDFG) -> None:
        graph: SDFGState = sdfg.nodes()[self.state_id]

        inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
            GPUMultiTransformMap._map_entry]]

        number_of_gpus = self.number_of_gpus
        ngpus = Config.get("compiler", "cuda", "max_number_gpus")
        if (number_of_gpus == None):
            number_of_gpus = ngpus
        if number_of_gpus > ngpus:
            raise ValueError(
                'Requesting more gpus than specified in the dace config')

        # Avoiding import loops
        from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                                  OutLocalStorage,
                                                  AccumulateTransient)

        # The user has responsibility for the implementation of a Library node.
        scope_subgraph = graph.scope_subgraph(inner_map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                warnings.warn(
                    'Node %s is a library node, make sure to manually set the '
                    'implementation to a GPU compliant specialization.' % node)

        # Tile map into number_of_gpus tiles
        outer_map: nodes.Map = StripMining.apply_to(
            sdfg,
            dict(dim_idx=-1,
                 new_dim_prefix=self.new_dim_prefix,
                 tile_size=number_of_gpus,
                 tiling_type=dtypes.TilingType.NumberOfTiles),
            _map_entry=inner_map_entry)

        outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
        inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
        outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

        # Change map schedules
        inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
        outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

        symbolic_gpu_id = outer_map.params[0]

        # Add the parameter of the outer map
        for node in graph.successors(inner_map_entry):
            if isinstance(node, nodes.NestedSDFG):
                map_syms = inner_map_entry.range.free_symbols
                for sym in map_syms:
                    symname = str(sym)
                    if symname not in node.symbol_mapping.keys():
                        node.symbol_mapping[symname] = sym
                        node.sdfg.symbols[symname] = graph.symbols_defined_at(
                            node)[symname]

        # Add transient Data leading to the inner map
        prefix = self.new_transient_prefix
        for node in graph.predecessors(outer_map_entry):
            # Only AccessNodes are relevant
            if (isinstance(node, nodes.AccessNode)
                    and not (self.skip_scalar
                             and isinstance(node.desc(sdfg), Scalar))):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                in_data_node = InLocalStorage.apply_to(sdfg,
                                                       dict(array=node.data,
                                                            prefix=prefix),
                                                       verify=False,
                                                       save=False,
                                                       node_a=outer_map_entry,
                                                       node_b=inner_map_entry)
                in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        wcr_data: Dict[str, Any] = {}
        # Add transient Data leading to the outer map
        for edge in graph.in_edges(outer_map_exit):
            node = graph.memlet_path(edge)[-1].dst
            if isinstance(node, nodes.AccessNode):
                data_name = node.data
                # Transients with write-conflict resolution need to be
                # collected first as AccumulateTransient creates a nestedSDFG
                if edge.data.wcr is not None:
                    dtype = sdfg.arrays[data_name].dtype
                    redtype = operations.detect_reduction_type(edge.data.wcr)
                    # Custom reduction can not have an accumulate transient,
                    # as the accumulation from the transient to the outer
                    # storage is not defined.
                    if redtype == dtypes.ReductionType.Custom:
                        warnings.warn(
                            'Using custom reductions in a GPUMultitransformed '
                            'Map only works for a small data volume. For large '
                            'volume there is no guarantee.')
                        continue
                    identity = dtypes.reduction_identity(dtype, redtype)
                    wcr_data[data_name] = identity
                elif (not isinstance(node.desc(sdfg), Scalar)
                      or not self.skip_scalar):
                    if self.use_p2p and node.desc(
                            sdfg).storage is dtypes.StorageType.GPU_Global:
                        continue
                    # Transients without write-conflict resolution
                    if prefix + '_' + data_name in sdfg.arrays:
                        create_array = False
                    else:
                        create_array = True
                    out_data_node = OutLocalStorage.apply_to(
                        sdfg,
                        dict(array=data_name,
                             prefix=prefix,
                             create_array=create_array),
                        verify=False,
                        save=False,
                        node_a=inner_map_exit,
                        node_b=outer_map_exit)
                    out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    out_data_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global

        # Add Transients for write-conflict resolution
        if len(wcr_data) != 0:
            nsdfg = AccumulateTransient.apply_to(
                sdfg,
                options=dict(array_identity_dict=wcr_data, prefix=prefix),
                map_exit=inner_map_exit,
                outer_map_exit=outer_map_exit)
            nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
            nsdfg.location['gpu'] = symbolic_gpu_id
            for transient_node in graph.successors(nsdfg):
                if isinstance(transient_node, nodes.AccessNode):
                    transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    transient_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global
                    nsdfg.sdfg.arrays[
                        transient_node.label].location['gpu'] = symbolic_gpu_id
                    nsdfg.sdfg.arrays[
                        transient_node.
                        label].storage = dtypes.StorageType.GPU_Global
            infer_types.set_default_schedule_storage_types_and_location(
                nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
                symbolic_gpu_id)

        # Remove the parameter of the outer_map from the sdfg symbols,
        # as it got added as a symbol in StripMining.
        if outer_map.params[0] in sdfg.free_symbols:
            sdfg.remove_symbol(outer_map.params[0])