Example #1
import dace
from dace import symbolic
from dace.memlet import Memlet
from dace.sdfg import SDFG


def create_batch_gemm_sdfg(dtype, strides):
    # Build an SDFG that forwards (batched) matrix multiplication to a BLAS
    # MatMul library node, using the given element type and stride layout.
    sdfg = SDFG('einsum')
    state = sdfg.add_state()
    M, K, N = (symbolic.symbol(s) for s in ['M', 'K', 'N'])
    BATCH, sAM, sAK, sAB, sBK, sBN, sBB, sCM, sCN, sCB = (
        symbolic.symbol(s) if symbolic.issymbolic(strides[s]) else strides[s]
        for s in [
            'BATCH', 'sAM', 'sAK', 'sAB', 'sBK', 'sBN', 'sBB', 'sCM', 'sCN',
            'sCB'
        ])

    batched = strides['BATCH'] != 1

    _, xarr = sdfg.add_array(
        'X',
        dtype=dtype,
        shape=[BATCH, M, K] if batched else [M, K],
        strides=[sAB, sAM, sAK] if batched else [sAM, sAK])
    _, yarr = sdfg.add_array(
        'Y',
        dtype=dtype,
        shape=[BATCH, K, N] if batched else [K, N],
        strides=[sBB, sBK, sBN] if batched else [sBK, sBN])
    _, zarr = sdfg.add_array(
        'Z',
        dtype=dtype,
        shape=[BATCH, M, N] if batched else [M, N],
        strides=[sCB, sCM, sCN] if batched else [sCM, sCN])

    gX = state.add_read('X')
    gY = state.add_read('Y')
    gZ = state.add_write('Z')

    import dace.libraries.blas as blas  # Avoid import loop

    libnode = blas.MatMul('einsum_gemm')
    state.add_node(libnode)
    state.add_edge(gX, None, libnode, '_a', Memlet.from_array(gX.data, xarr))
    state.add_edge(gY, None, libnode, '_b', Memlet.from_array(gY.data, yarr))
    state.add_edge(libnode, '_c', gZ, None, Memlet.from_array(gZ.data, zarr))

    return sdfg
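
A usage sketch for the builder above; the row-major symbolic strides and the validate call are illustrative, and any symbolic stride value is re-bound by the function to a symbol named after its key:

import dace
from dace import symbolic

M, K, N, BATCH = (symbolic.symbol(s) for s in ('M', 'K', 'N', 'BATCH'))
strides = {
    'BATCH': BATCH,                    # symbolic batch size => batched case
    'sAM': K, 'sAK': 1, 'sAB': M * K,  # X: row-major [BATCH, M, K]
    'sBK': N, 'sBN': 1, 'sBB': K * N,  # Y: row-major [BATCH, K, N]
    'sCM': N, 'sCN': 1, 'sCB': M * N,  # Z: row-major [BATCH, M, N]
}
sdfg = create_batch_gemm_sdfg(dace.float64, strides)
sdfg.validate()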
Example #2
import numpy as np

import dace as dp
from dace.memlet import Memlet
from dace.sdfg import SDFG


def test():
    print('Dynamic SDFG test with vectorization and min')
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)

    input = np.random.rand(N.get()).astype(np.float32)
    input2 = np.random.rand(N.get()).astype(np.float32)
    output = dp.ndarray([N], dp.float32)
    output[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('myvmin')
    mysdfg.add_array('A', [N], dp.float32)
    mysdfg.add_array('B', [N], dp.float32)
    mysdfg.add_array('C', [N], dp.float32)
    state = mysdfg.add_state()
    A = state.add_access('A')
    B = state.add_access('B')
    C = state.add_access('C')

    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
        'mytasklet', dict(i='0:N:2'),
        dict(a=Memlet.simple(A, 'i'), b=Memlet.simple(B, 'i')),
        'c = min(a, b)', dict(c=Memlet.simple(C, 'i')))

    # Manually vectorize tasklet
    tasklet.in_connectors['a'] = dp.vector(dp.float32, 2)
    tasklet.in_connectors['b'] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    mysdfg(A=input, B=input2, C=output, N=N)

    diff = np.linalg.norm(np.minimum(input, input2) - output) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    assert diff <= 1e-5
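
For comparison, the same element-wise minimum can be written in DaCe's numpy-style frontend and lowered automatically; a minimal sketch, assuming a DaCe version with numpy ufunc support:

import numpy as np
import dace

N = dace.symbol('N')


@dace.program
def vmin(A: dace.float32[N], B: dace.float32[N], C: dace.float32[N]):
    C[:] = np.minimum(A, B)  # lowered to an element-wise map over 0:N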
Example #3
import dace
import numpy as np
from dace import dtypes, memlet, subsets
from dace.sdfg import SDFG
from dace.transformation.dataflow import MapUnroll
from dace.transformation.interstate import InlineSDFG


def test_3_interface_to_2_banks():
    sdfg = SDFG("test_3_interface_to_2_banks")
    state = sdfg.add_state()

    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = "HBM"
    desc_a.location["bank"] = "0:2"
    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")

    t1 = state.add_tasklet("r1", {"_x1", "_x2"}, {"_y1"},
                           "_y1 = _x1 + _x2")

    m1_in, m1_out = state.add_map("m", {"k": "0:2"},
                                  dtypes.ScheduleType.Unrolled)

    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[0, 0]"),
                          dst_conn="_x1")
    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[1, 0]"),
                          dst_conn="_x2")
    state.add_memlet_path(t1,
                          m1_out,
                          acc_write1,
                          memlet=memlet.Memlet("a[0, 1]"),
                          src_conn="_y1")

    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1
    for node in sdfg.states()[0].nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            sdfg.states()[0].out_edges(
                node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    bank_assignment = sdfg.generate_code()[3].clean_code
    assert bank_assignment.count("sp") == 6
    assert bank_assignment.count("HBM[0]") == 3
    assert bank_assignment.count("HBM[1]") == 3

    a = np.zeros([2, 2], np.int32)
    a[0, 0] = 2
    a[1, 0] = 3
    sdfg(a=a)
    assert a[0, 1] == 5

    return sdfg
Example #4
# Uses the same imports as Example #3, plus:
from dace.config import set_temporary


def four_interface_to_2_banks(mem_type, decouple_interfaces):
    sdfg = SDFG("test_4_interface_to_2_banks_" + mem_type)
    state = sdfg.add_state()

    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = mem_type
    desc_a.location["bank"] = "0:2"
    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")

    t1 = state.add_tasklet("r1", {"_x1", "_x2"}, {"_y1"}, "_y1 = _x1 + _x2")

    m1_in, m1_out = state.add_map("m", {"k": "0:2"}, dtypes.ScheduleType.Unrolled)

    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[0, 0]"), dst_conn="_x1")
    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[1, 0]"), dst_conn="_x2")
    state.add_memlet_path(t1, m1_out, acc_write1, memlet=memlet.Memlet("a[0, 1]"), src_conn="_y1")

    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1
    for node in sdfg.states()[0].nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            sdfg.states()[0].out_edges(node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=decouple_interfaces):
        bank_assignment = sdfg.generate_code()[3].clean_code
        # If array interfaces are not decoupled, fewer memory interfaces are used
        assert bank_assignment.count("sp") == (6 if decouple_interfaces else 4)
        assert bank_assignment.count(mem_type + "[0]") == (3 if decouple_interfaces else 2)
        assert bank_assignment.count(mem_type + "[1]") == (3 if decouple_interfaces else 2)

    a = np.zeros([2, 2], np.int32)
    a[0, 0] = 2
    a[1, 0] = 3
    sdfg(a=a)
    assert a[0, 1] == 5

    return sdfg
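
Hypothetical pytest drivers for the helper above (function names are illustrative; the parameter values match the asserts inside the helper):

import pytest


@pytest.mark.parametrize("decouple_interfaces", [True, False])
def test_four_interface_to_2_banks_hbm(decouple_interfaces):
    four_interface_to_2_banks("HBM", decouple_interfaces)


@pytest.mark.parametrize("decouple_interfaces", [True, False])
def test_four_interface_to_2_banks_ddr(decouple_interfaces):
    four_interface_to_2_banks("DDR", decouple_interfaces)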
Example #5
class ONNXModel:
    """Loads an ONNX model into an SDFG."""
    def __init__(self, name, model: onnx.ModelProto, cuda=False):
        """
        Constructs a new ONNXImporter.
        :param name: the name for the SDFG.
        :param model: the model to import.
        :param cuda: if `True`, weights will be passed as cuda arrays.
        """

        graph: onnx.GraphProto = model.graph

        self.sdfg = SDFG(name)
        self.cuda = cuda
        self.state = self.sdfg.add_state()

        # Add all values to the SDFG, check for unsupported ops
        ##########################################

        self.value_infos = {}

        self.inputs = []
        self.outputs = []

        for value, is_input in chain(zip(graph.input, repeat(True)),
                                     zip(graph.output, repeat(False))):
            if not value.HasField("name"):
                raise ValueError("Got input or output without name")
            if is_input:
                self.inputs.append(value.name)
            else:
                self.outputs.append(value.name)

            self.value_infos[value.name] = value
            self._add_value_info(value)

        for value in graph.value_info:
            if not value.HasField("name"):
                raise ValueError("Got value_info without name")
            if value.name not in self.value_infos:
                self.value_infos[value.name] = value

        # add weights
        self.weights = {}
        for init in graph.initializer:
            self._add_constant_tensor(init)

        access_nodes = {}
        self._idx_to_node = []
        for i, node in enumerate(graph.node):
            if not has_onnx_node(node.op_type):
                raise ValueError("Unsupported ONNX operator: '{}'".format(
                    node.op_type))

            # extract the op attributes

            op_attributes = {
                attribute_proto.name: convert_attribute_proto(attribute_proto)
                for attribute_proto in node.attribute
            }

            if node.HasField("name"):
                node_name = clean_onnx_name(node.name)
            else:
                node_name = node.op_type + "_" + str(i)

            # construct the dace node
            op_node = get_onnx_node(node.op_type)(node_name, **op_attributes)
            self.state.add_node(op_node)
            self._idx_to_node.append(op_node)

            for param_idx, (name, is_input) in chain(
                    enumerate(zip(node.input, repeat(True))),
                    enumerate(zip(node.output, repeat(False)))):
                if clean_onnx_name(name) not in self.sdfg.arrays:
                    if name not in self.value_infos:
                        raise ValueError(
                            "Could not find array with name '{}'".format(name))
                    self._add_value_info(self.value_infos[name])

                # get the access node
                if name in access_nodes:
                    access = access_nodes[name]
                    self._update_access_type(access, is_input)
                else:
                    access = nd.AccessNode(
                        clean_onnx_name(name), AccessType.ReadOnly
                        if is_input else AccessType.WriteOnly)
                    self.state.add_node(access)
                    access_nodes[name] = access

                # get the connector name
                params = op_node.schema.inputs if is_input else op_node.schema.outputs
                params_len = len(params)
                if param_idx >= params_len:
                    # this is a variadic parameter, so the last parameter in the schema must be variadic.
                    if params[-1].param_type != ONNXParameterType.Variadic:
                        raise ValueError(
                            "Expected the last {i_or_o} parameter to be variadic,"
                            " since the {i_or_o} with idx {param_idx} has more parameters than the schema ({params_len})"
                            .format(i_or_o="input" if is_input else "output",
                                    param_idx=param_idx,
                                    params_len=params_len))
                    conn_name = params[-1].name + "__" + str(param_idx -
                                                             params_len + 1)
                elif params[param_idx].param_type == ONNXParameterType.Variadic:
                    # a variadic parameter within the schema range must be the
                    # first instance of that variadic parameter
                    conn_name = params[param_idx].name + "__0"
                else:
                    conn_name = params[param_idx].name

                data_desc = self.sdfg.arrays[clean_onnx_name(name)]

                # add the connector if required, and add an edge
                if is_input:
                    if conn_name not in op_node.in_connectors:
                        op_node.add_in_connector(conn_name)
                    self.state.add_edge(
                        access, None, op_node, conn_name,
                        dace.Memlet.from_array(clean_onnx_name(name),
                                               data_desc))
                else:
                    if conn_name not in op_node.out_connectors:
                        op_node.add_out_connector(conn_name)

                    self.state.add_edge(
                        op_node, conn_name, access, None,
                        dace.Memlet.from_array(clean_onnx_name(name),
                                               data_desc))

        if self.cuda:
            self.sdfg.apply_strict_transformations()
            self.sdfg.apply_gpu_transformations()
            self.sdfg.apply_strict_transformations()

            # set all gpu transients to be persistent
            for _, _, arr in self.sdfg.arrays_recursive():
                if arr.transient and arr.storage == StorageType.GPU_Global:
                    arr.lifetime = AllocationLifetime.Persistent

    @staticmethod
    def _update_access_type(node: dace.nodes.AccessNode, is_input: bool):
        if node.access == AccessType.ReadOnly and not is_input:
            node.access = AccessType.ReadWrite
        elif node.access == AccessType.WriteOnly and is_input:
            node.access = AccessType.ReadWrite

    def _add_constant_tensor(self, tensor: onnx.TensorProto):
        if not tensor.HasField("name"):
            raise ValueError("Got tensor without name")

        if not tensor.HasField("data_type"):
            raise ValueError("Initializer tensor '{}' has no type".format(
                tensor.name))

        name = clean_onnx_name(tensor.name)

        dtype = onnx_tensor_type_to_typeclass(tensor.data_type)

        if len(tensor.dims) == 0:
            # this is a scalar
            self.sdfg.add_scalar(name, dtype)
        else:
            dims = [d for d in tensor.dims]
            if name not in self.sdfg.arrays:
                self.sdfg.add_array(name, dims, dtype)
            else:
                existing_arr = self.sdfg.arrays[name]
                if existing_arr.dtype != dtype:
                    raise ValueError(
                        "Invalid ONNX model; found two values with name '{}', but different dtypes ({} and {})"
                        .format(name, existing_arr.dtype, dtype))
                if tuple(existing_arr.shape) != tuple(dims):
                    raise ValueError(
                        "Invalid ONNX model; found two values with name '{}', but different dimensions ({} and {})"
                        .format(name, existing_arr.shape, dims))

        self.weights[tensor.name] = numpy_helper.to_array(tensor)

    def _add_value_info(self, value_info: onnx.ValueInfoProto):
        if not value_info.HasField("name"):
            raise ValueError("Got value without name")

        name = value_info.name

        if not _nested_HasField(value_info, "type.tensor_type.shape"):
            raise ValueError(
                "Value '{}' does not have a shape in this graph."
                " Please run shape inference before importing.".format(name))

        tensor_type = value_info.type.tensor_type

        if not tensor_type.HasField("elem_type"):
            raise ValueError(
                "Value '{}' does not have a type in this graph."
                " Please run type inference before importing.".format(name))

        shape = []
        for d in tensor_type.shape.dim:
            if d.HasField("dim_value"):
                shape.append(d.dim_value)
            elif d.HasField("dim_param"):
                parsed = pystr_to_symbolic(d.dim_param)

                for sym in parsed.free_symbols:
                    if clean_onnx_name(str(sym)) not in self.sdfg.symbols:
                        self.sdfg.add_symbol(clean_onnx_name(str(sym)),
                                             stype=int)
                    parsed = parsed.subs(
                        sym, dace.symbol(clean_onnx_name(str(sym))))

                shape.append(parsed)
            else:
                raise ValueError(
                    "Value '{}' does not have a shape in this graph."
                    " Please run shape inference before importing.".format(
                        name))
        transient = name not in self.inputs and name not in self.outputs
        if len(shape) == 0:
            self.sdfg.add_scalar(clean_onnx_name(name),
                                 dtype=onnx_tensor_type_to_typeclass(
                                     tensor_type.elem_type),
                                 transient=transient)
        else:
            self.sdfg.add_array(clean_onnx_name(name),
                                shape=shape,
                                dtype=onnx_tensor_type_to_typeclass(
                                    tensor_type.elem_type),
                                transient=transient)

    def __call__(self, *args, **inputs):
        sdfg = deepcopy(self.sdfg)

        # convert the positional args to kwargs
        if len(args) > len(self.inputs):
            raise ValueError("Expected {} arguments, got {}".format(
                len(self.inputs), len(args)))

        inputs.update(dict(zip(self.inputs, args)))

        # check that there are no missing inputs
        if len(set(self.inputs).difference(inputs)) != 0:
            raise ValueError("Missing inputs {}".format(", ".join(
                set(self.inputs).difference(inputs))))

        # check that there are no unknown inputs
        # NOTE symbols can only be passed as kwargs
        unknown = set(inputs).difference(self.inputs).difference(sdfg.free_symbols)
        if len(unknown) != 0:
            raise ValueError("Unknown inputs {}".format(", ".join(unknown)))

        clean_inputs = {}
        for input, arr in inputs.items():
            if input in sdfg.free_symbols:
                clean_inputs[input] = arr
            else:
                clean_inputs[clean_onnx_name(input)] = arr

        # add the weights
        params = {}
        for name, arr in self.weights.items():
            if len(arr.shape) == 0:
                params[clean_onnx_name(name)] = arr[()]
            else:
                if self.cuda:
                    clean_name = clean_onnx_name(name)
                    sdfg.arrays[clean_name].storage = StorageType.GPU_Global
                    params[clean_name] = numba.cuda.to_device(arr)
                else:
                    params[clean_onnx_name(name)] = arr.copy()

        inferred_symbols = infer_symbols_from_shapes(sdfg, {
            **clean_inputs,
            **params
        })
        # TODO @orausch if this is removed the SDFG complains
        # TypeError: Type mismatch for argument ONNX_unk__493: expected scalar type, got <class 'sympy.core.numbers.Integer'>
        # fix this better
        inferred_symbols = {k: int(v) for k, v in inferred_symbols.items()}

        def eval_dim(dim):
            for sym in dim.free_symbols:
                dim = dim.subs(sym, inferred_symbols[sym.name])
            return dim

        outputs = OrderedDict()
        # create numpy arrays for the outputs
        for output in self.outputs:
            clean_name = clean_onnx_name(output)
            arr = sdfg.arrays[clean_name]

            # TODO @orausch add error handling for evalf
            shape = [
                eval_dim(d) if type(d) is dace.symbol else d for d in arr.shape
            ]
            outputs[clean_name] = np.empty(shape,
                                           dtype=arr.dtype.as_numpy_dtype())

        sdfg.expand_library_nodes()
        #sdfg.apply_strict_transformations()

        sdfg(**clean_inputs, **params, **outputs, **inferred_symbols)

        if len(outputs) == 1:
            return next(iter(outputs.values()))

        return tuple(outputs.values())
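
A usage sketch for the importer above; the file name and input shape are illustrative, and shape inference is run first because _add_value_info requires shapes:

import numpy as np
import onnx

model = onnx.load("model.onnx")                   # hypothetical model file
model = onnx.shape_inference.infer_shapes(model)  # shapes must be present
dace_model = ONNXModel("imported_model", model)

# One positional argument per graph input, in graph order
x = np.random.rand(1, 3, 224, 224).astype(np.float32)
result = dace_model(x)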
Example #6
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)

        # If identity is defined, add an initialization state
        if node.identity is not None:
            init_state = nsdfg.add_state()
            nstate = nsdfg.add_state()
            nsdfg.add_edge(init_state, nstate, dace.InterstateEdge())

            # Add initialization as a map
            init_state.add_mapped_tasklet(
                'reduce_init', {
                    '_o%d' % i: '0:%s' % symstr(d)
                    for i, d in enumerate(outedge.data.subset.size())
                }, {},
                'out = %s' % node.identity, {
                    'out':
                    dace.Memlet.simple(
                        '_out', ','.join(
                            ['_o%d' % i for i in range(output_dims)]))
                },
                external_edges=True)
        else:
            nstate = nsdfg.add_state()
        # END OF INIT

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append('_i%d' % ictr)
                    ictr += 1
                else:
                    input_subset.append('_o%d' % octr)
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    '_o%d' % i: '0:%s' % symstr(sz)
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm = dace.Memlet.simple(
                '_out',
                ','.join(['_o%d' % i for i in range(output_dims)]),
                wcr_str=node.wcr)
            inmm = dace.Memlet.simple('_in', ','.join(input_subset))
        else:
            ome, omx = None, None
            outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr)
            inmm = dace.Memlet.simple(
                '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

        # Add inner map, which corresponds to the range to reduce, containing
        # an identity tasklet
        ime, imx = nstate.add_map(
            'reduce_values', {
                '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
                for i, axis in enumerate(sorted(axes))
            })

        # Add identity tasklet for reduction
        t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

        # Connect everything
        r = nstate.add_read('_in')
        w = nstate.add_write('_out')
        if ome:
            nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
        else:
            nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return nsdfg
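
For context, an expansion like this is not called directly; it runs when a Reduce library node is expanded. A minimal sketch (the program is illustrative; np.sum lowers to a Reduce node):

import numpy as np
import dace


@dace.program
def sum_rows(A: dace.float64[20, 30], out: dace.float64[20]):
    out[:] = np.sum(A, axis=1)  # becomes a Reduce library node


sdfg = sum_rows.to_sdfg()
sdfg.expand_library_nodes()  # triggers the registered expansion, as above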
Example #7
    def expansion(node: 'Reduce',
                  state: SDFGState,
                  sdfg: SDFG,
                  partial_width=16):
        '''
        :param node: the node to expand.
        :param state: the state in which the node resides.
        :param sdfg: the SDFG in which the node resides.
        :param partial_width: width of the inner reduction buffer. Must be
                              larger than the latency of the reduction
                              operation on the given data type.
        '''
        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)
        if input_data.dtype.veclen > 1:
            raise NotImplementedError(
                'Vectorization currently not implemented for FPGA expansion of Reduce.'
            )

        nstate = nsdfg.add_state()

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            all_axis = False
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append(f'_i{ictr}')
                    ictr += 1
                else:
                    input_subset.append(f'_o{octr}')
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    f'_o{i}': f'0:{symstr(sz)}'
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
            outm = dace.Memlet(f'_out[{outm_idx}]')
            inm_idx = ','.join(input_subset)
            inmm = dace.Memlet(f'_in[{inm_idx}]')
        else:
            all_axis = True
            ome, omx = None, None
            outm = dace.Memlet('_out[0]')
            inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
            inmm = dace.Memlet(f'_in[{inm_idx}]')

        # Add read/write access nodes for the outer arrays
        r = nstate.add_read('_in')
        w = nstate.add_write('_out')

        # TODO support vectorization
        buffer_name = 'partial_results'
        nsdfg.add_array(buffer_name, (partial_width, ),
                        input_data.dtype,
                        transient=True,
                        storage=dtypes.StorageType.FPGA_Local)
        buffer = nstate.add_access(buffer_name)
        buffer_write = nstate.add_write(buffer_name)

        # Explicitly initialize the partial results, since the inner map may run for fewer iterations than partial_width
        init_me, init_mx = nstate.add_map(
            'partial_results_init', {'i': f'0:{partial_width}'},
            schedule=dtypes.ScheduleType.FPGA_Device,
            unroll=True)
        init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                          f'pr_out = {node.identity}')
        nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
        nstate.add_memlet_path(init_tasklet,
                               init_mx,
                               buffer,
                               src_conn='pr_out',
                               memlet=dace.Memlet(f'{buffer_name}[i]'))

        if not all_axis:
            nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

        ime, imx = nstate.add_map(
            'reduce_values', {
                f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
                for i, axis in enumerate(sorted(axes))
            })

        # Accumulate over partial results
        redtype = detect_reduction_type(node.wcr)
        if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
            raise ValueError('Reduction type not supported for "%s"' % node.wcr)
        reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[redtype]

        # Generate a flattened index over the inner map; used to index into the partial-results buffer
        ranges_size = ime.range.size()
        inner_index = '+'.join(
            [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
        inner_op = ' + ' if len(axes) > 1 else ''
        inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'
        partial_reduce_tasklet = nstate.add_tasklet(
            'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

        if not all_axis:
            # Connect input and partial sums
            nstate.add_memlet_path(r,
                                   ome,
                                   ime,
                                   partial_reduce_tasklet,
                                   dst_conn='data_in',
                                   memlet=inmm)
        else:
            nstate.add_memlet_path(r,
                                   ime,
                                   partial_reduce_tasklet,
                                   dst_conn='data_in',
                                   memlet=inmm)
        nstate.add_memlet_path(
            buffer,
            ime,
            partial_reduce_tasklet,
            dst_conn='buffer_in',
            memlet=dace.Memlet(
                f'{buffer_name}[({inner_index})%{partial_width}]'))
        nstate.add_memlet_path(
            partial_reduce_tasklet,
            imx,
            buffer_write,
            src_conn='buffer_out',
            memlet=dace.Memlet(
                f'{buffer_name}[({inner_index})%{partial_width}]'))

        # Then perform reduction on partial results
        reduce_entry, reduce_exit = nstate.add_map(
            'reduce', {'i': f'0:{partial_width}'},
            schedule=dtypes.ScheduleType.FPGA_Device,
            unroll=True)

        reduce_tasklet = nstate.add_tasklet(
            'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
        nstate.add_memlet_path(buffer_write,
                               reduce_entry,
                               reduce_tasklet,
                               dst_conn='data_in',
                               memlet=dace.Memlet(f'{buffer_name}[i]'))

        reduce_name = 'reduce_result'
        nsdfg.add_array(reduce_name, (1, ),
                        output_data.dtype,
                        transient=True,
                        storage=dtypes.StorageType.FPGA_Local)
        reduce_read = nstate.add_access(reduce_name)
        reduce_access = nstate.add_access(reduce_name)

        if not all_axis:
            nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

        nstate.add_memlet_path(reduce_read,
                               reduce_entry,
                               reduce_tasklet,
                               dst_conn='reduce_in',
                               memlet=dace.Memlet(f'{reduce_name}[0]'))
        nstate.add_memlet_path(reduce_tasklet,
                               reduce_exit,
                               reduce_access,
                               src_conn='reduce_out',
                               memlet=dace.Memlet(f'{reduce_name}[0]'))

        if not all_axis:
            # Write out the result
            nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
        else:
            nstate.add_memlet_path(reduce_access, w, memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')
        nsdfg.validate()

        return nsdfg
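
Reusing the sdfg from the previous sketch, partial_width can be forwarded when a Reduce node is expanded by hand; the implementation key below is an assumption, so check the node's registered implementations:

from dace.libraries.standard import Reduce

for node, parent in sdfg.all_nodes_recursive():
    if isinstance(node, Reduce):
        node.implementation = 'FPGAPartialReduction'  # assumed key
        node.expand(sdfg, parent, partial_width=32)   # kwargs reach expansion()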
Example #8
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        insubset = dcpy(inedge.data.subset)
        isqdim = insubset.squeeze()
        outsubset = dcpy(outedge.data.subset)
        osqdim = outsubset.squeeze()
        input_dims = len(insubset)
        output_dims = len(outsubset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        if len(osqdim) == 0:  # Fix for scalars
            osqdim = [0]

        # Standardize and squeeze axes
        axes = node.axes if node.axes else [
            i for i in range(len(inedge.data.subset))
        ]
        axes = [axis for axis in axes if axis in isqdim]

        assert node.identity is not None

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        insubset.size(),
                        input_data.dtype,
                        strides=[
                            s for i, s in enumerate(input_data.strides)
                            if i in isqdim
                        ],
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outsubset.size(),
                        output_data.dtype,
                        strides=[
                            s for i, s in enumerate(output_data.strides)
                            if i in osqdim
                        ],
                        storage=output_data.storage)

        nsdfg.add_transient('acc', [1], nsdfg.arrays['_in'].dtype,
                            dtypes.StorageType.Register)

        nstate = nsdfg.add_state()

        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in isqdim:
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outsubset.size())
            })
        outm = dace.Memlet.simple(
            '_out', ','.join(['_o%d' % i for i in range(output_dims)]))
        # No WCR on the output memlet; conflicts are resolved on 'acc' below.
        inmm = dace.Memlet.simple('_in', ','.join(input_subset))

        idt = nstate.add_tasklet('reset', {}, {'o'}, f'o = {node.identity}')
        nstate.add_edge(ome, None, idt, None, dace.Memlet())

        accread = nstate.add_access('acc')
        accwrite = nstate.add_access('acc')
        nstate.add_edge(idt, 'o', accread, None, dace.Memlet('acc'))

        # Add inner map, which corresponds to the range to reduce, containing
        # an identity tasklet
        ime, imx = nstate.add_map(
            'reduce_values', {
                '_i%d' % i: '0:%s' % symstr(insubset.size()[isqdim.index(axis)])
                for i, axis in enumerate(sorted(axes))
            },
            schedule=dtypes.ScheduleType.Sequential)

        # Add identity tasklet for reduction
        t = nstate.add_tasklet('identity', {'a', 'b'}, {'o'}, 'o = b')

        # Connect everything
        r = nstate.add_read('_in')
        w = nstate.add_write('_out')
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='b', memlet=inmm)
        nstate.add_memlet_path(accread,
                               ime,
                               t,
                               dst_conn='a',
                               memlet=dace.Memlet('acc[0]'))
        nstate.add_memlet_path(t,
                               imx,
                               accwrite,
                               src_conn='o',
                               memlet=dace.Memlet('acc[0]', wcr=node.wcr))
        nstate.add_memlet_path(accwrite, omx, w, memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        from dace.transformation import dataflow
        nsdfg.apply_transformations_repeated(dataflow.MapCollapse)

        return nsdfg
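
The loop nest this expansion encodes, as a plain-numpy sketch for a sum over axis 1 (shapes illustrative):

import numpy as np

inp = np.random.rand(4, 5)
out = np.empty(4)
identity = 0.0
for o in range(4):              # 'reduce_output' map
    acc = identity              # 'reset' tasklet writing the 'acc' register
    for i in range(5):          # sequential 'reduce_values' map
        acc = acc + inp[o, i]   # identity tasklet + WCR on 'acc'
    out[o] = acc                # final copy from 'acc' to '_out'
assert np.allclose(out, inp.sum(axis=1))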
Example #9
    def _expand_reduce(self, sdfg, state, node):
        # expands a reduce into two nested maps
        # taken from legacy expand_reduce.py

        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)

        if node.identity is not None:
            raise ValueError("Node identity has to be None at this point.")
        nstate = nsdfg.add_state()

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append('_i%d' % ictr)
                    ictr += 1
                else:
                    input_subset.append('_o%d' % octr)
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    '_o%d' % i: '0:%s' % symstr(sz)
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm = Memlet.simple('_out',
                                 ','.join(
                                     ['_o%d' % i for i in range(output_dims)]),
                                 wcr_str=node.wcr)
            inmm = Memlet.simple('_in', ','.join(input_subset))
        else:
            ome, omx = None, None
            outm = Memlet.simple('_out', '0', wcr_str=node.wcr)
            inmm = Memlet.simple(
                '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

        # Add inner map, which corresponds to the range to reduce, containing
        # an identity tasklet
        ime, imx = nstate.add_map(
            'reduce_values', {
                '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
                for i, axis in enumerate(sorted(axes))
            })

        # Add identity tasklet for reduction
        t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

        # Connect everything
        r = nstate.add_read('_in')
        w = nstate.add_write('_out')

        if ome:
            nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
        else:
            nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        nsdfg = state.add_nested_sdfg(nsdfg,
                                      sdfg,
                                      node.in_connectors,
                                      node.out_connectors,
                                      schedule=node.schedule,
                                      name=node.name)

        utils.change_edge_dest(state, node, nsdfg)
        utils.change_edge_src(state, node, nsdfg)
        state.remove_node(node)

        return nsdfg
Example #10
def generate_reference(name, chain):
    """Generates a simple, unoptimized SDFG to run on the CPU, for verification
       purposes."""

    sdfg = SDFG(name)

    for k, v in chain.constants.items():
        sdfg.add_constant(k, v["value"], dace.data.Scalar(v["data_type"]))

    (dimensions_to_skip, shape, vector_length, parameters, iterators,
     memcopy_indices, memcopy_accesses) = _generate_init(chain)

    prev_state = sdfg.add_state("init")

    # Disable vectorization for the reference code
    vector_length = 1

    shape = tuple(map(int, shape))

    input_shapes = {}  # Maps inputs to their shape tuple

    for node in chain.graph.nodes():
        if isinstance(node, Input) or isinstance(node, Output):
            if isinstance(node, Input):
                for output in node.outputs.values():
                    if output.get("input_dims") is not None:
                        pars = tuple(output["input_dims"])
                    else:
                        pars = tuple(parameters)
                    arr_shape = tuple(s for s, p in zip(shape, parameters)
                                      if p in pars)
                    input_shapes[node.name] = arr_shape
                    break
                else:
                    raise ValueError("No outputs found for input node.")
            else:
                arr_shape = shape
            if len(arr_shape) > 0:
                try:
                    sdfg.add_array(node.name, arr_shape, node.data_type)
                except NameError:
                    # Array already exists (shared by an input and an output)
                    sdfg.data(node.name).access = dace.dtypes.AccessType.ReadWrite
            else:
                sdfg.add_symbol(node.name, node.data_type)

    for link in chain.graph.edges(data=True):
        field_name = link[0].name
        if field_name not in sdfg.arrays and field_name not in sdfg.symbols:
            sdfg.add_array(field_name, shape, link[0].data_type, transient=True)
            input_shapes[field_name] = tuple(shape)

    input_iterators = {
        k: tuple("0:{}".format(s) for s in v)
        for k, v in input_shapes.items()
    }

    # Enforce dependencies via topological sort
    for node in nx.topological_sort(chain.graph):

        if not isinstance(node, Kernel):
            continue

        state = sdfg.add_state(node.name)
        sdfg.add_edge(prev_state, state, dace.InterstateEdge())

        (stencil_node, input_to_connector,
         output_to_connector) = _generate_stencil(node, chain, shape,
                                                  dimensions_to_skip)
        stencil_node.implementation = "CPU"

        for field, connector in input_to_connector.items():

            if len(input_iterators[field]) == 0:
                continue  # Scalar variable

            # Outer memory read
            read_node = state.add_read(field)
            state.add_memlet_path(read_node,
                                  stencil_node,
                                  dst_conn=connector,
                                  memlet=Memlet.simple(
                                      field,
                                      ", ".join(input_iterators[field])))

        for _, connector in output_to_connector.items():

            # Outer write
            write_node = state.add_write(node.name)
            state.add_memlet_path(stencil_node,
                                  write_node,
                                  src_conn=connector,
                                  memlet=Memlet.simple(
                                      node.name, ", ".join("0:{}".format(s)
                                                           for s in shape)))

        prev_state = state

    return sdfg
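
The state-chaining pattern used above, isolated into a minimal runnable sketch (graph and names illustrative): one state per kernel, linked in topological order so producers always execute before consumers.

import dace
import networkx as nx

g = nx.DiGraph([("produce", "consume")])
sdfg = dace.SDFG("chain_demo")
prev_state = sdfg.add_state("init")
for kernel in nx.topological_sort(g):
    state = sdfg.add_state(kernel)
    sdfg.add_edge(prev_state, state, dace.InterstateEdge())
    prev_state = state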