import copy
import dace
import dace.graph.nodes
import numpy as np

# Python version of the SDFG below
# @dace.program
# def reduce_with_strides(A: dace.float64[50, 50], B: dace.float64[25]):
#     B[:] = dace.reduce(lambda a,b: a+b, A[::2, ::2], axis=0,
#                        identity=0)

# Module-level SDFG with a single state: a sum reduction over axis 0 of a
# strided window of A, written into B.
reduce_with_strides = dace.SDFG('reduce_with_strides')
reduce_with_strides.add_array('A', [50, 50], dace.float64)
reduce_with_strides.add_array('B', [25], dace.float64)

state = reduce_with_strides.add_state()
node_a = state.add_read('A')
node_b = state.add_write('B')
# Reduction node: addition as the combiner, axis 0, identity value 0.
red = state.add_reduce('lambda a,b: a+b', [0], 0)
# Input memlet selects every second row and column of A (A[::2, ::2] in the
# Python version above); the output memlet covers all 25 elements of B.
state.add_nedge(node_a, red, dace.Memlet.simple('A', '0:50:2, 0:50:2'))
state.add_nedge(red, node_b, dace.Memlet.simple('B', '0:25'))


def test_strided_reduce():
    """ Run the strided reduction SDFG and compare the result with NumPy. """
    A = np.random.rand(50, 50)
    B = np.random.rand(25)

    # Work on a deep copy so the module-level SDFG object is left untouched.
    sdfg = copy.deepcopy(reduce_with_strides)
    sdfg(A=A, B=B)

    assert np.allclose(B, np.sum(A[::2, ::2], axis=0))
def _make_sdfg(name, storage=dace.dtypes.StorageType.CPU_Heap, isview=False):
    """
    Build a looped SDFG that reduces a window of ``A`` into ``B[i]``.

    The state machine starts with ``i = 0`` and repeats three body states
    while ``i < N``: the window ``A[2:N-2, 2:N-2, i:N]`` is moved into
    ``tmp1``, ``tmp1`` is sum-reduced into the one-element ``tmp2``, and
    ``tmp2`` is stored in ``B[i]``.

    :param name: name of the created SDFG.
    :param storage: storage type of ``tmp1`` and ``tmp2``. GPU and FPGA
                    global storage also select a specific reduction
                    implementation.
    :param isview: if True, ``tmp1`` is registered as a view that uses the
                   strides of ``A`` and is fed from ``A`` inside the second
                   body state; otherwise it is a transient filled in the
                   first body state.
    :return: the constructed SDFG.
    """
    N = dace.symbol('N', dtype=dace.int32, integer=True, positive=True)
    i = dace.symbol('i', dtype=dace.int32, integer=True)

    # Data containers
    sdfg = dace.SDFG(name)
    _, A = sdfg.add_array('A', [N, N, N], dtype=dace.float64)
    _, B = sdfg.add_array('B', [N], dtype=dace.float64)
    window_shape = [N - 4, N - 4, N - i]
    if isview:
        _, tmp1 = sdfg.add_view('tmp1',
                                window_shape,
                                dtype=dace.float64,
                                storage=storage,
                                strides=A.strides)
    else:
        _, tmp1 = sdfg.add_transient('tmp1',
                                     window_shape,
                                     dtype=dace.float64,
                                     storage=storage)
    _, tmp2 = sdfg.add_transient('tmp2', [1],
                                 dtype=dace.float64,
                                 storage=storage)

    # Loop skeleton: begin -> guard -> body1 -> body2 -> body3 -> guard ... -> end
    begin_state = sdfg.add_state("begin", is_start_state=True)
    guard_state = sdfg.add_state("guard")
    body1_state = sdfg.add_state("body1")
    body2_state = sdfg.add_state("body2")
    body3_state = sdfg.add_state("body3")
    end_state = sdfg.add_state("end")

    loop_init = dace.InterstateEdge(assignments=dict(i='0'))
    sdfg.add_edge(begin_state, guard_state, loop_init)
    sdfg.add_edge(guard_state, body1_state,
                  dace.InterstateEdge(condition=f'i<{N}'))
    sdfg.add_edge(guard_state, end_state,
                  dace.InterstateEdge(condition=f'i>={N}'))
    sdfg.add_edge(body1_state, body2_state, dace.InterstateEdge())
    sdfg.add_edge(body2_state, body3_state, dace.InterstateEdge())
    loop_step = dace.InterstateEdge(assignments=dict(i='i+1'))
    sdfg.add_edge(body3_state, guard_state, loop_step)

    # Window of A consumed in every iteration
    window = f'A[2:{N}-2, 2:{N}-2, i:{N}]'

    if isview:
        # The view is connected to A inside the reduction state itself
        read_a = body2_state.add_read('A')
        read_tmp1 = body2_state.add_access('tmp1')
        body2_state.add_nedge(read_a, read_tmp1, dace.Memlet(window))
    else:
        # Explicit copy in body1, plain read in body2
        read_a = body1_state.add_read('A')
        write_tmp1 = body1_state.add_write('tmp1')
        body1_state.add_nedge(read_a, write_tmp1, dace.Memlet(window))
        read_tmp1 = body2_state.add_read('tmp1')

    # Reduction of tmp1 into tmp2
    rednode = standard.Reduce(wcr='lambda a, b : a + b', identity=0)
    implementation_for = {
        dace.dtypes.StorageType.GPU_Global: 'CUDA (device)',
        dace.dtypes.StorageType.FPGA_Global: 'FPGAPartialReduction',
    }
    chosen = implementation_for.get(storage)
    if chosen is not None:
        rednode.implementation = chosen
    body2_state.add_node(rednode)
    write_tmp2 = body2_state.add_write('tmp2')
    body2_state.add_nedge(read_tmp1, rednode,
                          dace.Memlet.from_array('tmp1', tmp1))
    body2_state.add_nedge(rednode, write_tmp2, dace.Memlet('tmp2[0]'))

    # Store the reduced value
    read_tmp2 = body3_state.add_read('tmp2')
    write_b = body3_state.add_write('B')
    body3_state.add_nedge(read_tmp2, write_b, dace.Memlet('B[i]'))

    return sdfg
def _enum_choice_names(enum_cls):
    """ List the short member names of an enum, leaving out 'Undefined'. """
    short_names = (str(member).split('.')[-1] for member in enum_cls)
    return [short for short in short_names if short != 'Undefined']


def get_property_metdata():
    """
    Generate a dictionary of class properties and their metadata.

    Every class registered in DaCe's serialization registry that carries a
    ``__properties__`` attribute contributes one entry, mapping each property
    name to its JSON metadata. Two bookkeeping entries are added as well:
    ``__reverse_type_lookup__`` (metatype name to one representative metadata
    record, plus enum classes of ``dace.dtypes`` not seen otherwise) and
    ``__libs__`` (library node type name to its metadata key).

    :return: a dictionary with the single key ``'metaDict'`` holding the
             assembled metadata dictionary.
    """
    # Lazy import to cut down on module load time.
    from dace.sdfg.nodes import full_class_path

    # Transformations only show up in `dace.serialize._DACE_SERIALIZE_TYPES`
    # once their properties decorator has run. Asking for the pattern matches
    # of a dummy SDFG is the simplest way to trigger that. This is meant to
    # run once per SDFG editor, so it adds no continuous overhead.
    from dace.transformation import optimizer
    _ = optimizer.Optimizer(dace.SDFG('dummy')).get_pattern_matches()

    reverse_lookup = {}
    libs = {}
    meta_dict = {
        '__reverse_type_lookup__': reverse_lookup,
        '__libs__': libs,
    }

    for typename, t in dace.serialize._DACE_SERIALIZE_TYPES.items():
        if not hasattr(t, '__properties__'):
            continue

        # Concrete library nodes are keyed by their full class path.
        is_concrete_libnode = (issubclass(t, dace.sdfg.nodes.LibraryNode)
                               and not t == dace.sdfg.nodes.LibraryNode)
        meta_key = full_class_path(t) if is_concrete_libnode else typename

        type_meta = {}
        meta_dict[meta_key] = type_meta

        libnode_implementations = None
        if hasattr(t, 'implementations'):
            libnode_implementations = list(t.implementations.keys())

        for propname, prop in t.__properties__.items():
            entry = prop.meta_to_json(prop)
            type_meta[propname] = entry

            if hasattr(prop, 'key_type') and hasattr(prop, 'value_type'):
                # Dictionary properties: record key and value types.
                entry['key_type'] = prop.key_type.__name__
                entry['value_type'] = prop.value_type.__name__
            elif hasattr(prop, 'element_type'):
                entry['element_type'] = prop.element_type.__name__

            if prop.choices is not None:
                # Enum-valued properties list their members as choices.
                if (inspect.isclass(prop.choices)
                        and issubclass(prop.choices, aenum.Enum)):
                    entry['choices'] = _enum_choice_names(prop.choices)
            elif (propname == 'implementation'
                  and libnode_implementations is not None):
                # Implementation properties offer all library
                # implementations as choices.
                entry['choices'] = libnode_implementations

            # Reverse lookup per meta type: gives access to meta information
            # about things other than properties (types, CodeBlocks, etc.).
            # Only the first record seen for a meta type is kept.
            meta_type = entry['metatype']
            if meta_type:
                reverse_lookup.setdefault(meta_type, entry)

        # Make all library nodes listable through '__libs__'.
        if is_concrete_libnode:
            libs[typename] = meta_key

    # Save a lookup for enum values not present yet.
    for enum_name, enum_cls in inspect.getmembers(dace.dtypes,
                                                  inspect.isclass):
        if not issubclass(enum_cls, aenum.Enum):
            continue
        if enum_name in reverse_lookup:
            continue
        reverse_lookup[enum_name] = {
            'category': 'General',
            'metatype': enum_name,
            'choices': _enum_choice_names(enum_cls),
        }

    return {
        'metaDict': meta_dict,
    }
from __future__ import print_function

import argparse
import dace
import numpy as np
from typing import List
import time

# Define symbolic sizes for arbitrary inputs
F = dace.symbol('F')
G = dace.symbol('G')
H = dace.symbol('H')

# The SDFG that the rest of the script populates
sdfg = dace.SDFG('MatrixMultipy')

# Define data type to use
# Complex number, represented by two 64-bit floats (real and imaginary components)
dtype = dace.complex128
np_dtype = np.complex128

# Example inputs (kept for reference, currently disabled):
# A = np.array([[1+2j,2+4j,3+6j], [4+2j,5+4j,6+6j]])
# B = np.array([[1+2j,1+4j,1+6j], [0+2j,1+4j,0+6j]])

# Start timestamp used to measure the multiplication time on the CPU
tic = time.time()

#####################################################################
# Data-centric functions
#
# Map-Reduce version of matrix multiplication
def make_backward_function(model: ONNXModel,
                           apply_strict=False
                           ) -> Type[torch.autograd.Function]:
    """ Convert an ONNXModel to a PyTorch differentiable function. This method
        should not be used on its own. Instead use the ``backward=True``
        parameter of :class:`daceml.pytorch.DaceModule`.

        The forward SDFG of the model must consist of exactly one state. Arrays
        that the backward pass needs from the forward pass are made
        non-transient so they can be handed over; scalars are copied through
        one-element arrays because scalars cannot be returned from SDFGs.

        :param model: the model to convert.
        :param apply_strict: whether to apply strict transformations before
                             creating the backward pass.
        :return: the PyTorch compatible :class:`torch.autograd.Function`.
        :raises AutoDiffException: if the forward SDFG does not have exactly
                                   one state, or if an array required by the
                                   backward pass is missing from it.
    """

    if len(model.sdfg.nodes()) != 1:
        raise AutoDiffException(
            "Expected to find exactly one SDFGState, found {}".format(
                len(model.sdfg.nodes())))

    forward_sdfg = model.sdfg
    forward_state = model.sdfg.nodes()[0]

    backward_sdfg = dace.SDFG(forward_sdfg.name + "_backward")
    backward_state = backward_sdfg.add_state()

    gen = BackwardPassGenerator(
        sdfg=forward_sdfg,
        state=forward_state,
        given_gradients=[clean_onnx_name(name) for name in model.outputs],
        required_gradients=[clean_onnx_name(name) for name in model.inputs],
        backward_sdfg=backward_sdfg,
        backward_state=backward_state,
        apply_strict=apply_strict)

    backward_result, backward_grad_arrays, backward_input_arrays = gen.backward(
    )

    # Maps the name of each forwarded scalar to the name of the one-element
    # array (in the forward SDFG) that carries its value.
    replaced_scalars = {}
    for name, desc in backward_input_arrays.items():
        if name not in forward_sdfg.arrays:
            raise AutoDiffException(
                "Expected to find array with name '{}' in SDFG".format(name))

        forward_desc = forward_sdfg.arrays[name]
        # we will save this output and pass it to the backward pass

        # Views should not be forwarded. Instead the backward pass generator
        # should forward the source of the view, and rebuild the sequence of
        # required views in the backward pass.
        assert type(forward_desc) is not dt.View
        if isinstance(forward_desc, dt.Scalar):
            # we can't return scalars from SDFGs, so we add a copy to an
            # array of size 1
            fwd_arr_name, _ = forward_sdfg.add_array(
                name + "_array", [1],
                forward_desc.dtype,
                transient=False,
                storage=forward_desc.storage,
                find_new_name=True)
            bwd_arr_name, _ = backward_sdfg.add_array(
                name + "_array", [1],
                forward_desc.dtype,
                transient=False,
                storage=forward_desc.storage,
                find_new_name=True)
            backward_sdfg.arrays[name].transient = True

            # scalar -> array after the forward state,
            # array -> scalar before the backward state
            fwd_copy_state = forward_sdfg.add_state_after(forward_state,
                                                          label="copy_out_" +
                                                          fwd_arr_name)
            bwd_copy_state = backward_sdfg.add_state_before(backward_state,
                                                            label="copy_in_" +
                                                            bwd_arr_name)

            fwd_copy_state.add_edge(fwd_copy_state.add_read(name), None,
                                    fwd_copy_state.add_write(fwd_arr_name),
                                    None, dace.Memlet(name + "[0]"))

            bwd_copy_state.add_edge(bwd_copy_state.add_read(bwd_arr_name),
                                    None, bwd_copy_state.add_write(name), None,
                                    dace.Memlet(name + "[0]"))
            replaced_scalars[name] = fwd_arr_name
        else:
            forward_sdfg.arrays[name].transient = False

    backward_sdfg.validate()

    class DaceFunction(torch.autograd.Function):
        _backward_sdfg = backward_sdfg
        _forward_model = model
        _backward_result = backward_result

        @staticmethod
        def forward(ctx, *inputs):
            # setup the intermediate buffers

            if any(not inp.is_contiguous() for inp in inputs):
                log.warning("forced to copy input since it was not contiguous")

            # is_contiguous must be *called*: the bound method object itself
            # is always truthy, which would skip the copy for every input.
            copied_inputs = tuple(
                inp if inp.is_contiguous() else inp.contiguous()
                for inp in inputs)

            # prepare the arguments
            inputs, params, symbols, outputs = model._call_args(
                args=copied_inputs, kwargs={})

            # create the empty tensors we need for the intermediate values
            for inp, val in backward_input_arrays.items():
                if isinstance(val, dt.Scalar):
                    # the value we need is actually in an array
                    inp = replaced_scalars[inp]

                if inp not in inputs and inp not in outputs and inp not in params:
                    inputs[inp] = create_output_array(symbols,
                                                      forward_sdfg.arrays[inp],
                                                      use_torch=True)

            DaceFunction._forward_model.sdfg(**inputs, **symbols, **params,
                                             **outputs)

            def _get_arr(name, desc):
                # Look the value up among inputs, outputs and parameters,
                # redirecting scalars to their one-element array.
                if isinstance(desc, dt.Scalar):
                    name = replaced_scalars[name]

                if name in inputs:
                    value = inputs[name]
                elif name in outputs:
                    value = outputs[name]
                elif name in params:
                    value = params[name]
                else:
                    raise AutoDiffException(
                        f"Could not get value of array {name}")

                return value

            # save the arrays we need for the backward pass
            backward_inputs = {
                name: _get_arr(name, desc)
                for name, desc in backward_input_arrays.items()
            }
            # re-key the forwarded scalars by their array name
            for name in replaced_scalars:
                backward_inputs[replaced_scalars[name]] = backward_inputs[name]
                del backward_inputs[name]
            ctx.dace_backward_inputs = backward_inputs
            ctx.dace_symbols = symbols

            if len(outputs) == 1:
                return next(iter(outputs.values()))

            return tuple(outputs.values())

        @staticmethod
        def backward(ctx, *grads):
            backward_inputs = ctx.dace_backward_inputs

            if len(grads) != len(model.outputs):
                raise ValueError("Expected to receive {} grads, got {}".format(
                    len(model.outputs), len(grads)))

            given_grads = dict(
                zip((DaceFunction._backward_result.given_grad_names[
                    clean_onnx_name(outp)] for outp in model.outputs), grads))
            for name, value in given_grads.items():
                if not isinstance(value, torch.Tensor):
                    raise ValueError(
                        "Unsupported input with type {};"
                        " currently only tensor inputs are supported".format(
                            type(value)))
                if not value.is_contiguous():
                    log.warning(
                        "forced to copy input since it was not contiguous")
                    given_grads[name] = value.contiguous()

            # these are the grads we will calculate
            input_grad_names = [
                DaceFunction._backward_result.required_grad_names[
                    clean_onnx_name(inp)] for inp in model.inputs
            ]

            # init the grads we will calculate with zeros
            grad_values = OrderedDict()
            for name in input_grad_names:
                grad_values[name] = create_output_array(
                    ctx.dace_symbols,
                    backward_grad_arrays[name],
                    use_torch=True,
                    zeros=True)

            DaceFunction._backward_sdfg(**grad_values, **backward_inputs,
                                        **given_grads)

            return tuple(grad_values.values())

    return DaceFunction
def mapfission_sdfg():
    """
    Build and validate the 'mapfission' test SDFG.

    One outer map over ``i`` in ``0:2`` contains five tasklets: 'one' reads
    the pair ``A[2*i:2*i+2]``, 'two' and 'three' each sit in their own inner
    map over ``j`` in ``0:2``, 'scalar' produces the constant 5.0, and 'four'
    combines the four results into ``B[i]``.

    :return: the validated SDFG.
    """
    graph = dace.SDFG('mapfission')
    graph.add_array('A', [4], dace.float64)
    graph.add_array('B', [2], dace.float64)
    graph.add_scalar('scal', dace.float64, transient=True)
    graph.add_scalar('s1', dace.float64, transient=True)
    graph.add_transient('s2', [2], dace.float64)
    graph.add_transient('s3out', [1], dace.float64)
    st = graph.add_state()

    memlet = dace.Memlet.simple

    # Nodes
    a_in = st.add_read('A')
    outer_entry, outer_exit = st.add_map('outer', {'i': '0:2'})
    one = st.add_tasklet('one', {'a'}, {'b'}, 'b = a[0] + a[1]')
    inner2_entry, inner2_exit = st.add_map('inner', {'j': '0:2'})
    two = st.add_tasklet('two', {'a'}, {'b'}, 'b = a * 2')
    s2_acc = st.add_access('s2')
    s3_acc = st.add_access('s3out')
    inner3_entry, inner3_exit = st.add_map('inner', {'j': '0:2'})
    three = st.add_tasklet('three', {'a'}, {'b'}, 'b = a[0] * 3')
    const = st.add_tasklet('scalar', {}, {'out'}, 'out = 5.0')
    four = st.add_tasklet('four', {'ione', 'itwo', 'ithree', 'sc'}, {'out'},
                          'out = ione + itwo[0] * itwo[1] + ithree + sc')
    b_out = st.add_write('B')

    # Edges
    # The constant tasklet has no inputs; an empty memlet ties it to the map.
    st.add_nedge(outer_entry, const, dace.Memlet())

    st.add_memlet_path(a_in,
                       outer_entry,
                       one,
                       memlet=memlet('A', '2*i:2*i+2'),
                       dst_conn='a')

    st.add_memlet_path(a_in,
                       outer_entry,
                       inner2_entry,
                       two,
                       memlet=memlet('A', '2*i+j'),
                       dst_conn='a')
    st.add_memlet_path(two,
                       inner2_exit,
                       s2_acc,
                       memlet=memlet('s2', 'j'),
                       src_conn='b')

    st.add_memlet_path(a_in,
                       outer_entry,
                       inner3_entry,
                       three,
                       memlet=memlet('A', '2*i:2*i+2'),
                       dst_conn='a')
    st.add_memlet_path(three,
                       inner3_exit,
                       s3_acc,
                       memlet=memlet('s3out', '0'),
                       src_conn='b')

    # Inputs of the combining tasklet
    st.add_edge(one, 'b', four, 'ione', memlet('s1', '0'))
    st.add_edge(s2_acc, None, four, 'itwo', memlet('s2', '0:2'))
    st.add_edge(s3_acc, None, four, 'ithree', memlet('s3out', '0'))
    st.add_edge(const, 'out', four, 'sc', memlet('scal', '0'))

    st.add_memlet_path(four,
                       outer_exit,
                       b_out,
                       memlet=memlet('B', 'i'),
                       src_conn='out')

    graph.validate()
    return graph
import numpy as np
import dace
from dace.memlet import Memlet

# Create SDFG
sdfg = dace.SDFG('nested_reduction')
state = sdfg.add_state('a')

# Nodes
A = state.add_array('A', (40, ), dace.float32)
B = state.add_array('B', (20, ), dace.float32)
me, mx = state.add_map('mymap', dict(i='0:20'))
# Sum reduction with identity 0; no axes are given (None)
red = state.add_reduce('lambda a,b: a+b', None, 0)

# Edges
# Each map iteration i feeds the pair A[2*i:2*i+2] to the reduction and
# stores the reduced value in B[i].
state.add_edge(A, None, me, None, Memlet.simple(A, '0:40'))
state.add_edge(me, None, red, None, Memlet.simple(A, '(2*i):(2*i+2)'))
state.add_edge(red, None, mx, None, Memlet.simple(B, 'i'))
state.add_edge(mx, None, B, None, Memlet.simple(B, '0:20'))
# The edges above were added without connector names; let the SDFG create
# the scope connectors on the map entry/exit.
sdfg.fill_scope_connectors()

if __name__ == '__main__':
    print('Nested reduction test')

    Adata = np.random.rand(40).astype(np.float32)
    Bdata = np.random.rand(20).astype(np.float32)
    sdfg(A=Adata, B=Bdata)

    # Reference result: sum of each even-indexed element with its successor
    B_regression = np.zeros(20, dtype=np.float32)
    B_regression[:] = Adata[::2]
    B_regression[:] += Adata[1::2]
def _make_sdfg_getrs(node, parent_state, parent_sdfg, implementation):
    """
    Build an SDFG that chains a ``Getrf`` node into a ``Getrs`` node.

    The inputs ``_ain`` and ``_bin`` are first moved into the transient work
    buffers ``_ainout`` and ``_binout``. ``Getrf`` runs on ``_ainout`` and
    produces ``_pivots``; ``Getrs`` consumes both together with ``_binout``
    and the resulting ``_binout`` is written to ``_bout``. With the
    'cuSolverDn' implementation the moves in and out go through cuBLAS
    ``Transpose`` nodes and ``_binout`` has shape ``[rhs, n]``; otherwise
    they are plain copies and the shape is ``[n, rhs]``.

    :param node: library node being expanded; provides ``validate`` and
                 ``label``.
    :param parent_state: state containing ``node``.
    :param parent_sdfg: SDFG containing ``parent_state``.
    :param implementation: implementation name set on both library nodes.
    :return: the constructed SDFG.
    """
    (ain_shape, ain_dtype, ain_strides, bin_shape, bin_dtype, bin_strides,
     out_shape, out_dtype, out_strides, n,
     rhs) = node.validate(parent_sdfg, parent_state)

    use_cusolver = implementation == 'cuSolverDn'

    def whole(named_desc):
        # named_desc is the (name, descriptor) pair returned by add_array
        return Memlet.from_array(*named_desc)

    sdfg = dace.SDFG("{l}_sdfg".format(l=node.label))

    # Containers
    a_in_desc = sdfg.add_array('_ain',
                               ain_shape,
                               dtype=ain_dtype,
                               strides=ain_strides)
    a_work_desc = sdfg.add_array('_ainout', [n, n],
                                 dtype=ain_dtype,
                                 transient=True)
    b_in_desc = sdfg.add_array('_bin',
                               bin_shape,
                               dtype=bin_dtype,
                               strides=bin_strides)
    b_work_shape = [rhs, n] if use_cusolver else [n, rhs]
    b_work_desc = sdfg.add_array('_binout',
                                 b_work_shape,
                                 dtype=out_dtype,
                                 transient=True)
    b_out_desc = sdfg.add_array('_bout',
                                out_shape,
                                dtype=out_dtype,
                                strides=out_strides)
    pivots_desc = sdfg.add_array('_pivots', [n],
                                 dtype=dace.int32,
                                 transient=True)
    info_desc = sdfg.add_array('_info', [1],
                               dtype=dace.int32,
                               transient=True)

    state = sdfg.add_state("{l}_state".format(l=node.label))

    # Library nodes
    getrf_node = Getrf('getrf')
    getrf_node.implementation = implementation
    getrs_node = Getrs('getrs')
    getrs_node.implementation = implementation

    # Access nodes
    a_in = state.add_read('_ain')
    a_work_pre = state.add_read('_ainout')
    a_work_post = state.add_access('_ainout')
    b_in = state.add_read('_bin')
    b_work_pre = state.add_read('_binout')
    b_work_post = state.add_read('_binout')
    b_out = state.add_access('_bout')

    if use_cusolver:
        transpose_ain = Transpose('AT', dtype=ain_dtype)
        transpose_ain.implementation = 'cuBLAS'
        state.add_edge(a_in, None, transpose_ain, '_inp', whole(a_in_desc))
        state.add_edge(transpose_ain, '_out', a_work_pre, None,
                       whole(a_work_desc))

        transpose_bin = Transpose('bT', dtype=bin_dtype)
        transpose_bin.implementation = 'cuBLAS'
        state.add_edge(b_in, None, transpose_bin, '_inp', whole(b_in_desc))
        state.add_edge(transpose_bin, '_out', b_work_pre, None,
                       whole(b_work_desc))

        transpose_out = Transpose('XT', dtype=bin_dtype)
        transpose_out.implementation = 'cuBLAS'
        state.add_edge(b_work_post, None, transpose_out, '_inp',
                       whole(b_work_desc))
        state.add_edge(transpose_out, '_out', b_out, None, whole(b_out_desc))
    else:
        state.add_nedge(a_in, a_work_pre, whole(a_in_desc))
        state.add_nedge(b_in, b_work_pre, whole(b_in_desc))
        state.add_nedge(b_work_post, b_out, whole(b_out_desc))

    pivots = state.add_access('_pivots')
    info_getrf = state.add_write('_info')
    info_getrs = state.add_write('_info')

    # Getrf: _ainout -> (_info, _pivots, _ainout)
    state.add_memlet_path(a_work_pre,
                          getrf_node,
                          dst_conn="_xin",
                          memlet=whole(a_work_desc))
    state.add_memlet_path(getrf_node,
                          info_getrf,
                          src_conn="_res",
                          memlet=whole(info_desc))
    state.add_memlet_path(getrf_node,
                          pivots,
                          src_conn="_ipiv",
                          memlet=whole(pivots_desc))
    state.add_memlet_path(getrf_node,
                          a_work_post,
                          src_conn="_xout",
                          memlet=whole(a_work_desc))

    # Getrs: (_ainout, _binout, _pivots) -> (_info, _binout)
    state.add_memlet_path(a_work_post,
                          getrs_node,
                          dst_conn="_a",
                          memlet=whole(a_work_desc))
    state.add_memlet_path(b_work_pre,
                          getrs_node,
                          dst_conn="_rhs_in",
                          memlet=whole(b_work_desc))
    state.add_memlet_path(pivots,
                          getrs_node,
                          dst_conn="_ipiv",
                          memlet=whole(pivots_desc))
    state.add_memlet_path(getrs_node,
                          info_getrs,
                          src_conn="_res",
                          memlet=whole(info_desc))
    state.add_memlet_path(getrs_node,
                          b_work_post,
                          src_conn="_rhs_out",
                          memlet=whole(b_work_desc))

    return sdfg
import dace
import numpy as np

# SDFG with a single state that performs strided/tiled range copies
sr = dace.SDFG('stiledcopy')
s0 = sr.add_state('s0')

A = s0.add_array('A', [2, 16, 4], dace.float32)
B = s0.add_array('B', [4], dace.float32)
C = s0.add_array('C', [2, 16, 4], dace.float32)

D = s0.add_array('D', [128, 128], dace.float32)
E = s0.add_array('E', [8, 8], dace.float32)
F = s0.add_array('F', [128, 128], dace.float32)

# Reading A at [1, 0:8:8:2, 3]
# NOTE(review): the memlets below use the range 0:10:8:2, not 0:8:8:2 as the
# comment above states - confirm which one is intended.
s0.add_nedge(A, B, dace.Memlet.simple(A, '1, 0:10:8:2, 3'))
s0.add_nedge(B, C, dace.Memlet.simple(C, '1, 0:10:8:2, 3'))

# Emulate a blocked tiled matrix multiplication pattern
s0.add_nedge(D, E, dace.Memlet.simple(D, '8:76:64:4,4:72:64:4'))
s0.add_nedge(E, F, dace.Memlet.simple(F, '8:76:64:4,4:72:64:4'))

if __name__ == '__main__':
    print('Strided range copy tasklet test')

    # Random inputs matching the array shapes declared above. These rebind
    # the module-level names A-F from access nodes to NumPy arrays.
    A = np.random.rand(2, 16, 4).astype(np.float32)
    B = np.random.rand(4).astype(np.float32)
    C = np.random.rand(2, 16, 4).astype(np.float32)
    D = np.random.rand(128, 128).astype(np.float32)
    E = np.random.rand(8, 8).astype(np.float32)
    F = np.random.rand(128, 128).astype(np.float32)
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. # # This sample shows adding a constant integer value to a stream of integers. # # It is intended for running hardware_emulation or hardware xilinx targets. import dace import numpy as np # add symbol N = dace.symbol('N') # add sdfg sdfg = dace.SDFG('add_fortytwo') # add state state = sdfg.add_state('device_state') # add arrays sdfg.add_array('A', [N], dtype=dace.int32, storage=dace.StorageType.CPU_Heap) sdfg.add_array('B', [N], dtype=dace.int32, storage=dace.StorageType.CPU_Heap) sdfg.add_array('fpga_A', [N], dtype=dace.int32, transient=True, storage=dace.StorageType.FPGA_Global) sdfg.add_array('fpga_B', [N], dtype=dace.int32, transient=True, storage=dace.StorageType.FPGA_Global) # add streams
if os.name == 'nt': dp.Config.append('compiler', 'cpu', 'libs', value='cublas.lib') else: dp.Config.append('compiler', 'cpu', 'libs', value='libcublas.so') ###################################################################### # Create symbols M = dp.symbol('M') K = dp.symbol('K') N = dp.symbol('N') M.set(25) K.set(26) N.set(27) # Create a GPU SDFG with a custom C++ tasklet sdfg = dp.SDFG('cublastest') state = sdfg.add_state() # Add arrays sdfg.add_array('A', [M, K], dtype=dp.float64) sdfg.add_array('B', [K, N], dtype=dp.float64) sdfg.add_array('C', [M, N], dtype=dp.float64) # Add transient GPU arrays sdfg.add_transient('gA', [M, K], dp.float64, dp.StorageType.GPU_Global) sdfg.add_transient('gB', [K, N], dp.float64, dp.StorageType.GPU_Global) sdfg.add_transient('gC', [M, N], dp.float64, dp.StorageType.GPU_Global) # Add custom C++ tasklet to graph tasklet = state.add_tasklet( # Tasklet name (can be arbitrary)
def test_duplicate_codegen():
    """
    Check an SDFG in which one intermediate array feeds two tasklets.

    ``c_task`` copies C into D; ``e_task`` and ``f_task`` both read D and add
    A and B to it respectively. Their outputs are written to E and F with a
    sum write-conflict resolution. With all inputs equal to 1 and zeroed
    outputs, both E and F must end up as 2.
    """
    # Unfortunately I have to generate this graph manually, as doing it with
    # the python frontend wouldn't result in the node ordering that we want
    sdfg = dace.SDFG("dup")
    state = sdfg.add_state()

    c_task = state.add_tasklet("c_task",
                               inputs={"c"},
                               outputs={"d"},
                               code='d = c')
    e_task = state.add_tasklet("e_task",
                               inputs={"a", "d"},
                               outputs={"e"},
                               code="e = a + d")
    f_task = state.add_tasklet("f_task",
                               inputs={"b", "d"},
                               outputs={"f"},
                               code="f = b + d")

    # Six single-element float32 arrays, registered in alphabetical order
    descs = {}
    for arr_name in ("A", "B", "C", "D", "E", "F"):
        descs[arr_name] = sdfg.add_array(arr_name, [1], dace.float32)[1]

    def full(arr_name, **kwargs):
        return Memlet.from_array(arr_name, descs[arr_name], **kwargs)

    node_a = state.add_read("A")
    node_b = state.add_read("B")
    node_c = state.add_read("C")
    node_d = state.add_access("D")
    node_e = state.add_write("E")
    node_f = state.add_write("F")

    state.add_edge(node_c, None, c_task, "c", full("C"))
    state.add_edge(c_task, "d", node_d, None, full("D"))

    state.add_edge(node_a, None, e_task, "a", full("A"))
    state.add_edge(node_b, None, f_task, "b", full("B"))
    # D is deliberately connected to f_task before e_task
    state.add_edge(node_d, None, f_task, "d", full("D"))
    state.add_edge(node_d, None, e_task, "d", full("D"))

    state.add_edge(e_task, "e", node_e, None,
                   full("E", wcr="lambda x, y: x + y"))
    state.add_edge(f_task, "f", node_f, None,
                   full("F", wcr="lambda x, y: x + y"))

    ones = {key: np.array([1], dtype=np.float32) for key in "ABCD"}
    out_e = np.zeros_like(ones["A"])
    out_f = np.zeros_like(ones["A"])

    sdfg(A=ones["A"],
         B=ones["B"],
         C=ones["C"],
         D=ones["D"],
         E=out_e,
         F=out_f)

    assert out_e[0] == 2
    assert out_f[0] == 2
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. """ Two sequential RTL tasklets connected through a memlet. """ import dace import argparse import numpy as np # add sdfg sdfg = dace.SDFG('rtl_multi_tasklet') # add state state = sdfg.add_state() # add arrays sdfg.add_array('A', [1], dtype=dace.int32) sdfg.add_array('B', [1], dtype=dace.int32) sdfg.add_array('C', [1], dtype=dace.int32) # add custom cpp tasklet tasklet0 = state.add_tasklet(name='rtl_tasklet0', inputs={'a'}, outputs={'b'}, code="""\ typedef enum [1:0] {READY, BUSY, DONE} state_e; state_e state; always@(posedge ap_aclk) begin if (ap_areset) begin // case: reset
def make_sdfg():
    """
    Creates three SDFG nested within each other, where two input arrays and two
    output arrays are fed throughout the hierarchy. One input and one output are
    not used for anything in the innermost SDFG, and can thus be removed in all
    nestings.

    In addition, the outer state contains a mapped nested SDFG ("isolated")
    whose only connectors carry the unused arrays, and whose tasklet appends
    the map index to the file "prune_connectors_test.txt".

    :return: the outermost SDFG.
    """

    n = dace.symbol("N")

    sdfg_outer = dace.SDFG("prune_connectors_test")
    # Headers needed by the C++ tasklet of the isolated nested SDFG
    sdfg_outer.set_global_code("#include <fstream>\n#include <mutex>")

    state_outer = sdfg_outer.add_state("state_outer")
    sdfg_outer.add_symbol("N", dace.int32)

    sdfg_middle = dace.SDFG("middle")
    sdfg_middle.add_symbol("N", dace.int32)
    nsdfg_middle = state_outer.add_nested_sdfg(
        sdfg_middle,
        sdfg_outer, {"read_used_middle", "read_unused_middle"},
        {"write_used_middle", "write_unused_middle"},
        name="middle")
    state_middle = sdfg_middle.add_state("middle")

    entry_middle, exit_middle = state_middle.add_map("map_middle",
                                                     {"i": "0:N"})

    sdfg_inner = dace.SDFG("inner")
    sdfg_inner.add_symbol("N", dace.int32)
    nsdfg_inner = state_middle.add_nested_sdfg(
        sdfg_inner,
        sdfg_middle, {"read_used_inner", "read_unused_inner"},
        {"write_used_inner", "write_unused_inner"},
        name="inner")
    state_inner = sdfg_inner.add_state("inner")

    entry_inner, exit_inner = state_inner.add_map("map_inner", {"j": "0:N"})
    tasklet = state_inner.add_tasklet("tasklet", {"read_tasklet"},
                                      {"write_tasklet"},
                                      "write_tasklet = read_tasklet + 1")

    # Declare the used and unused arrays on every nesting level and route
    # them from the outer state down to the connectors of the inner SDFG.
    for s in ["unused", "used"]:

        # Read

        sdfg_outer.add_array(f"read_{s}", [n, n], dace.uint16)
        sdfg_outer.add_array(f"read_{s}_outer", [n, n], dace.uint16)
        sdfg_middle.add_array(f"read_{s}_middle", [n, n], dace.uint16)
        sdfg_inner.add_array(f"read_{s}_inner", [n], dace.uint16)

        read_outer = state_outer.add_read(f"read_{s}")
        read_middle = state_middle.add_read(f"read_{s}_middle")

        state_outer.add_memlet_path(read_outer,
                                    nsdfg_middle,
                                    dst_conn=f"read_{s}_middle",
                                    memlet=dace.Memlet(f"read_{s}[0:N, 0:N]"))
        state_middle.add_memlet_path(
            read_middle,
            entry_middle,
            nsdfg_inner,
            dst_conn=f"read_{s}_inner",
            memlet=dace.Memlet(f"read_{s}_middle[i, 0:N]"))

        # Write

        sdfg_outer.add_array(f"write_{s}", [n, n], dace.uint16)
        sdfg_outer.add_array(f"write_{s}_outer", [n, n], dace.uint16)
        sdfg_middle.add_array(f"write_{s}_middle", [n, n], dace.uint16)
        sdfg_inner.add_array(f"write_{s}_inner", [n], dace.uint16)

        write_outer = state_outer.add_write(f"write_{s}")
        write_middle = state_middle.add_write(f"write_{s}_middle")

        state_outer.add_memlet_path(nsdfg_middle,
                                    write_outer,
                                    src_conn=f"write_{s}_middle",
                                    memlet=dace.Memlet(f"write_{s}[0:N, 0:N]"))
        state_middle.add_memlet_path(
            nsdfg_inner,
            exit_middle,
            write_middle,
            src_conn=f"write_{s}_inner",
            memlet=dace.Memlet(f"write_{s}_middle[i, 0:N]"))

    # Only the "used" arrays are touched inside the innermost SDFG. The
    # container names are spelled out here rather than derived from the loop
    # variable above, so this part does not depend on the iteration order.
    read_inner = state_inner.add_read("read_used_inner")
    write_inner = state_inner.add_write("write_used_inner")

    state_inner.add_memlet_path(read_inner,
                                entry_inner,
                                tasklet,
                                dst_conn="read_tasklet",
                                memlet=dace.Memlet("read_used_inner[j]"))

    state_inner.add_memlet_path(tasklet,
                                exit_inner,
                                write_inner,
                                src_conn="write_tasklet",
                                memlet=dace.Memlet("write_used_inner[j]"))

    # Create mapped nested SDFG where the map entry and exit would be orphaned
    # by pruning the read and write, and must have nedges added to them

    isolated_read = state_outer.add_read("read_unused_outer")
    isolated_write = state_outer.add_write("write_unused_outer")
    isolated_sdfg = dace.SDFG("isolated_sdfg")
    isolated_nsdfg = state_outer.add_nested_sdfg(isolated_sdfg,
                                                 sdfg_outer,
                                                 {"read_unused_isolated"},
                                                 {"write_unused_isolated"},
                                                 name="isolated")
    # Pass the map index i into the nested SDFG
    isolated_sdfg.add_symbol("i", dace.int32)
    isolated_nsdfg.symbol_mapping["i"] = "i"
    isolated_entry, isolated_exit = state_outer.add_map(
        "isolated", {"i": "0:N"})
    state_outer.add_memlet_path(
        isolated_read,
        isolated_entry,
        isolated_nsdfg,
        dst_conn="read_unused_isolated",
        memlet=dace.Memlet("read_unused_outer[0:N, 0:N]"))
    state_outer.add_memlet_path(
        isolated_nsdfg,
        isolated_exit,
        isolated_write,
        src_conn="write_unused_isolated",
        memlet=dace.Memlet("write_unused_outer[0:N, 0:N]"))
    isolated_state = isolated_sdfg.add_state("isolated")
    # The tasklet has no connectors; it appends i to a text file under a
    # function-local static mutex.
    isolated_state.add_tasklet("isolated", {}, {},
                               """\
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
std::ofstream of("prune_connectors_test.txt", std::ofstream::app);
of << i << "\\n";""",
                               language=dace.Language.CPP)

    return sdfg_outer
# Set up map that only has one exit _, me, mx = state.add_mapped_tasklet( 'boundary', dict(i='%s:%s' % (y0, y0 + height), j='%s:%s' % (x0, x0 + width)), {}, '''b = %f''' % initval, dict(b=dace.Memlet.simple(B.data, 'i,j')), external_edges=False) state.add_nedge( mx, B, dace.Memlet.simple(B.data, '%s:%s, %s:%s' % (y0, y0 + height, x0, x0 + width))) ################# sdfg = dace.SDFG('stencilboundaries') # Add arrays and kernel sdfg.add_array('A', [H, W], dace.float32) sdfg.add_array('B', [H, W], dace.float32) sdfg.add_constants({'KERNEL': STENCIL_KERNEL}) mainstate = sdfg.add_state() # The 7x7 stencil _, me, mx = mainstate.add_mapped_tasklet( 'stencil', dict(i='3:H-3', j='3:W-3'), dict(a=dace.Memlet.simple('A', 'i-3:i+4, j-3:j+4')), ''' b = 0
def make_sdfg(implementation,
              dtype,
              storage=dace.StorageType.Default,
              data_layout='CCC'):
    """
    Build an SDFG that multiplies ``x`` (m x k) by ``y`` (k x n) into
    ``result`` (m x n) with a ``MatMul`` library node.

    With the default storage, the three arrays are the SDFG arguments. With
    any other storage, the library node works on transient ``*_device``
    arrays and two extra states copy the inputs in before, and the result
    out after, the computation.

    :param implementation: not used by this function body.
    :param dtype: element type of all arrays and of the library node.
    :param storage: storage type of the arrays the library node operates on.
    :param data_layout: 3-character string with either C (for row major) or
                        F (for column major) for x, y, and z respectively.
    :return: the constructed SDFG.
    """
    m = dace.symbol("m")
    n = dace.symbol("n")
    k = dace.symbol("k")

    on_device = storage != dace.StorageType.Default
    suffix = "_device" if on_device else ""
    x_name = "x" + suffix
    y_name = "y" + suffix
    z_name = "result" + suffix

    sdfg = dace.SDFG("mm_{}_{}".format(dtype.type.__name__, data_layout))
    state = sdfg.add_state("dataflow")

    def strides_for(layout, rows, cols):
        # Row major: the row stride is the column count. Column major: the
        # column stride is the row count.
        if layout == 'C':
            return (cols, 1)
        return (1, rows)

    operands = (
        (x_name, [m, k], strides_for(data_layout[0], m, k)),
        (y_name, [k, n], strides_for(data_layout[1], k, n)),
        (z_name, [m, n], strides_for(data_layout[2], m, n)),
    )
    for arr_name, arr_shape, arr_strides in operands:
        sdfg.add_array(arr_name,
                       arr_shape,
                       dtype,
                       storage=storage,
                       transient=on_device,
                       strides=arr_strides)

    x = state.add_read(x_name)
    y = state.add_read(y_name)
    result = state.add_write(z_name)

    node = blas.nodes.matmul.MatMul("matmul", dtype)

    state.add_memlet_path(x,
                          node,
                          dst_conn="_a",
                          memlet=Memlet.simple(x, "0:m, 0:k"))
    state.add_memlet_path(y,
                          node,
                          dst_conn="_b",
                          memlet=Memlet.simple(y, "0:k, 0:n"))
    state.add_memlet_path(node,
                          result,
                          src_conn="_c",
                          memlet=Memlet.simple(result, "0:m, 0:n"))

    if not on_device:
        return sdfg

    # Host-side arrays
    sdfg.add_array("x", [m, k], dtype)
    sdfg.add_array("y", [k, n], dtype)
    sdfg.add_array("result", [m, n], dtype)

    # Copy the inputs in before the computation
    init_state = sdfg.add_state("copy_to_device")
    sdfg.add_edge(init_state, state, dace.InterstateEdge())

    x_host = init_state.add_read("x")
    y_host = init_state.add_read("y")
    x_device = init_state.add_write(x_name)
    y_device = init_state.add_write(y_name)

    init_state.add_memlet_path(x_host,
                               x_device,
                               memlet=Memlet.simple(x_host, "0:m, 0:k"))
    init_state.add_memlet_path(y_host,
                               y_device,
                               memlet=Memlet.simple(y_host, "0:k, 0:n"))

    # Copy the result out after the computation
    finalize_state = sdfg.add_state("copy_to_host")
    sdfg.add_edge(state, finalize_state, dace.InterstateEdge())

    result_device = finalize_state.add_write(z_name)
    result_host = finalize_state.add_read("result")

    finalize_state.add_memlet_path(result_device,
                                   result_host,
                                   memlet=Memlet.simple(
                                       result_device, "0:m, 0:n"))

    return sdfg
import dace
import numpy as np

# Builds a single-state SDFG in which a map over i in 0:31 feeds A[i] into a
# tasklet computing b = 2*a, and every result is written to B[0] with a
# summing write-conflict resolution.
sdfg = dace.SDFG('addedgepair')
state = sdfg.add_state()

# Add nodes
t = state.add_tasklet('do', {'a'}, {'b'}, 'b = 2*a')
a = state.add_array('A', [31], dace.float64)
b = state.add_array('B', [1], dace.float64)
me, mx = state.add_map('m', dict(i='0:31'))

# Add edges
# Input side: A -> map entry -> tasklet connector 'a'.
state.add_edge_pair(me, t, a, dace.Memlet.simple(a, 'i'), internal_connector='a')
# Output side: tasklet connector 'b' -> map exit -> B, accumulating with a+b.
state.add_edge_pair(mx,
                    t,
                    b,
                    dace.Memlet.simple(b, '0', wcr_str='lambda a,b: a+b'),
                    internal_connector='b',
                    scope_connector='o')

if __name__ == '__main__':
    A = np.random.rand(31).astype(np.float64)
    # B starts at zero so the accumulated value equals the sum of 2*A.
    B = np.array([0.], dtype=np.float64)
    sdfg(A=A, B=B)
    # NOTE(review): `diff` is computed but not checked in the code visible
    # here -- confirm the result is verified afterwards.
    diff = np.linalg.norm(B[0] - np.sum(2 * A))
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved.
import copy
import dace
import dace.sdfg.nodes
import numpy as np

# Python version of the SDFG below
# @dace.program
# def reduce_with_offsets(A: dace.float64[50, 50], B: dace.float64[25]):
#     B[4:11] = dace.reduce(lambda a,b: a+b, A[25:50, 13:20], axis=0,
#                           identity=0)

reduce_with_offsets = dace.SDFG('reduce_with_offsets')
reduce_with_offsets.add_array('A', [50, 50], dace.float64)
reduce_with_offsets.add_array('B', [25], dace.float64)

state = reduce_with_offsets.add_state()
node_a = state.add_read('A')
node_b = state.add_write('B')
# Sum-reduction over axis 0 with identity 0.
red = state.add_reduce('lambda a,b: a+b', [0], 0)
# Only the A[25:50, 13:20] window is reduced, and only B[4:11] is written.
state.add_nedge(node_a, red, dace.Memlet.simple('A', '25:50, 13:20'))
state.add_nedge(red, node_b, dace.Memlet.simple('B', '4:11'))


def test_offset_reduce():
    """Run the offset reduction and compare the written slice with NumPy."""
    A = np.random.rand(50, 50)
    B = np.random.rand(25)

    # Work on a copy so the module-level SDFG stays untouched.
    sdfg = copy.deepcopy(reduce_with_offsets)
    sdfg(A=A, B=B)

    # The reduction writes B[4:11] from the column sums of A[25:50, 13:20].
    assert np.allclose(B[4:11], np.sum(A[25:50, 13:20], axis=0))
def expansion(node, parent_state, parent_sdfg):
    """Expand a stencil node into an SDFG holding one map around one tasklet.

    The tasklet code is ``node.code`` with subscript accesses rewritten to
    memlet names by ``SubscriptConverter``, prefixed by generated boundary
    handling and followed by assignments to the output connectors.

    :param node: Node being expanded; read for ``label``, ``name``,
                 ``shape``, ``code``, ``accesses``, ``output_fields`` and
                 ``boundary_conditions``.
    :param parent_state: State containing ``node``; its in/out edges are used
                         to find the data types of the connected fields.
    :param parent_sdfg: SDFG containing ``parent_state``.
    :return: The new SDFG, with its parent set to ``parent_state``.
    :raises KeyError: If an access listed in ``node.accesses`` does not occur
                      in the code.
    :raises ValueError: If a boundary condition type is not "copy",
                        "constant" or "shrink".
    """
    sdfg = dace.SDFG(node.label + "_outer")
    state = sdfg.add_state(node.label + "_outer")

    shape = np.array(node.shape)

    # One map parameter per dimension, at most three.
    parameters = np.array(["i", "j", "k"])[:len(shape)]

    # Find outer data descriptor
    field_dtype = {}
    for e in parent_state.in_edges(node):
        field = e.dst_conn
        if field in node.accesses:
            field_dtype[field] = parent_sdfg.data(dace.sdfg.find_input_arraynode(parent_state, e).data).dtype
    for e in parent_state.out_edges(node):
        field = e.src_conn
        if field in node.output_fields:
            field_dtype[field] = parent_sdfg.data(dace.sdfg.find_output_arraynode(parent_state, e).data).dtype

    #######################################################################
    # Tasklet code generation
    #######################################################################

    code = node.code.as_string

    # Replace relative indices with memlet names
    converter = SubscriptConverter()
    new_ast = converter.visit(ast.parse(code))
    code = astunparse.unparse(new_ast)
    code_memlet_names = converter.names

    #######################################################################
    # Implement boundary conditions
    #######################################################################

    boundary_code = ""
    # Loop over each input
    for field_name, (iterators, accesses) in node.accesses.items():
        if sum(iterators, 0) == 0:
            continue  # Scalar input
        # Loop over each access to this data
        for indices in accesses:
            try:
                memlet_name = code_memlet_names[field_name][indices]
            except KeyError:
                # Report the missing access instead of dropping into a
                # debugger, which would block non-interactive runs.
                raise KeyError("Missing access in code: {}[{}]".format(field_name, ", ".join(map(str, indices))))
            cond = []
            # Loop over each index of this access
            for i, offset in enumerate(indices):
                if offset < 0:
                    cond.append(parameters[i] + " < " + str(-offset))
                elif offset > 0:
                    cond.append(parameters[i] + " >= " + str(shape[i] - offset))
            if len(cond) == 0:
                # Access can never leave the domain: plain read.
                boundary_code += "{} = {}_in\n".format(memlet_name, memlet_name)
            else:
                bc = node.boundary_conditions[field_name]
                btype = bc["btype"]
                if btype == "copy":
                    # NOTE(review): `center` is not defined in this function;
                    # it must come from an enclosing scope -- confirm.
                    center_memlet = code_memlet_names[field_name][center]
                    boundary_val = "_{}".format(center_memlet)
                elif btype == "constant":
                    boundary_val = bc["value"]
                elif btype == "shrink":
                    # We don't need to do anything here, it's up to the
                    # user to not use the junk output
                    boundary_val = JUNK_VAL
                else:
                    raise ValueError("Unsupported boundary condition type: {}".format(
                        node.boundary_conditions[field_name]["btype"]))
                boundary_code += ("{} = {} if {} else {}_in\n".format(memlet_name, boundary_val, " or ".join(cond),
                                                                      memlet_name))

    #######################################################################
    # Write all output memlets
    #######################################################################

    # Outputs are only written at the zero offset in every dimension.
    origin = tuple(0 for _ in range(len(shape)))
    write_code = "\n".join("{}_out = {}".format(code_memlet_names[output][origin], code_memlet_names[output][origin])
                           for output in node.output_fields)

    code = boundary_code + "\n" + code + "\n" + write_code

    input_memlets = sum(
        [
            ["{}_in".format(c) for c in v.values()] for k, v in code_memlet_names.items()
            # Don't include scalar variables
            if k in node.accesses and sum(node.accesses[k][0], 0) > 0
        ],
        [])
    output_memlets = sum(
        [["{}_out".format(c) for c in v.values()] for k, v in code_memlet_names.items() if k in node.output_fields],
        [])

    #######################################################################
    # Create tasklet
    #######################################################################

    tasklet = state.add_tasklet(node.label + "_compute",
                                input_memlets,
                                output_memlets,
                                code,
                                language=dace.dtypes.Language.Python)

    #######################################################################
    # Build dataflow state
    #######################################################################

    entry, exit = state.add_map(
        node.name + "_map",
        collections.OrderedDict((parameters[i], "0:" + str(shape[i])) for i in range(len(shape))))

    for field in code_memlet_names:
        dtype = field_dtype[field]
        if field in node.accesses:
            read_node = state.add_read(field)
            input_dims = node.accesses[field][0]
            # Keep only the dimensions this input actually spans.
            input_shape = tuple(s for s, v in zip(shape, input_dims) if v)
            sdfg.add_array(field, input_shape, dtype)
            field_parameters = tuple(p for p, v in zip(parameters, input_dims) if v)
            for indices, connector in code_memlet_names[field].items():
                access_str = ", ".join("{} + ({})".format(p, i) for p, i in zip(field_parameters, indices))
                memlet = dace.Memlet.simple(field, access_str, num_accesses=-1)
                # Offsets may point outside the array near the boundary; the
                # generated boundary code picks a replacement value there.
                memlet.allow_oob = True
                state.add_memlet_path(read_node, entry, tasklet, dst_conn=connector + "_in", memlet=memlet)
        else:
            sdfg.add_array(field, shape, dtype)
            write_node = state.add_write(field)
            for indices, connector in code_memlet_names[field].items():
                state.add_memlet_path(tasklet,
                                      exit,
                                      write_node,
                                      src_conn=connector + "_out",
                                      memlet=dace.Memlet.simple(field, ", ".join(parameters)))

    # Add scalars as symbols
    for field_name, (indices, accesses) in node.accesses.items():
        if not any(indices):
            sdfg.add_symbol(field_name, parent_sdfg.symbols[field_name])

    #######################################################################

    sdfg.parent = parent_state
    sdfg._parent_sdfg = parent_sdfg  # TODO: this should not be necessary

    return sdfg
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np sdfg = dace.SDFG('inline_nonsink_access_test') sdfg.add_array('A', [1], dace.float32) sdfg.add_array('B', [1], dace.float32) state = sdfg.add_state() A = state.add_access('A') B = state.add_access('B') B_out = state.add_write('B') t = state.add_tasklet('add', {'a', 'b'}, {'c'}, 'c = a + b') state.add_edge(A, None, t, 'a', dace.Memlet.simple('A', '0')) state.add_edge(B, None, t, 'b', dace.Memlet.simple('B', '0')) state.add_edge(t, 'c', B_out, None, dace.Memlet.simple('B', '0')) # Add nested SDFG nsdfg = dace.SDFG('nested_ina_test') nsdfg.add_array('C', [1], dace.float32) nsdfg.add_array('D', [1], dace.float32) nstate = nsdfg.add_state() t_init = nstate.add_tasklet('init', {}, {'o'}, 'o = 2') t_square = nstate.add_tasklet('square', {'i'}, {'o'}, 'o = i * i') t_cube = nstate.add_tasklet('cube', {'i'}, {'o'}, 'o = i * i * i') C = nstate.add_access('C') C2 = nstate.add_access('C') D = nstate.add_write('D')
def __init__(self, builder):
    """Initialize the editor state and wire it to the widgets in ``builder``.

    :param builder: Object whose ``get_object`` is used to look up the
                    drawing area, property label and property grid.
    """
    # One (image file, button type, tool name) triple per toolbar button.
    button_specs = (
        ("cursor.png", "mouse", "Mouse"),
        ("delete.png", "delete", "Delete"),
        ("array.png", "node", "Array"),
        ("edge_thin.png", "edge", "Memlet"),
        ("map.png", "node", "Map"),
        ("tasklet.png", "node", "Tasklet"),
        ("stream.png", "node", "Stream"),
        ("stream_map.png", "node", "Consume"),
        ("state.png", "node", "State"),
        ("state_trans.png", "edge", "State Transition"),
        ("edge_head_redir.png", "edge_redir", "Head Redirection"),
        ("edge_tail_redir.png", "edge_redir", "Tail Redirection"),
    )
    self.buttons = [{"image": image, "type": kind, "tool": tool} for image, kind, tool in button_specs]

    self.active_tool = None  # an element of self.buttons
    self.builder = builder
    self.current_editing_script = ""
    self.sdfg_changed = False

    # Start from a valid SDFG so that the functions using it never have to
    # deal with None.
    self.sdfg = dace.SDFG("newsdfg", OrderedDict(), {})

    # Nothing is selected for edge creation or redirection yet.
    self.first_selected_node_for_edge = None
    self.first_selected_state_for_edge = None
    self.selected_edge_for_redir = None

    # The rendered graph draws onto the editor's drawing area.
    self.rendered_sdfg = RenderedGraph()
    self.sdfg_da = self.builder.get_object("sdfg_editor_da")
    self.rendered_sdfg.set_drawing_area(self.sdfg_da)

    # Property panel; it calls back into OnSDFGUpdate.
    property_label = self.builder.get_object("se_propertylabel")
    property_grid = self.builder.get_object("se_propertygrid")
    self.propren = PropertyRenderer(property_label, property_grid, self.OnSDFGUpdate)

    self.image_store = ImageStore()
    self.load_buttons()
    self.connect_signals()
# Copyright 2019-2020 ETH Zurich and the DaCe authors. All rights reserved. import dace as dp import numpy as np sdfg = dp.SDFG('fib_consume') state = sdfg.add_state('state') # Arrays initial_value = state.add_array('iv', [1], dp.int32) stream = state.add_stream('S', dp.int32, transient=True) stream_init = state.add_stream('S', dp.int32, transient=True) stream_out = state.add_stream('S', dp.int32, transient=True) output = state.add_array('res', [1], dp.float32) # Consume and tasklet consume_entry, consume_exit = state.add_consume('cons', ('p', '4')) tasklet = state.add_tasklet( 'fibonacci', {'s'}, {'sout', 'val'}, """ if s == 1: val = 1 elif s > 1: sout = s - 1 # Recurse by pushing smaller values sout = s - 2 """) # Edges state.add_nedge(initial_value, stream_init, dp.Memlet.from_array(stream_init.data, stream_init.desc(sdfg))) state.add_edge(stream, None, consume_entry, 'IN_stream', dp.Memlet.from_array(stream.data, stream.desc(sdfg))) state.add_edge(consume_entry, 'OUT_stream', tasklet, 's',
def test_nested_sdfg():
    """Check two consecutive tasklets inside a nested SDFG under an outer map.

    The inner tasklets compute ``5*aa`` followed by ``2*cc``, so the output is
    expected to be ten times the input, both before and after
    ``simplify()``.
    """
    print('SDFG consecutive tasklet (nested SDFG) test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    in_data = dp.ndarray([N], dp.int32)
    out_data = dp.ndarray([N], dp.int32)
    in_data[:] = dp.int32(5)
    out_data[:] = dp.int32(0)

    # Construct outer SDFG
    mysdfg = SDFG('ctasklet_nested_sdfg')
    outer_state = mysdfg.add_state()
    outer_a = outer_state.add_array('A', [N], dp.int32)
    outer_b = outer_state.add_array('B', [N], dp.int32)

    # Construct inner SDFG
    inner_sdfg = dp.SDFG('ctasklet_nested_sdfg_inner')
    inner_state = inner_sdfg.add_state()
    inner_a = inner_state.add_array('a', [N], dp.int32)
    inner_b = inner_state.add_array('b', [N], dp.int32)
    inner_entry, inner_exit = inner_state.add_map('mymap', dict(i='0:N/2'))

    # First tasklet reads the half of `a` selected by the outer index k.
    times_five = inner_state.add_tasklet('mytasklet', {'aa'}, {'bb'}, 'bb = 5*aa')
    inner_state.add_memlet_path(inner_a, inner_entry, times_five, dst_conn='aa', memlet=Memlet('a[k*N/2+i]'))

    # Second tasklet consumes the first one's output through an empty memlet.
    times_two = inner_state.add_tasklet('mytasklet2', {'cc'}, {'dd'}, 'dd = 2*cc')
    inner_state.add_edge(times_five, 'bb', times_two, 'cc', Memlet())
    inner_state.add_memlet_path(times_two, inner_exit, inner_b, src_conn='dd', memlet=Memlet('b[k*N/2+i]'))

    # Add outer edges
    outer_entry, outer_exit = outer_state.add_map('omap', dict(k='0:2'))
    nested_node = outer_state.add_nested_sdfg(inner_sdfg, None, {'a'}, {'b'})
    outer_state.add_memlet_path(outer_a, outer_entry, nested_node, dst_conn='a', memlet=Memlet('A[0:N]'))
    outer_state.add_memlet_path(nested_node, outer_exit, outer_b, src_conn='b', memlet=Memlet('B[0:N]'))

    def run_and_check():
        # Execute the SDFG and compare against the expected 10x scaling.
        mysdfg(A=in_data, B=out_data, N=N)
        diff = np.linalg.norm(10 * in_data - out_data) / N.get()
        print("Difference:", diff)
        assert diff <= 1e-5

    mysdfg.validate()
    run_and_check()

    mysdfg.simplify()
    run_and_check()
def apply(self, sdfg):
    """Replace the matched map with a nested SDFG that runs it as a loop.

    The nested SDFG has the states begin -> guard -> body -> guard/end; the
    contents of the map scope are moved into ``body`` and the map's outer
    edges are reattached to the new nested SDFG node.

    :param sdfg: SDFG containing the state addressed by ``self.state_id``.
    :raises NotImplementedError: If data on an edge entering or leaving the
                                 map is neither an Array nor a Scalar.
    """
    # Retrieve map entry and exit nodes.
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[MapToForLoop._map_entry]]
    map_exits = graph.exit_nodes(map_entry)

    # Only the first map parameter and its range are turned into the loop.
    loop_idx = map_entry.map.params[0]
    loop_from, loop_to, loop_step = map_entry.map.range[0]

    nested_sdfg = dace.SDFG(graph.label + '_' + map_entry.map.label)

    # Construct nested SDFG
    begin = nested_sdfg.add_state('begin')
    guard = nested_sdfg.add_state('guard')
    body = nested_sdfg.add_state('body')
    end = nested_sdfg.add_state('end')

    # begin -> guard initializes the loop index.
    nested_sdfg.add_edge(
        begin, guard,
        edges.InterstateEdge(assignments={str(loop_idx): str(loop_from)}))
    # guard -> body while the index is at most loop_to (inclusive bound).
    nested_sdfg.add_edge(
        guard, body,
        edges.InterstateEdge(condition = str(loop_idx) + ' <= ' + \
                                         str(loop_to))
    )
    # guard -> end once the index exceeds loop_to.
    nested_sdfg.add_edge(
        guard, end,
        edges.InterstateEdge(condition = str(loop_idx) + ' > ' + \
                                         str(loop_to))
    )
    # body -> guard advances the index by the map's step.
    nested_sdfg.add_edge(
        body, guard,
        edges.InterstateEdge(assignments = {str(loop_idx): str(loop_idx) + \
                                            ' + ' +str(loop_step)})
    )

    # Add map contents
    map_subgraph = graph.scope_subgraph(map_entry)
    for node in map_subgraph.nodes():
        if node is not map_entry and node not in map_exits:
            body.add_node(node)
    # Copy only the edges that do not touch the map entry/exit nodes.
    for src, src_conn, dst, dst_conn, memlet in map_subgraph.edges():
        if src is not map_entry and dst not in map_exits:
            body.add_edge(src, src_conn, dst, dst_conn, memlet)

    # Reconnect inputs
    # All three dictionaries are keyed by the position of the edge in
    # graph.in_edges(map_entry).
    nested_in_data_nodes = {}
    nested_in_connectors = {}
    nested_in_memlets = {}
    for i, edge in enumerate(graph.in_edges(map_entry)):
        src, src_conn, dst, dst_conn, memlet = edge
        data_label = '_in_' + memlet.data
        memdata = sdfg.arrays[memlet.data]
        # NOTE(review): the new descriptors are registered on `sdfg`, not on
        # `nested_sdfg` -- confirm this is intended.
        if isinstance(memdata, data.Array):
            data_array = sdfg.add_array(data_label, memdata.dtype, [
                symbolic.overapproximate(r)
                for r in memlet.bounding_box_size()
            ])
        elif isinstance(memdata, data.Scalar):
            data_array = sdfg.add_scalar(data_label, memdata.dtype)
        else:
            raise NotImplementedError()
        data_node = nodes.AccessNode(data_label)
        body.add_node(data_node)
        nested_in_data_nodes.update({i: data_node})
        nested_in_connectors.update({i: data_label})
        nested_in_memlets.update({i: memlet})
        # Point memlets already in the body at the renamed data.
        for _, _, _, _, old_memlet in body.edges():
            if old_memlet.data == memlet.data:
                old_memlet.data = data_label
        #body.add_edge(data_node, None, dst, dst_conn, memlet)

    # Reconnect outputs
    nested_out_data_nodes = {}
    nested_out_connectors = {}
    nested_out_memlets = {}
    for map_exit in map_exits:
        # NOTE(review): `i` restarts at 0 for every map exit, so entries from
        # an earlier exit are overwritten if there is more than one.
        for i, edge in enumerate(graph.out_edges(map_exit)):
            src, src_conn, dst, dst_conn, memlet = edge
            data_label = '_out_' + memlet.data
            memdata = sdfg.arrays[memlet.data]
            if isinstance(memdata, data.Array):
                data_array = sdfg.add_array(data_label, memdata.dtype, [
                    symbolic.overapproximate(r)
                    for r in memlet.bounding_box_size()
                ])
            elif isinstance(memdata, data.Scalar):
                data_array = sdfg.add_scalar(data_label, memdata.dtype)
            else:
                raise NotImplementedError()
            data_node = nodes.AccessNode(data_label)
            body.add_node(data_node)
            nested_out_data_nodes.update({i: data_node})
            nested_out_connectors.update({i: data_label})
            nested_out_memlets.update({i: memlet})
            for _, _, _, _, old_memlet in body.edges():
                if old_memlet.data == memlet.data:
                    old_memlet.data = data_label
            #body.add_edge(src, src_conn, data_node, None, memlet)

    # Add nested SDFG and reconnect it
    nested_node = graph.add_nested_sdfg(
        nested_sdfg, sdfg, set(nested_in_connectors.values()),
        set(nested_out_connectors.values()))

    # Former inputs of the map entry now feed the nested SDFG node.
    for i, edge in enumerate(graph.in_edges(map_entry)):
        src, src_conn, dst, dst_conn, memlet = edge
        graph.add_edge(src, src_conn, nested_node, nested_in_connectors[i],
                       nested_in_memlets[i])

    # Former outputs of the map exits now leave the nested SDFG node.
    for map_exit in map_exits:
        for i, edge in enumerate(graph.out_edges(map_exit)):
            src, src_conn, dst, dst_conn, memlet = edge
            graph.add_edge(nested_node, nested_out_connectors[i], dst,
                           dst_conn, nested_out_memlets[i])

    # Edges that left the map entry now start at the new input access nodes.
    for src, src_conn, dst, dst_conn, memlet in graph.out_edges(map_entry):
        # Drops a 4-character connector prefix and converts the 1-based
        # number that follows (presumably "OUT_<n>" -- TODO confirm).
        i = int(src_conn[4:]) - 1
        new_memlet = dcpy(memlet)
        new_memlet.data = nested_in_data_nodes[i].data
        body.add_edge(nested_in_data_nodes[i], None, dst, dst_conn,
                      new_memlet)

    # Edges that entered a map exit now end at the new output access nodes.
    for map_exit in map_exits:
        for src, src_conn, dst, dst_conn, memlet in graph.in_edges(
                map_exit):
            # Drops a 3-character connector prefix (presumably "IN_<n>" --
            # TODO confirm).
            i = int(dst_conn[3:]) - 1
            new_memlet = dcpy(memlet)
            new_memlet.data = nested_out_data_nodes[i].data
            body.add_edge(src, src_conn, nested_out_data_nodes[i], None,
                          new_memlet)

    # Finally remove the original map scope from the outer state.
    for node in map_subgraph:
        graph.remove_node(node)
def expansion(node, parent_state, parent_sdfg):
    """Expand a stencil node into a pipelined SDFG with explicit buffers.

    The outer SDFG holds a Pipeline scope around a nested SDFG with three
    states executed in order: shift (moves every buffer forward), update
    (writes the newly read value into the buffer) and compute (runs the
    stencil code on the buffered values).

    :param node: Node being expanded; read for ``label``, ``code`` and
                 ``boundary_conditions``.
    :param parent_state: State containing ``node``.
    :param parent_sdfg: SDFG containing ``parent_state``; source of data
                        descriptors and symbols.
    :return: The outer SDFG.
    :raises NotImplementedError: If an output is accessed at a non-zero
                                 offset.
    """
    sdfg = dace.SDFG(node.label + "_outer")
    state = sdfg.add_state(node.label + "_outer")

    (inputs, outputs, shape, field_to_data, field_to_desc, field_to_edge,
     vector_lengths) = parse_connectors(node, parent_state, parent_sdfg)

    #######################################################################
    # Parse the tasklet code
    #######################################################################

    # Replace relative indices with memlet names
    converter = SubscriptConverter()

    # Add copy boundary conditions
    for field in node.boundary_conditions:
        if node.boundary_conditions[field]["btype"] == "copy":
            center_index = tuple(0 for _ in range(
                len(parent_sdfg.arrays[field_to_data[field]].shape)))
            # This will register the renaming
            converter.convert(field, center_index)

    # Replace accesses in the code
    code, field_accesses = parse_accesses(node.code.as_string, outputs)

    iterator_mapping = make_iterator_mapping(node, field_accesses, shape)
    vector_length = validate_vector_lengths(vector_lengths, iterator_mapping)
    # Only the last dimension is divided by the vector length.
    shape_vectorized = tuple(s / vector_length if i == len(shape) - 1 else s
                             for i, s in enumerate(shape))

    # Extract which fields to read from streams and what to buffer
    buffer_sizes = collections.OrderedDict()
    buffer_accesses = collections.OrderedDict()
    scalars = {}  # {name: type}
    for field_name in inputs:
        relative = field_accesses[field_name]
        dim_mask = iterator_mapping[field_name]
        if not any(dim_mask):
            # This is a scalar, no buffer needed. Instead, the SDFG must
            # take this as a symbol
            scalars[field_name] = parent_sdfg.symbols[field_name]
            sdfg.add_symbol(field_name, parent_sdfg.symbols[field_name])
            continue
        # A "copy" boundary condition additionally needs the center (0).
        abs_indices = ([
            dim_to_abs_val(i, tuple(s for s, m in zip(shape, dim_mask) if m),
                           parent_sdfg) for i in relative
        ] + ([0] if field_name in node.boundary_conditions
             and node.boundary_conditions[field_name]["btype"] == "copy" else
             []))
        max_access = max(abs_indices)
        min_access = min(abs_indices)
        buffer_size = max_access - min_access + vector_lengths[field_name]
        buffer_sizes[field_name] = buffer_size

        # (indices relative to center, buffer indices, center index)
        buffer_accesses[field_name] = ([tuple(r) for r in relative], [
            i - min_access for i in abs_indices
        ], -min_access)

    # Create a initialization phase corresponding to the highest distance
    # to the center
    init_sizes = [
        (buffer_sizes[key] - vector_lengths[key] - val[2]) // vector_length
        for key, val in buffer_accesses.items()
    ]
    init_size_max = int(np.max(init_sizes))

    parameters = [f"_i{i}" for i in range(len(shape))]

    # Dimensions we need to iterate over
    iterator_mask = np.array([s != 0 and s != 1 for s in shape], dtype=bool)
    iterators = make_iterators(
        tuple(s for s, m in zip(shape_vectorized, iterator_mask) if m),
        parameters=tuple(s for s, m in zip(parameters, iterator_mask) if m))

    # Manually add pipeline entry and exit nodes
    pipeline_range = dace.properties.SubsetProperty.from_string(', '.join(
        iterators.values()))
    pipeline = dace.sdfg.nodes.Pipeline(
        "compute_" + node.label,
        list(iterators.keys()),
        pipeline_range,
        dace.dtypes.ScheduleType.FPGA_Device,
        False,
        init_size=init_size_max,
        init_overlap=False,
        drain_size=init_size_max,
        drain_overlap=True)
    entry = dace.sdfg.nodes.PipelineEntry(pipeline)
    exit = dace.sdfg.nodes.PipelineExit(pipeline)
    state.add_nodes_from([entry, exit])

    # Add nested SDFG to do 1) shift buffers 2) read from input 3) compute
    nested_sdfg = dace.SDFG(node.label + "_inner", parent=state)
    nested_sdfg_tasklet = state.add_nested_sdfg(
        nested_sdfg,
        sdfg,
        # Input connectors
        [k + "_in" for k in inputs if any(iterator_mapping[k])] +
        [name + "_buffer_in" for name, _ in buffer_sizes.items()],
        # Output connectors
        [k + "_out" for k in outputs] +
        [name + "_buffer_out" for name, _ in buffer_sizes.items()],
        schedule=dace.ScheduleType.FPGA_Device)
    # Propagate symbols
    for sym_name, sym_type in parent_sdfg.symbols.items():
        nested_sdfg.add_symbol(sym_name, sym_type)
        nested_sdfg_tasklet.symbol_mapping[sym_name] = sym_name
    # Map iterators
    for p in parameters:
        nested_sdfg.add_symbol(p, dace.int64)
        nested_sdfg_tasklet.symbol_mapping[p] = p

    # Shift state, which shifts all buffers by one
    shift_state = nested_sdfg.add_state(node.label + "_shift")

    # Update state, which reads new values from memory
    update_state = nested_sdfg.add_state(node.label + "_update")

    #######################################################################
    # Implement boundary conditions
    #######################################################################

    boundary_code, oob_cond = generate_boundary_conditions(
        node, shape, field_accesses, field_to_desc, iterator_mapping)

    #######################################################################
    # Only write if we're in bounds
    #######################################################################

    write_code = ("\n".join([
        "{}_inner_out = {}\n".format(
            output,
            field_accesses[output][tuple(0 for _ in range(len(shape)))])
        for output in outputs
    ]))
    # `write_cond` becomes the guard prefix of the conditional-write tasklets
    # created for each output further down; empty means always write.
    if init_size_max > 0 or len(oob_cond) > 0:
        write_cond = []
        if init_size_max > 0:
            init_cond = pipeline.init_condition()
            write_cond.append("not " + init_cond)
            nested_sdfg_tasklet.symbol_mapping[init_cond] = init_cond
            nested_sdfg.add_symbol(init_cond, dace.bool)
        if len(oob_cond) > 0:
            oob_cond = " or ".join(sorted(oob_cond))
            oob_cond = f"not ({oob_cond})"
            write_cond.append(oob_cond)
        write_cond = " and ".join(write_cond)
        write_cond = f"if {write_cond}:\n\t"
    else:
        write_cond = ""

    code = boundary_code + "\n" + code + "\n" + write_code

    #######################################################################
    # Create DaCe compute state
    #######################################################################

    # Compute state, which reads from input channels, performs the compute,
    # and writes to the output channel(s)
    compute_state = nested_sdfg.add_state(node.label + "_compute")
    compute_inputs = list(
        itertools.chain.from_iterable(
            [["_" + v for v in field_accesses[f].values()] for f in inputs
             if any(iterator_mapping[f])]))
    compute_tasklet = compute_state.add_tasklet(
        node.label + "_compute",
        compute_inputs, {name + "_inner_out"
                         for name in outputs},
        code,
        language=dace.dtypes.Language.Python)
    # The unrolled map only exists when vectorizing; every later use of
    # compute_unroll_entry/exit is guarded by the same condition.
    if vector_length > 1:
        compute_unroll_entry, compute_unroll_exit = compute_state.add_map(
            compute_state.label + "_unroll",
            {"i_unroll": f"0:{vector_length}"},
            schedule=dace.ScheduleType.FPGA_Device,
            unroll=True)

    # Connect the three nested states
    nested_sdfg.add_edge(shift_state, update_state,
                         dace.sdfg.InterstateEdge())
    nested_sdfg.add_edge(update_state, compute_state,
                         dace.sdfg.InterstateEdge())

    # First, grab scalar variables
    for scalar, scalar_type in scalars.items():
        nested_sdfg.add_symbol(scalar, scalar_type)

    # Code to increment custom iterators
    iterator_code = ""

    for (field_name, size), init_size in zip(buffer_sizes.items(),
                                             init_sizes):

        data_name = field_to_data[field_name]
        connector = field_to_edge[field_name].dst_conn
        data_name_outer = connector
        data_name_inner = field_name + "_in"
        desc_outer = parent_sdfg.arrays[data_name].clone()
        desc_outer.transient = False
        sdfg.add_datadesc(data_name_outer, desc_outer)

        mapping = iterator_mapping[field_name]
        is_array = not isinstance(desc_outer, dt.Stream)

        # If this array is part of the initialization phase, it needs its
        # own iterator, which we need to instantiate and increment in the
        # outer SDFG
        if is_array:
            if init_size == 0:
                field_index = [s for s, p in zip(parameters, mapping) if p]
            else:
                # Create custom iterators for this array
                num_dims = sum(mapping, 0)
                field_iterators = [(f"_{field_name}_i{i}", shape[i])
                                   for i in range(num_dims) if mapping[i]]
                start_index = init_size_max - init_size
                tab = ""
                if start_index > 0:
                    iterator_code += (
                        f"if {pipeline.iterator_str()} >= {start_index}:\n"
                    )
                    tab += " "
                # `tab` grows after every iterator, so each following
                # iterator is emitted inside the previous one's else branch
                # and only advances when the previous one wraps to 0.
                for i, (it, s) in enumerate(reversed(field_iterators)):
                    iterator_code += f"""\
{tab}if {it} < {s} - 1:
{tab} {it} = {it} + 1
{tab}else:
{tab} {it} = 0\n"""
                    tab += " "
                field_index = [fi[0] for fi in field_iterators]
                for fi in field_index:
                    pipeline.additional_iterators[fi] = "0"
                    nested_sdfg.add_symbol(fi, dace.int64)
                    nested_sdfg_tasklet.symbol_mapping[fi] = fi
            field_index = ", ".join(field_index)
        else:
            # Anything that is a stream is always accessed at index 0.
            field_index = "0"

        # Begin reading according to this field's own buffer size, which is
        # translated to an index by subtracting it from the maximum buffer
        # size
        begin_reading = init_size_max - init_size
        total_size = functools.reduce(operator.mul, shape_vectorized, 1)
        end_reading = total_size + init_size_max - init_size

        # Outer memory read
        read_node_outer = state.add_read(data_name_outer)
        if begin_reading != 0 or end_reading != total_size + init_size_max:
            # Reads outside [begin_reading, end_reading) are skipped by
            # routing the value through a guarded tasklet and a scalar.
            sdfg.add_scalar(f"{field_name}_wavefront",
                            desc_outer.dtype,
                            storage=dace.StorageType.FPGA_Local,
                            transient=True)
            wavefront_access = state.add_access(f"{field_name}_wavefront")
            condition = []
            it = pipeline.iterator_str()
            if begin_reading != 0:
                condition.append(f"{it} >= {begin_reading}")
            if end_reading != total_size + init_size_max:
                condition.append(f"{it} < {end_reading}")
            condition = " and ".join(condition)
            update_tasklet = state.add_tasklet(
                f"read_{field_name}", {"wavefront_in"}, {"wavefront_out"},
                f"if {condition}:\n"
                "\twavefront_out = wavefront_in\n",
                language=dace.dtypes.Language.Python)
            state.add_memlet_path(read_node_outer,
                                  entry,
                                  update_tasklet,
                                  dst_conn="wavefront_in",
                                  memlet=dace.Memlet(
                                      f"{data_name_outer}[{field_index}]",
                                      dynamic=True))
            state.add_memlet_path(update_tasklet,
                                  wavefront_access,
                                  src_conn="wavefront_out",
                                  memlet=dace.Memlet(
                                      f"{field_name}_wavefront",
                                      dynamic=True))
            state.add_memlet_path(
                wavefront_access,
                nested_sdfg_tasklet,
                dst_conn=f"{field_name}_in",
                memlet=dace.Memlet(f"{field_name}_wavefront"))
        else:
            state.add_memlet_path(
                read_node_outer,
                entry,
                nested_sdfg_tasklet,
                dst_conn=f"{field_name}_in",
                memlet=dace.Memlet(f"{data_name_outer}[{field_index}]"))

        # Create inner memory access
        nested_sdfg.add_scalar(data_name_inner,
                               desc_outer.dtype,
                               storage=dace.StorageType.FPGA_Local,
                               transient=False)

        buffer_name_outer = f"{node.label}_{field_name}_buffer"
        buffer_name_inner_read = f"{field_name}_buffer_in"
        buffer_name_inner_write = f"{field_name}_buffer_out"

        # Create buffer transient in outer SDFG
        field_dtype = parent_sdfg.data(data_name).dtype
        # From here on `desc_outer` refers to the buffer descriptor.
        _, desc_outer = sdfg.add_array(
            buffer_name_outer, (size, ),
            field_dtype.base_type,
            storage=dace.dtypes.StorageType.FPGA_Local,
            transient=True)

        # Create read and write nodes
        read_node_outer = state.add_read(buffer_name_outer)
        write_node_outer = state.add_write(buffer_name_outer)

        # Outer buffer read
        state.add_memlet_path(
            read_node_outer,
            entry,
            nested_sdfg_tasklet,
            dst_conn=buffer_name_inner_read,
            memlet=dace.Memlet(f"{buffer_name_outer}[0:{size}]"))

        # Outer buffer write
        state.add_memlet_path(nested_sdfg_tasklet,
                              exit,
                              write_node_outer,
                              src_conn=buffer_name_inner_write,
                              memlet=dace.Memlet(
                                  f"{write_node_outer.data}[0:{size}]",
                                  dynamic=True))

        # Inner copy
        desc_inner_read = desc_outer.clone()
        desc_inner_read.transient = False
        desc_inner_read.name = buffer_name_inner_read
        desc_inner_write = desc_inner_read.clone()
        desc_inner_write.name = buffer_name_inner_write
        nested_sdfg.add_datadesc(buffer_name_inner_read, desc_inner_read)
        nested_sdfg.add_datadesc(buffer_name_inner_write, desc_inner_write)

        # Make shift state if necessary
        if size > 1:
            shift_read = shift_state.add_read(buffer_name_inner_read)
            shift_write = shift_state.add_write(buffer_name_inner_write)
            shift_entry, shift_exit = shift_state.add_map(
                f"shift_{field_name}",
                {"i_shift": f"0:{size} - {vector_lengths[field_name]}"},
                schedule=dace.dtypes.ScheduleType.FPGA_Device,
                unroll=True)
            shift_tasklet = shift_state.add_tasklet(
                f"shift_{field_name}", {f"{field_name}_shift_in"},
                {f"{field_name}_shift_out"},
                f"{field_name}_shift_out = {field_name}_shift_in")
            # Element i_shift takes the value vector_lengths[field_name]
            # positions further along the buffer.
            shift_state.add_memlet_path(
                shift_read,
                shift_entry,
                shift_tasklet,
                dst_conn=field_name + "_shift_in",
                memlet=dace.Memlet(
                    f"{shift_read.data}"
                    f"[i_shift + {vector_lengths[field_name]}]"))
            shift_state.add_memlet_path(
                shift_tasklet,
                shift_exit,
                shift_write,
                src_conn=field_name + "_shift_out",
                memlet=dace.Memlet(f"{shift_write.data}[i_shift]"))

        # Make update state
        update_read = update_state.add_read(data_name_inner)
        update_write = update_state.add_write(buffer_name_inner_write)
        # The new value lands in the last `vector_length` buffer entries.
        subset = f"{size} - {vector_length}:{size}" if size > 1 else "0"
        update_state.add_memlet_path(update_read,
                                     update_write,
                                     memlet=dace.Memlet(
                                         f"{update_read.data}",
                                         other_subset=f"{subset}"))

        # Make compute state
        compute_read = compute_state.add_read(buffer_name_inner_read)
        for relative, offset in zip(buffer_accesses[field_name][0],
                                    buffer_accesses[field_name][1]):
            memlet_name = field_accesses[field_name][tuple(relative)]
            if vector_length > 1:
                if vector_lengths[field_name] > 1:
                    offset = f"{offset} + i_unroll"
                else:
                    offset = str(offset)
                path = [
                    compute_read, compute_unroll_entry, compute_tasklet
                ]
            else:
                offset = str(offset)
                path = [compute_read, compute_tasklet]
            compute_state.add_memlet_path(
                *path,
                dst_conn="_" + memlet_name,
                memlet=dace.Memlet(f"{compute_read.data}[{offset}]"))

    # Tasklet to update iterators
    if iterator_code:
        update_iterator_tasklet = state.add_tasklet(
            f"{node.label}_update_iterators", {}, {}, iterator_code)
        # Empty memlets place the tasklet after the nested SDFG and before
        # the pipeline exit.
        state.add_memlet_path(nested_sdfg_tasklet,
                              update_iterator_tasklet,
                              memlet=dace.Memlet())
        state.add_memlet_path(update_iterator_tasklet,
                              exit,
                              memlet=dace.Memlet())

    for field_name in outputs:

        for offset in field_accesses[field_name]:
            if offset is not None and list(offset) != [0] * len(offset):
                raise NotImplementedError("Output offsets not implemented")

        data_name = field_to_data[field_name]

        # Outer write
        data_name_outer = field_name
        data_name_inner = field_name + "_out"
        desc_outer = parent_sdfg.arrays[data_name].clone()
        desc_outer.transient = False
        array_index = ", ".join(map(str, parameters))
        try:
            sdfg.add_datadesc(data_name_outer, desc_outer)
        except NameError:  # Already an input
            pass

        # Create inner access
        nested_sdfg.add_scalar(data_name_inner,
                               desc_outer.dtype,
                               storage=dace.StorageType.FPGA_Local,
                               transient=False)

        # Inner write
        write_node_inner = compute_state.add_write(data_name_inner)

        # Intermediate buffer, mostly relevant for vectorization
        output_buffer_name = field_name + "_output_buffer"
        nested_sdfg.add_array(output_buffer_name, (vector_length, ),
                              desc_outer.dtype.base_type,
                              storage=dace.StorageType.FPGA_Registers,
                              transient=True)
        output_buffer = compute_state.add_access(output_buffer_name)

        # If vectorized, we need to pass through the unrolled scope
        if vector_length > 1:
            compute_state.add_memlet_path(
                compute_tasklet,
                compute_unroll_exit,
                output_buffer,
                src_conn=field_name + "_inner_out",
                memlet=dace.Memlet(f"{output_buffer_name}[i_unroll]"))
        else:
            compute_state.add_memlet_path(
                compute_tasklet,
                output_buffer,
                src_conn=field_name + "_inner_out",
                memlet=dace.Memlet(f"{output_buffer_name}[0]")),

        # Final memlet to the output
        compute_state.add_memlet_path(
            output_buffer,
            write_node_inner,
            memlet=dace.Memlet(f"{write_node_inner.data}")),

        # Conditional write tasklet
        sdfg.add_scalar(f"{field_name}_result",
                        desc_outer.dtype,
                        storage=dace.StorageType.FPGA_Local,
                        transient=True)
        output_access = state.add_access(f"{field_name}_result")
        state.add_memlet_path(nested_sdfg_tasklet,
                              output_access,
                              src_conn=data_name_inner,
                              memlet=dace.Memlet(f"{field_name}_result"))
        # `write_cond` is either empty or an "if ...:" prefix, so the
        # assignment below is only executed when writing is allowed.
        output_tasklet = state.add_tasklet(
            f"{field_name}_conditional_write", {f"_{field_name}_result"},
            {f"_{data_name_inner}"},
            (write_cond + f"_{data_name_inner} = _{field_name}_result"))
        state.add_memlet_path(output_access,
                              output_tasklet,
                              dst_conn=f"_{field_name}_result",
                              memlet=dace.Memlet(f"{field_name}_result"))
        write_node_outer = state.add_write(data_name_outer)
        if isinstance(desc_outer, dt.Stream):
            subset = "0"
        else:
            subset = array_index
        state.add_memlet_path(output_tasklet,
                              exit,
                              write_node_outer,
                              src_conn=f"_{data_name_inner}",
                              memlet=dace.Memlet(
                                  f"{write_node_outer.data}[{subset}]",
                                  dynamic=True)),

    return sdfg
def make_sdfg(name="transpose"):
    """Build an SDFG that transposes an N x M float64 matrix on an FPGA.

    The SDFG consists of three states chained in sequence:

    * ``<name>_pre``  copies the host array ``a_input`` (N x M) to the
      transient device array ``a_input_device``.
    * ``<name>``      runs a map over ``i`` and ``j`` with a trivial tasklet
      that reads ``a_input_device[i, j]`` and writes ``a_output_device[j, i]``.
    * ``<name>_post`` copies ``a_output_device`` back to the host array
      ``a_output`` (M x N).

    :param name: Name of the SDFG; also used as the prefix of the state,
                 map and tasklet names.
    :return: The constructed SDFG.
    """
    n = dace.symbol("N")
    m = dace.symbol("M")

    sdfg = dace.SDFG(name)

    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    # Host-side containers: the output has the transposed shape of the input
    _, desc_input_host = sdfg.add_array("a_input", (n, m), dace.float64)
    _, desc_output_host = sdfg.add_array("a_output", (m, n), dace.float64)

    # Device-side copies, placed in FPGA global memory on separate banks
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_input_device", desc_input_device)
    sdfg.add_datadesc("a_output_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a_input")
    pre_write = pre_state.add_write("a_input_device")
    pre_state.add_memlet_path(pre_read,
                              pre_write,
                              memlet=dace.Memlet.simple(pre_write, "0:N, 0:M"))

    # Device to host. The output is declared with shape (M, N), so the copied
    # subset must follow that dimension order (not the input's "0:N, 0:M").
    post_read = post_state.add_read("a_output_device")
    post_write = post_state.add_write("a_output")
    post_state.add_memlet_path(post_read,
                               post_write,
                               memlet=dace.Memlet.simple(
                                   post_write, "0:M, 0:N"))

    # Compute state
    read = state.add_read("a_input_device")
    write = state.add_write("a_output_device")

    # Trivial tasklet
    tasklet = state.add_tasklet(name, {"_in"}, {"_out"}, "_out = _in")

    entry, exit = state.add_map(name, {
        "i": "0:N",
        "j": "0:M",
    },
                                schedule=dace.ScheduleType.FPGA_Device)

    # Element (i, j) of the input is written to element (j, i) of the output
    state.add_memlet_path(read,
                          entry,
                          tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet.simple("a_input_device",
                                                    "i, j",
                                                    num_accesses=1))
    state.add_memlet_path(tasklet,
                          exit,
                          write,
                          src_conn="_out",
                          memlet=dace.Memlet.simple("a_output_device",
                                                    "j, i",
                                                    num_accesses=1))

    return sdfg
# Copyright 2019-2022 ETH Zurich and the DaCe authors. All rights reserved.
""" SDFG API sample that showcases nested SDFG creation. """
import dace
import numpy as np

# Create outer SDFG
sdfg = dace.SDFG('nested_main')

# Add global array
sdfg.add_array('A', [2], dace.float32)


# Sample state contents
def mystate(state, src, dst):
    """Populate ``state`` with a tasklet computing ``dst[0] = src[0] + 1``.

    :param state: The SDFG state to add the nodes and memlets to.
    :param src: Name of the data container that is read (element 0).
    :param dst: Name of the data container that is written (element 0).
    """
    reader = state.add_read(src)
    writer = state.add_write(dst)
    increment = state.add_tasklet('aaa2', {'a'}, {'b'}, 'b = a + 1')

    # input path (src->tasklet[a])
    in_memlet = dace.Memlet(data=src, subset='0')
    state.add_memlet_path(reader, increment, dst_conn='a', memlet=in_memlet)

    # output path (tasklet[b]->dst)
    out_memlet = dace.Memlet(data=dst, subset='0')
    state.add_memlet_path(increment, writer, src_conn='b', memlet=out_memlet)


# Create nested SDFG
def make_sdfg(squeeze, name):
    """Build the memlet-propagation test SDFG and propagate its memlets.

    The top-level SDFG holds an array ``A`` of shape ``[N + 1, M]`` and a map
    over ``j`` in ``1:M``. Inside the map sits a nested SDFG with a loop over
    ``i`` (``i = 0; i < N - 2; i + 1``) in which two tasklets write
    ``i + 99`` and ``i + 101``. The outer memlets are deliberately
    over-approximated; ``propagation.propagate_memlets_sdfg`` is run on the
    result before it is returned.

    :param squeeze: If truthy, the nested SDFG uses two strided 1D arrays
                    (``a1`` and ``a2``); otherwise it uses a single 2D
                    array ``a``.
    :param name: Suffix for the SDFG name (``memlet_propagation_<name>``).
    :return: The SDFG after memlet propagation.
    """
    N, M = dace.symbol('N'), dace.symbol('M')

    # Resolve everything that depends on the squeeze mode up front
    if squeeze:
        first_array, second_array = 'a1', 'a2'
        first_subset, second_subset = 'a1[i]', 'a2[i]'
        nested_outputs = {'a1', 'a2'}
        outer_writes = [
            # This is expected to propagate to A[0:N - 2, j].
            ('a1', 'A[0:N+1, j]'),
            # This is expected to propagate to A[2:N, j - 1].
            ('a2', 'A[2:N+1, j-1]'),
        ]
    else:
        first_array = second_array = 'a'
        first_subset, second_subset = 'a[i, 1]', 'a[i+2, 0]'
        nested_outputs = {'a'}
        outer_writes = [
            # This memlet is expected to propagate to A[0:N, j - 1:j + 1].
            ('a', 'A[0:N+1, j-1:j+1]'),
        ]

    # Top-level SDFG with the map over j
    outer_sdfg = dace.SDFG('memlet_propagation_%s' % name)
    outer_sdfg.add_symbol('N', dace.int64)
    outer_sdfg.add_symbol('M', dace.int64)
    outer_sdfg.add_array('A', [N + 1, M], dace.int64)
    outer_state = outer_sdfg.add_state()
    map_entry, map_exit = outer_state.add_map('map', {'j': '1:M'})
    outer_write = outer_state.add_write('A')

    # Create nested SDFG
    inner_sdfg = dace.SDFG('nested')
    if squeeze:
        inner_sdfg.add_array('a1', [N + 1], dace.int64, strides=[M])
        inner_sdfg.add_array('a2', [N - 1], dace.int64, strides=[M])
    else:
        inner_sdfg.add_array('a', [N + 1, M], dace.int64)
    inner_state = inner_sdfg.add_state()
    first_write = inner_state.add_write(first_array)
    second_write = inner_state.add_write(second_array)
    add99 = inner_state.add_tasklet('add99', {}, {'out'}, 'out = i + 99')
    add101 = inner_state.add_tasklet('add101', {}, {'out'}, 'out = i + 101')
    inner_state.add_edge(add99, 'out', first_write, None,
                         dace.Memlet(first_subset))
    inner_state.add_edge(add101, 'out', second_write, None,
                         dace.Memlet(second_subset))
    inner_sdfg.add_loop(None, inner_state, None, 'i', '0', 'i < N - 2',
                        'i + 1')

    # Connect nested SDFG to toplevel one
    nested_node = outer_state.add_nested_sdfg(
        inner_sdfg,
        None, {},
        nested_outputs,
        symbol_mapping={
            'j': 'j',
            'N': 'N',
            'M': 'M'
        })
    outer_state.add_nedge(map_entry, nested_node, dace.Memlet())

    # Add outer memlet(s) that are overapproximated
    for connector, expression in outer_writes:
        outer_state.add_memlet_path(nested_node,
                                    map_exit,
                                    outer_write,
                                    src_conn=connector,
                                    memlet=dace.Memlet(expression))

    propagation.propagate_memlets_sdfg(outer_sdfg)

    return outer_sdfg
def make_sdfg(name="fpga_stcl_test", dtype=dace.float32, veclen=8):
    """Build a vectorized 4-point stencil SDFG using an FPGA pipeline scope.

    The SDFG has three chained states: ``<name>_pre`` copies host array ``a``
    to ``a_device``, ``<name>`` computes the stencil, and ``<name>_post``
    copies ``b_device`` back to host array ``b``. In the compute state the
    input is streamed from memory into a shift register, a map over ``u``
    (created with ``unroll=True``) averages the north/west/east/south
    neighbors (out-of-domain neighbors are replaced by 1), and the packed
    result is streamed back to memory. Reads are suppressed while the
    pipeline drains and writes are suppressed while it initializes.

    :param name: Name of the SDFG; also the prefix of state, map and tasklet
                 names.
    :param dtype: Scalar element type of the stencil data.
    :param veclen: Vector width; arrays ``a`` and ``b`` have shape
                   ``(N, M / veclen)`` with elements of
                   ``dace.vector(dtype, veclen)``.
    :return: The constructed SDFG.
    """
    vtype = dace.vector(dtype, veclen)

    n = dace.symbol("N")
    m = dace.symbol("M")

    sdfg = dace.SDFG(name)

    pre_state = sdfg.add_state(name + "_pre")
    state = sdfg.add_state(name)
    post_state = sdfg.add_state(name + "_post")
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    _, desc_input_host = sdfg.add_array("a", (n, m / veclen), vtype)
    _, desc_output_host = sdfg.add_array("b", (n, m / veclen), vtype)
    desc_input_device = copy.copy(desc_input_host)
    desc_input_device.storage = dace.StorageType.FPGA_Global
    desc_input_device.location["bank"] = 0
    desc_input_device.transient = True
    desc_output_device = copy.copy(desc_output_host)
    desc_output_device.storage = dace.StorageType.FPGA_Global
    desc_output_device.location["bank"] = 1
    desc_output_device.transient = True
    sdfg.add_datadesc("a_device", desc_input_device)
    sdfg.add_datadesc("b_device", desc_output_device)

    # Host to device
    pre_read = pre_state.add_read("a")
    pre_write = pre_state.add_write("a_device")
    pre_state.add_memlet_path(
        pre_read,
        pre_write,
        memlet=dace.Memlet(f"a_device[0:N, 0:M/{veclen}]"))

    # Device to host
    post_read = post_state.add_read("b_device")
    post_write = post_state.add_write("b")
    post_state.add_memlet_path(
        post_read,
        post_write,
        memlet=dace.Memlet(f"b_device[0:N, 0:M/{veclen}]"))

    # Compute state
    read_memory = state.add_read("a_device")
    write_memory = state.add_write("b_device")

    # Memory streams
    sdfg.add_stream("a_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    sdfg.add_stream("b_stream",
                    vtype,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    produce_input_stream = state.add_write("a_stream")
    consume_input_stream = state.add_read("a_stream")
    produce_output_stream = state.add_write("b_stream")
    # This node is only read from (it feeds the copy back to memory), so it
    # is created as a read node, mirroring consume_input_stream.
    consume_output_stream = state.add_read("b_stream")

    tasklet = state.add_tasklet(
        name, {"_north", "_west", "_east", "_south"}, {"result"}, """\
north = _north if i >= 1 else 1
west = _west if {W}*j + u >= 1 else 1
east = _east if {W}*j + u < M - 1 else 1
south = _south if i < N - 1 else 1

result = 0.25 * (north + west + east + south)""".format(W=veclen))

    entry, exit = state.add_pipeline(name, {
        "i": "0:N",
        "j": "0:M/{}".format(veclen),
    },
                                     schedule=dace.ScheduleType.FPGA_Device,
                                     init_size=m / veclen,
                                     init_overlap=False,
                                     drain_size=m / veclen,
                                     drain_overlap=True)

    # Unrolled map
    unroll_entry, unroll_exit = state.add_map(
        name + "_unroll", {"u": "0:{}".format(veclen)},
        schedule=dace.ScheduleType.FPGA_Device,
        unroll=True)

    # Container-to-container copies between arrays and streams
    state.add_memlet_path(read_memory,
                          produce_input_stream,
                          memlet=dace.Memlet(
                              f"{read_memory.data}[0:N, 0:M/{veclen}]",
                              other_subset="0"))
    # The memlet is given as a single expression string, exactly like the
    # array-to-stream copy above; passing the container name as an extra
    # leading positional argument would shift the expression into the
    # ``data`` parameter.
    state.add_memlet_path(consume_output_stream,
                          write_memory,
                          memlet=dace.Memlet(
                              f"{write_memory.data}[0:N, 0:M/{veclen}]",
                              other_subset="0"))

    # Container-to-container copy from vectorized stream to non-vectorized
    # buffer
    sdfg.add_array("input_buffer", (1, ),
                   vtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("shift_register", (2 * m + veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_ShiftRegister,
                   transient=True)
    sdfg.add_array("output_buffer", (veclen, ),
                   dtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    sdfg.add_array("output_buffer_packed", (1, ),
                   vtype,
                   storage=dace.StorageType.FPGA_Local,
                   transient=True)
    input_buffer = state.add_access("input_buffer")
    shift_register = state.add_access("shift_register")
    output_buffer = state.add_access("output_buffer")
    output_buffer_packed = state.add_access("output_buffer_packed")

    # Only read from the input stream while the pipeline is not draining
    read_tasklet = state.add_tasklet(
        name + "_conditional_read", {"_in"}, {"_out"},
        "if not {}:\n\t_out = _in".format(entry.pipeline.drain_condition()))

    # Input stream to buffer
    state.add_memlet_path(consume_input_stream,
                          entry,
                          read_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(
                              f"{consume_input_stream.data}[0]",
                              dynamic=True))
    state.add_memlet_path(read_tasklet,
                          input_buffer,
                          src_conn="_out",
                          memlet=dace.Memlet(f"{input_buffer.data}[0]"))
    # New vector enters at the tail of the shift register
    state.add_memlet_path(input_buffer,
                          shift_register,
                          memlet=dace.Memlet(
                              f"{input_buffer.data}[0]",
                              other_subset=f"2*M:(2*M + {veclen})"))

    # Stencils accesses
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_north",
        memlet=dace.Memlet(f"{shift_register.data}[u]"))  # North
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_west",
        memlet=dace.Memlet(f"{shift_register.data}[u + M - 1]"))  # West
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_east",
        memlet=dace.Memlet(f"{shift_register.data}[u + M + 1]"))  # East
    state.add_memlet_path(
        shift_register,
        unroll_entry,
        tasklet,
        dst_conn="_south",
        memlet=dace.Memlet(f"{shift_register.data}[u + 2 * M]"))  # South

    # Tasklet to buffer
    state.add_memlet_path(tasklet,
                          unroll_exit,
                          output_buffer,
                          src_conn="result",
                          memlet=dace.Memlet(f"{output_buffer.data}[u]"))

    # Pack buffer
    state.add_memlet_path(output_buffer,
                          output_buffer_packed,
                          memlet=dace.Memlet(
                              f"{output_buffer_packed.data}[0]",
                              other_subset=f"0:{veclen}"))

    # Only write if not initializing
    write_tasklet = state.add_tasklet(
        name + "_conditional_write", {"_in"}, {"_out"},
        "if not {}:\n\t_out = _in".format(entry.pipeline.init_condition()))

    # Packed buffer to conditional-write tasklet
    state.add_memlet_path(output_buffer_packed,
                          write_tasklet,
                          dst_conn="_in",
                          memlet=dace.Memlet(
                              f"{output_buffer_packed.data}[0]"))

    # Buffer to output stream
    state.add_memlet_path(write_tasklet,
                          exit,
                          produce_output_stream,
                          src_conn="_out",
                          memlet=dace.Memlet(
                              f"{produce_output_stream.data}[0]",
                              dynamic=True))

    return sdfg
def test_tasklet_array():
    """
    Run the simple array sample: a SystemVerilog tasklet that adds 42 to
    every element of an int32 array, and check the result against NumPy.
    """
    # SystemVerilog body of the tasklet: accept one input word, emit it
    # incremented by 42, then become ready for the next word
    rtl_code = '''
        always@(posedge ap_aclk) begin
            if (ap_areset) begin
                s_axis_a_tready <= 1;
                m_axis_b_tvalid <= 0;
                m_axis_b_tdata <= 0;
            end else if (s_axis_a_tvalid && s_axis_a_tready) begin
                s_axis_a_tready <= 0;
                m_axis_b_tvalid <= 1;
                m_axis_b_tdata <= s_axis_a_tdata + 42;
            end else if (m_axis_b_tvalid && m_axis_b_tready) begin
                s_axis_a_tready <= 1;
                m_axis_b_tvalid <= 0;
                m_axis_b_tdata <= 0;
            end
        end
        '''

    # problem size, bound to the symbol N
    n = 128
    N = dace.symbol('N')
    N.set(n)
    length = N.get()

    # SDFG with a single state
    sdfg = dace.SDFG('rtl_tasklet_array')
    state = sdfg.add_state()

    # input and output containers
    sdfg.add_array('A', [N], dtype=dace.int32)
    sdfg.add_array('B', [N], dtype=dace.int32)

    # the RTL tasklet itself
    rtl_tasklet = state.add_tasklet(name='rtl_tasklet',
                                    inputs={'a'},
                                    outputs={'b'},
                                    code=rtl_code,
                                    language=dace.Language.SystemVerilog)

    # access nodes, wired to the tasklet connectors
    node_a = state.add_read('A')
    node_b = state.add_write('B')
    state.add_edge(node_a, None, rtl_tasklet, 'a', dace.Memlet('A[0:N]'))
    state.add_edge(rtl_tasklet, 'b', node_b, None, dace.Memlet('B[0:N]'))

    # fix N and validate the graph
    sdfg.specialize({'N': length})
    sdfg.validate()

    # random input, zero-initialized output
    a = np.random.randint(0, 100, length).astype(np.int32)
    b = np.zeros(length, dtype=np.int32)

    # run the program
    sdfg(A=a, B=b)

    # every output element must be the input element plus 42
    assert np.array_equal(b, a + 42)