from dace import symbolic
from dace.memlet import Memlet
from dace.sdfg import SDFG


def create_batch_gemm_sdfg(dtype, strides):
    sdfg = SDFG('einsum')
    state = sdfg.add_state()
    M, K, N = (symbolic.symbol(s) for s in ['M', 'K', 'N'])
    BATCH, sAM, sAK, sAB, sBK, sBN, sBB, sCM, sCN, sCB = (
        symbolic.symbol(s) if symbolic.issymbolic(strides[s]) else strides[s]
        for s in [
            'BATCH', 'sAM', 'sAK', 'sAB', 'sBK', 'sBN', 'sBB', 'sCM', 'sCN',
            'sCB'
        ])

    batched = strides['BATCH'] != 1

    _, xarr = sdfg.add_array(
        'X',
        dtype=dtype,
        shape=[BATCH, M, K] if batched else [M, K],
        strides=[sAB, sAM, sAK] if batched else [sAM, sAK])
    _, yarr = sdfg.add_array(
        'Y',
        dtype=dtype,
        shape=[BATCH, K, N] if batched else [K, N],
        strides=[sBB, sBK, sBN] if batched else [sBK, sBN])
    _, zarr = sdfg.add_array(
        'Z',
        dtype=dtype,
        shape=[BATCH, M, N] if batched else [M, N],
        strides=[sCB, sCM, sCN] if batched else [sCM, sCN])

    gX = state.add_read('X')
    gY = state.add_read('Y')
    gZ = state.add_write('Z')

    import dace.libraries.blas as blas  # Avoid import loop

    libnode = blas.MatMul('einsum_gemm')
    state.add_node(libnode)
    state.add_edge(gX, None, libnode, '_a', Memlet.from_array(gX.data, xarr))
    state.add_edge(gY, None, libnode, '_b', Memlet.from_array(gY.data, yarr))
    state.add_edge(libnode, '_c', gZ, None, Memlet.from_array(gZ.data, zarr))

    return sdfg
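
# A minimal, untested usage sketch (not from the original source): builds the
# stride dictionary for contiguous, row-major batched operands and runs the
# SDFG. The concrete sizes, and passing M/K/N as symbol values at call time,
# are assumptions.
import numpy as np
import dace

B, M, K, N = 4, 32, 16, 8
strides = {
    'BATCH': B,
    'sAB': M * K, 'sAM': K, 'sAK': 1,  # X: [B, M, K]
    'sBB': K * N, 'sBK': N, 'sBN': 1,  # Y: [B, K, N]
    'sCB': M * N, 'sCM': N, 'sCN': 1,  # Z: [B, M, N]
}
bmm = create_batch_gemm_sdfg(dace.float64, strides)

X = np.random.rand(B, M, K)
Y = np.random.rand(B, K, N)
Z = np.zeros((B, M, N))
bmm(X=X, Y=Y, Z=Z, M=M, K=K, N=N)
assert np.allclose(Z, X @ Y)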
import numpy as np

import dace as dp
from dace.memlet import Memlet
from dace.sdfg import SDFG


def test():
    print('Dynamic SDFG test with vectorization and min')
    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    input = np.random.rand(N.get()).astype(np.float32)
    input2 = np.random.rand(N.get()).astype(np.float32)
    output = dp.ndarray([N], dp.float32)
    output[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('myvmin')
    mysdfg.add_array('A', [N], dp.float32)
    mysdfg.add_array('B', [N], dp.float32)
    mysdfg.add_array('C', [N], dp.float32)
    state = mysdfg.add_state()
    A = state.add_access('A')
    B = state.add_access('B')
    C = state.add_access('C')

    tasklet, map_entry, map_exit = state.add_mapped_tasklet(
        'mytasklet', dict(i='0:N:2'),
        dict(a=Memlet.simple(A, 'i'), b=Memlet.simple(B, 'i')),
        'c = min(a, b)', dict(c=Memlet.simple(C, 'i')))

    # Manually vectorize tasklet
    tasklet.in_connectors['a'] = dp.vector(dp.float32, 2)
    tasklet.in_connectors['b'] = dp.vector(dp.float32, 2)
    tasklet.out_connectors['c'] = dp.vector(dp.float32, 2)

    # Add outer edges
    state.add_edge(A, None, map_entry, None, Memlet.simple(A, '0:N'))
    state.add_edge(B, None, map_entry, None, Memlet.simple(B, '0:N'))
    state.add_edge(map_exit, None, C, None, Memlet.simple(C, '0:N'))

    mysdfg(A=input, B=input2, C=output, N=N)

    diff = np.linalg.norm(np.minimum(input, input2) - output) / N.get()
    print("Difference:", diff)
    print("==== Program end ====")
    assert diff <= 1e-5
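
# Note (not from the original source): instead of rewriting the tasklet
# connectors and map stride by hand as above, a scalar (stride-1) map can
# usually be widened with DaCe's Vectorization transformation; `vector_len`
# is its width property. A sketch, assuming `mysdfg` holds the scalar
# version of the map:
#
#     from dace.transformation.dataflow import Vectorization
#     mysdfg.apply_transformations(Vectorization, options={'vector_len': 2})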
import numpy as np

import dace
from dace import dtypes, memlet, subsets
from dace.sdfg import SDFG
from dace.transformation.dataflow import MapUnroll
from dace.transformation.interstate import InlineSDFG


def test_3_interface_to_2_banks():
    sdfg = SDFG("test_4_interface_to_2_banks")
    state = sdfg.add_state()
    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = "HBM"
    desc_a.location["bank"] = "0:2"
    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")
    t1 = state.add_tasklet("r1", set(["_x1", "_x2"]), set(["_y1"]),
                           "_y1 = _x1 + _x2")
    m1_in, m1_out = state.add_map("m", {"k": "0:2"},
                                  dtypes.ScheduleType.Unrolled)
    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[0, 0]"),
                          dst_conn="_x1")
    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[1, 0]"),
                          dst_conn="_x2")
    state.add_memlet_path(t1,
                          m1_out,
                          acc_write1,
                          memlet=memlet.Memlet("a[0, 1]"),
                          src_conn="_y1")
    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1
    for node in sdfg.states()[0].nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            sdfg.states()[0].out_edges(
                node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    bank_assignment = sdfg.generate_code()[3].clean_code
    assert bank_assignment.count("sp") == 6
    assert bank_assignment.count("HBM[0]") == 3
    assert bank_assignment.count("HBM[1]") == 3

    a = np.zeros([2, 2], np.int32)
    a[0, 0] = 2
    a[1, 0] = 3
    sdfg(a=a)
    assert a[0, 1] == 5

    return sdfg
import numpy as np

import dace
from dace import dtypes, memlet, subsets
from dace.config import set_temporary
from dace.sdfg import SDFG
from dace.transformation.dataflow import MapUnroll
from dace.transformation.interstate import InlineSDFG


def four_interface_to_2_banks(mem_type, decouple_interfaces):
    sdfg = SDFG("test_4_interface_to_2_banks_" + mem_type)
    state = sdfg.add_state()
    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = mem_type
    desc_a.location["bank"] = "0:2"
    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")
    t1 = state.add_tasklet("r1", set(["_x1", "_x2"]), set(["_y1"]),
                           "_y1 = _x1 + _x2")
    m1_in, m1_out = state.add_map("m", {"k": "0:2"},
                                  dtypes.ScheduleType.Unrolled)
    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[0, 0]"),
                          dst_conn="_x1")
    state.add_memlet_path(acc_read1,
                          m1_in,
                          t1,
                          memlet=memlet.Memlet("a[1, 0]"),
                          dst_conn="_x2")
    state.add_memlet_path(t1,
                          m1_out,
                          acc_write1,
                          memlet=memlet.Memlet("a[0, 1]"),
                          src_conn="_y1")
    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1
    for node in sdfg.states()[0].nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            sdfg.states()[0].out_edges(
                node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces",
                       value=decouple_interfaces):
        bank_assignment = sdfg.generate_code()[3].clean_code
        # If array interfaces are not decoupled, fewer memory interfaces are
        # used. Note: the conditional expressions below must be parenthesized;
        # otherwise `assert x == 6 if cond else 4` parses as
        # `assert (x == 6) if cond else 4` and silently passes when cond is
        # False.
        assert bank_assignment.count("sp") == (6 if decouple_interfaces else 4)
        assert bank_assignment.count(mem_type +
                                     "[0]") == (3 if decouple_interfaces else 2)
        assert bank_assignment.count(mem_type +
                                     "[1]") == (3 if decouple_interfaces else 2)

    a = np.zeros([2, 2], np.int32)
    a[0, 0] = 2
    a[1, 0] = 3
    sdfg(a=a)
    assert a[0, 1] == 5

    return sdfg
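
# A minimal driver sketch (not from the original source); upstream this
# helper is typically invoked through pytest parametrization over the
# supported memory types:
if __name__ == "__main__":
    four_interface_to_2_banks("HBM", decouple_interfaces=True)
    four_interface_to_2_banks("DDR", decouple_interfaces=False)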
from collections import OrderedDict
from copy import deepcopy
from itertools import chain, repeat

import numpy as np
import onnx
from onnx import numpy_helper

import dace
from dace import nodes as nd
from dace.dtypes import AccessType, AllocationLifetime, StorageType
from dace.sdfg import SDFG
from dace.symbolic import pystr_to_symbolic

# Helpers such as clean_onnx_name, convert_attribute_proto, get_onnx_node,
# has_onnx_node, onnx_tensor_type_to_typeclass, ONNXParameterType,
# _nested_HasField, infer_symbols_from_shapes and numba are provided by the
# surrounding package and are assumed to be in scope here.


class ONNXModel:
    """Loads an ONNX model into an SDFG."""
    def __init__(self, name, model: onnx.ModelProto, cuda=False):
        """
        Constructs a new ONNXImporter.
        :param name: the name for the SDFG.
        :param model: the model to import.
        :param cuda: if `True`, weights will be passed as cuda arrays.
        """
        graph: onnx.GraphProto = model.graph

        self.sdfg = SDFG(name)
        self.cuda = cuda
        self.state = self.sdfg.add_state()

        # Add all values to the SDFG, check for unsupported ops
        ##########################################

        self.value_infos = {}

        self.inputs = []
        self.outputs = []

        for value, is_input in chain(zip(graph.input, repeat(True)),
                                     zip(graph.output, repeat(False))):
            if not value.HasField("name"):
                raise ValueError("Got input or output without name")
            if is_input:
                self.inputs.append(value.name)
            else:
                self.outputs.append(value.name)

            self.value_infos[value.name] = value
            self._add_value_info(value)

        for value in graph.value_info:
            if not value.HasField("name"):
                raise ValueError("Got input or output without name")
            if value.name not in self.value_infos:
                self.value_infos[value.name] = value

        # add weights
        self.weights = {}
        for init in graph.initializer:
            self._add_constant_tensor(init)

        access_nodes = {}
        self._idx_to_node = []
        for i, node in enumerate(graph.node):
            if not has_onnx_node(node.op_type):
                raise ValueError("Unsupported ONNX operator: '{}'".format(
                    node.op_type))

            # extract the op attributes
            op_attributes = {
                attribute_proto.name: convert_attribute_proto(attribute_proto)
                for attribute_proto in node.attribute
            }

            if node.HasField("name"):
                node_name = clean_onnx_name(node.name)
            else:
                node_name = node.op_type + "_" + str(i)

            # construct the dace node
            op_node = get_onnx_node(node.op_type)(node_name, **op_attributes)
            self.state.add_node(op_node)
            self._idx_to_node.append(op_node)

            for param_idx, (name, is_input) in chain(
                    enumerate(zip(node.input, repeat(True))),
                    enumerate(zip(node.output, repeat(False)))):
                if clean_onnx_name(name) not in self.sdfg.arrays:
                    if name not in self.value_infos:
                        raise ValueError(
                            "Could not find array with name '{}'".format(
                                name))
                    self._add_value_info(self.value_infos[name])

                # get the access node
                if name in access_nodes:
                    access = access_nodes[name]
                    self._update_access_type(access, is_input)
                else:
                    access = nd.AccessNode(
                        clean_onnx_name(name), AccessType.ReadOnly
                        if is_input else AccessType.WriteOnly)
                    self.state.add_node(access)
                    access_nodes[name] = access

                # get the connector name
                params = op_node.schema.inputs if is_input else op_node.schema.outputs
                params_len = len(params)
                if param_idx >= params_len:
                    # This is a variadic parameter; the last parameter of the
                    # schema must then be variadic.
                    if params[-1].param_type != ONNXParameterType.Variadic:
                        raise ValueError(
                            "Expected the last {i_or_o} parameter to be variadic,"
                            " since the {i_or_o} with idx {param_idx} has more parameters than the schema ({params_len})"
                            .format(i_or_o="input" if is_input else "output",
                                    param_idx=param_idx,
                                    params_len=params_len))
                    conn_name = params[-1].name + "__" + str(param_idx -
                                                             params_len + 1)
                elif params[param_idx].param_type == ONNXParameterType.Variadic:
                    # this is a variadic parameter, and it is within the range
                    # of params, so it must be the first instance of the
                    # variadic parameter
                    conn_name = params[param_idx].name + "__0"
                else:
                    conn_name = params[param_idx].name

                data_desc = self.sdfg.arrays[clean_onnx_name(name)]

                # add the connector if required, and add an edge
                if is_input:
                    if conn_name not in op_node.in_connectors:
                        op_node.add_in_connector(conn_name)
                    self.state.add_edge(
                        access, None, op_node, conn_name,
                        dace.Memlet.from_array(clean_onnx_name(name),
                                               data_desc))
                else:
                    if conn_name not in op_node.out_connectors:
                        op_node.add_out_connector(conn_name)
                    self.state.add_edge(
                        op_node, conn_name, access, None,
                        dace.Memlet.from_array(clean_onnx_name(name),
                                               data_desc))

        if self.cuda:
            self.sdfg.apply_strict_transformations()
            self.sdfg.apply_gpu_transformations()
            self.sdfg.apply_strict_transformations()

            # set all gpu transients to be persistent
            for _, _, arr in self.sdfg.arrays_recursive():
                if arr.transient and arr.storage == StorageType.GPU_Global:
                    arr.lifetime = AllocationLifetime.Persistent

    @staticmethod
    def _update_access_type(node: dace.nodes.AccessNode, is_input: bool):
        if node.access == AccessType.ReadOnly and not is_input:
            node.access = AccessType.ReadWrite
        elif node.access == AccessType.WriteOnly and is_input:
            node.access = AccessType.ReadWrite

    def _add_constant_tensor(self, tensor: onnx.TensorProto):
        if not tensor.HasField("name"):
            raise ValueError("Got tensor without name")

        if not tensor.HasField("data_type"):
            raise ValueError("Initializer tensor '{}' has no type".format(
                tensor.name))

        name = clean_onnx_name(tensor.name)

        dtype = onnx_tensor_type_to_typeclass(tensor.data_type)

        if len(tensor.dims) == 0:
            # this is a scalar
            self.sdfg.add_scalar(name, dtype)
        else:
            dims = [d for d in tensor.dims]
            if name not in self.sdfg.arrays:
                self.sdfg.add_array(name, dims, dtype)
            else:
                existing_arr = self.sdfg.arrays[name]
                if existing_arr.dtype != dtype:
                    raise ValueError(
                        "Invalid ONNX model; found two values with name '{}', but different dtypes ({} and {})"
                        .format(name, existing_arr.dtype, dtype))
                if tuple(existing_arr.shape) != tuple(dims):
                    raise ValueError(
                        "Invalid ONNX model; found two values with name '{}', but different dimensions ({} and {})"
                        .format(name, existing_arr.shape, dims))

        self.weights[tensor.name] = numpy_helper.to_array(tensor)

    def _add_value_info(self, value_info: onnx.ValueInfoProto):
        if not value_info.HasField("name"):
            raise ValueError("Got value without name")

        name = value_info.name

        if not _nested_HasField(value_info, "type.tensor_type.shape"):
            raise ValueError(
                "Value '{}' does not have a shape in this graph."
                " Please run shape inference before importing.".format(name))

        tensor_type = value_info.type.tensor_type

        if not tensor_type.HasField("elem_type"):
            raise ValueError(
                "Value '{}' does not have a type in this graph."
" Please run type inference before importing.".format(name)) shape = [] for d in tensor_type.shape.dim: if d.HasField("dim_value"): shape.append(d.dim_value) elif d.HasField("dim_param"): parsed = pystr_to_symbolic(d.dim_param) for sym in parsed.free_symbols: if clean_onnx_name(str(sym)) not in self.sdfg.symbols: self.sdfg.add_symbol(clean_onnx_name(str(sym)), stype=int) parsed = parsed.subs( sym, dace.symbol(clean_onnx_name(str(sym)))) shape.append(parsed) else: raise ValueError( "Value '{}' does not have a shape in this graph." " Please run shape inference before importing.".format( name)) transient = name not in self.inputs and name not in self.outputs if len(shape) == 0: self.sdfg.add_scalar(clean_onnx_name(name), dtype=onnx_tensor_type_to_typeclass( tensor_type.elem_type), transient=transient) else: self.sdfg.add_array(clean_onnx_name(name), shape=shape, dtype=onnx_tensor_type_to_typeclass( tensor_type.elem_type), transient=transient) def __call__(self, *args, **inputs): sdfg = deepcopy(self.sdfg) # convert the positional args to kwargs if len(args) > len(self.inputs): raise ValueError("Expected {} arguments, got {}".format( len(self.inputs), len(args))) inputs.update(dict(zip(self.inputs, args))) # check that there are no missing inputs if len(set(self.inputs).difference(inputs)) != 0: raise ValueError("Missing inputs {}".format(", ".join( set(self.inputs).difference(inputs)))) # check that there are no unknown inputs # NOTE symbols can only be passed as kwargs if len( set(inputs).difference(self.inputs).difference( sdfg.free_symbols)) != 0: raise ValueError("Unknown inputs {}".format(", ".join( set(inputs).difference(self.inputs)))) clean_inputs = {} for input, arr in inputs.items(): if input in sdfg.free_symbols: clean_inputs[input] = arr else: clean_inputs[clean_onnx_name(input)] = arr # add the weights params = {} for name, arr in self.weights.items(): if len(arr.shape) == 0: params[clean_onnx_name(name)] = arr[()] else: if self.cuda: clean_name = clean_onnx_name(name) sdfg.arrays[clean_name].storage = StorageType.GPU_Global params[clean_name] = numba.cuda.to_device(arr) else: params[clean_onnx_name(name)] = arr.copy() inferred_symbols = infer_symbols_from_shapes(sdfg, { **clean_inputs, **params }) # TODO @orausch if this is removed the SDFG complains # TypeError: Type mismatch for argument ONNX_unk__493: expected scalar type, got <class 'sympy.core.numbers.Integer'> # fix this better inferred_symbols = {k: int(v) for k, v in inferred_symbols.items()} def eval_dim(dim): for sym in dim.free_symbols: dim = dim.subs(sym, inferred_symbols[sym.name]) return dim outputs = OrderedDict() # create numpy arrays for the outputs for output in self.outputs: clean_name = clean_onnx_name(output) arr = sdfg.arrays[clean_name] # TODO @orausch add error handling for evalf shape = [ eval_dim(d) if type(d) is dace.symbol else d for d in arr.shape ] outputs[clean_name] = np.empty(shape, dtype=arr.dtype.as_numpy_dtype()) sdfg.expand_library_nodes() #sdfg.apply_strict_transformations() sdfg(**clean_inputs, **params, **outputs, **inferred_symbols) if len(outputs) == 1: return next(iter(outputs.values())) return tuple(outputs.values())
import dace
from dace.sdfg import SDFG, SDFGState, graph
from dace.symbolic import symstr


def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    # If identity is defined, add an initialization state
    if node.identity is not None:
        init_state = nsdfg.add_state()
        nstate = nsdfg.add_state()
        nsdfg.add_edge(init_state, nstate, dace.InterstateEdge())

        # Add initialization as a map
        init_state.add_mapped_tasklet(
            'reduce_init', {
                '_o%d' % i: '0:%s' % symstr(d)
                for i, d in enumerate(outedge.data.subset.size())
            }, {},
            'out = %s' % node.identity, {
                'out':
                dace.Memlet.simple(
                    '_out',
                    ','.join(['_o%d' % i for i in range(output_dims)]))
            },
            external_edges=True)
    else:
        nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        output_size = outedge.data.subset.size()

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = dace.Memlet.simple(
            '_out',
            ','.join(['_o%d' % i for i in range(output_dims)]),
            wcr_str=node.wcr)
        inmm = dace.Memlet.simple('_in', ','.join(input_subset))
    else:
        ome, omx = None, None
        outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = dace.Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    w = nstate.add_write('_out')  # was add_read; '_out' is written here
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return nsdfg
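
# For intuition (not from the original source): the nested SDFG built above
# computes the following, sketched in plain numpy for a sum reduction over
# axis 1 of a 2D input (wcr = 'lambda a, b: a + b', identity = 0):
import numpy as np

a = np.random.rand(4, 5)
out = np.zeros(4)          # 'reduce_init' map writes the identity
for o in range(4):         # outer 'reduce_output' map
    for i in range(5):     # inner 'reduce_values' map
        out[o] += a[o, i]  # identity tasklet + wcr on the output memlet
assert np.allclose(out, a.sum(axis=1))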
import dace
from dace import dtypes
from dace.frontend.operations import detect_reduction_type
from dace.sdfg import SDFG, SDFGState, graph
from dace.symbolic import symstr

# Note: this expansion references ExpandReduceFPGAPartialReduction, the
# enclosing class whose _REDUCTION_TYPE_EXPR table maps reduction types to
# expressions; it is assumed to be in scope here.


def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG, partial_width=16):
    '''
    :param node: the node to expand.
    :param state: the state that contains the node.
    :param sdfg: the SDFG that contains the node.
    :param partial_width: Width of the inner reduction buffer. Must be
        larger than the latency of the reduction operation on the given
        data type.
    '''
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    if input_data.dtype.veclen > 1:
        raise NotImplementedError(
            'Vectorization currently not implemented for FPGA expansion of '
            'Reduce.')

    nstate = nsdfg.add_state()

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        all_axis = False
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append(f'_i{ictr}')
                ictr += 1
            else:
                input_subset.append(f'_o{octr}')
                octr += 1

        output_size = outedge.data.subset.size()

        ome, omx = nstate.add_map(
            'reduce_output', {
                f'_o{i}': f'0:{symstr(sz)}'
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
        outm = dace.Memlet(f'_out[{outm_idx}]')
        inm_idx = ','.join(input_subset)
        inmm = dace.Memlet(f'_in[{inm_idx}]')
    else:
        all_axis = True
        ome, omx = None, None
        outm = dace.Memlet('_out[0]')
        inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
        inmm = dace.Memlet(f'_in[{inm_idx}]')

    # Add inner map, which corresponds to the range to reduce
    r = nstate.add_read('_in')
    w = nstate.add_write('_out')  # was add_read; '_out' is written here

    # TODO support vectorization
    buffer_name = 'partial_results'
    nsdfg.add_array(buffer_name, (partial_width, ),
                    input_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    buffer = nstate.add_access(buffer_name)
    buffer_write = nstate.add_write(buffer_name)

    # Explicitly initialize the partial results, as the inner map could run
    # for fewer iterations than partial_width
    init_me, init_mx = nstate.add_map(
        'partial_results_init', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)
    init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                      f'pr_out = {node.identity}')
    nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
    nstate.add_memlet_path(init_tasklet,
                           init_mx,
                           buffer,
                           src_conn='pr_out',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))
    if not all_axis:
        nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

    ime, imx = nstate.add_map(
        'reduce_values', {
            f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
            for i, axis in enumerate(sorted(axes))
        })

    # Accumulate over partial results
    redtype = detect_reduction_type(node.wcr)
    if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    else:
        reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[
            redtype]

    # Generate a flattened index over the inner map: it will be used for
    # indexing into the partial results
    ranges_size = ime.range.size()
    inner_index = '+'.join(
        [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
    inner_op = ' + ' if len(axes) > 1 else ''
    inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'

    partial_reduce_tasklet = nstate.add_tasklet(
        'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

    if not all_axis:
        # Connect input and partial sums
        nstate.add_memlet_path(r,
                               ome,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    else:
        nstate.add_memlet_path(r,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    nstate.add_memlet_path(
        buffer,
        ime,
        partial_reduce_tasklet,
        dst_conn='buffer_in',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))
    nstate.add_memlet_path(
        partial_reduce_tasklet,
        imx,
        buffer_write,
        src_conn='buffer_out',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))

    # Then perform reduction on partial results
    reduce_entry, reduce_exit = nstate.add_map(
        'reduce', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)

    reduce_tasklet = nstate.add_tasklet(
        'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
    nstate.add_memlet_path(buffer_write,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='data_in',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))

    reduce_name = 'reduce_result'
    nsdfg.add_array(reduce_name, (1, ),
                    output_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    reduce_read = nstate.add_access(reduce_name)
    reduce_access = nstate.add_access(reduce_name)

    if not all_axis:
        nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

    nstate.add_memlet_path(reduce_read,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='reduce_in',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))
    nstate.add_memlet_path(reduce_tasklet,
                           reduce_exit,
                           reduce_access,
                           src_conn='reduce_out',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))

    if not all_axis:
        # Write out the result
        nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
    else:
        nstate.add_memlet_path(reduce_access, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    nsdfg.validate()
    return nsdfg
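
# For intuition (not from the original source): a plain-Python sketch of the
# two-stage scheme above, assuming a sum reduction and partial_width = 4.
import numpy as np

data = np.random.rand(10).astype(np.float32)
partial_width = 4
partial = np.zeros(partial_width, np.float32)  # 'partial_results' buffer

# Stage 1: accumulate into interleaved buckets, which breaks the loop-carried
# dependency on a single accumulator (cf. the 'partial_reduce' tasklet)
for i in range(len(data)):
    partial[i % partial_width] += data[i]

# Stage 2: fold the buckets into the final result (cf. the unrolled 'reduce')
result = np.float32(0)
for i in range(partial_width):
    result += partial[i]
assert np.isclose(result, data.sum())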
from copy import deepcopy as dcpy

import dace
from dace import dtypes
from dace.sdfg import SDFG, SDFGState, graph
from dace.symbolic import symstr


def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    insubset = dcpy(inedge.data.subset)
    isqdim = insubset.squeeze()
    outsubset = dcpy(outedge.data.subset)
    osqdim = outsubset.squeeze()
    input_dims = len(insubset)
    output_dims = len(outsubset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    if len(osqdim) == 0:  # Fix for scalars
        osqdim = [0]

    # Standardize and squeeze axes
    axes = node.axes if node.axes else [
        i for i in range(len(inedge.data.subset))
    ]
    axes = [axis for axis in axes if axis in isqdim]

    assert node.identity is not None

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    insubset.size(),
                    input_data.dtype,
                    strides=[
                        s for i, s in enumerate(input_data.strides)
                        if i in isqdim
                    ],
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outsubset.size(),
                    output_data.dtype,
                    strides=[
                        s for i, s in enumerate(output_data.strides)
                        if i in osqdim
                    ],
                    storage=output_data.storage)
    nsdfg.add_transient('acc', [1], nsdfg.arrays['_in'].dtype,
                        dtypes.StorageType.Register)

    nstate = nsdfg.add_state()

    # Interleave input and output axes to match input memlet
    ictr, octr = 0, 0
    input_subset = []
    for i in isqdim:
        if i in axes:
            input_subset.append('_i%d' % ictr)
            ictr += 1
        else:
            input_subset.append('_o%d' % octr)
            octr += 1

    ome, omx = nstate.add_map(
        'reduce_output', {
            '_o%d' % i: '0:%s' % symstr(sz)
            for i, sz in enumerate(outsubset.size())
        })
    outm = dace.Memlet.simple(
        '_out', ','.join(['_o%d' % i for i in range(output_dims)]))
    # wcr_str=node.wcr)
    inmm = dace.Memlet.simple('_in', ','.join(input_subset))

    idt = nstate.add_tasklet('reset', {}, {'o'}, f'o = {node.identity}')
    nstate.add_edge(ome, None, idt, None, dace.Memlet())

    accread = nstate.add_access('acc')
    accwrite = nstate.add_access('acc')
    nstate.add_edge(idt, 'o', accread, None, dace.Memlet('acc'))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(insubset.size()[isqdim.index(axis)])
            for i, axis in enumerate(sorted(axes))
        },
        schedule=dtypes.ScheduleType.Sequential)

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'a', 'b'}, {'o'}, 'o = b')

    # Connect everything
    r = nstate.add_read('_in')
    w = nstate.add_write('_out')
    nstate.add_memlet_path(r, ome, ime, t, dst_conn='b', memlet=inmm)
    nstate.add_memlet_path(accread,
                           ime,
                           t,
                           dst_conn='a',
                           memlet=dace.Memlet('acc[0]'))
    nstate.add_memlet_path(t,
                           imx,
                           accwrite,
                           src_conn='o',
                           memlet=dace.Memlet('acc[0]', wcr=node.wcr))
    nstate.add_memlet_path(accwrite, omx, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    from dace.transformation import dataflow
    nsdfg.apply_transformations_repeated(dataflow.MapCollapse)

    return nsdfg
# Note: this is a method of a transformation class; Memlet, SDFG, graph,
# symstr and the dace.sdfg `utils` module are assumed to be imported by the
# surrounding file.
def _expand_reduce(self, sdfg, state, node):
    # expands a reduce into two nested maps
    # taken from legacy expand_reduce.py
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    if node.identity is not None:
        raise ValueError("Node identity has to be None at this point.")
    else:
        nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        output_size = outedge.data.subset.size()

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = Memlet.simple('_out',
                             ','.join(
                                 ['_o%d' % i for i in range(output_dims)]),
                             wcr_str=node.wcr)
        inmm = Memlet.simple('_in', ','.join(input_subset))
    else:
        ome, omx = None, None
        outm = Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    w = nstate.add_write('_out')  # was add_read; '_out' is written here
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    nsdfg = state.add_nested_sdfg(nsdfg,
                                  sdfg,
                                  node.in_connectors,
                                  node.out_connectors,
                                  schedule=node.schedule,
                                  name=node.name)

    utils.change_edge_dest(state, node, nsdfg)
    utils.change_edge_src(state, node, nsdfg)
    state.remove_node(node)

    return nsdfg
import networkx as nx

import dace
from dace.memlet import Memlet
from dace.sdfg import SDFG

# Input, Output, Kernel, _generate_init and _generate_stencil are provided by
# the surrounding package and are assumed to be in scope here.


def generate_reference(name, chain):
    """Generates a simple, unoptimized SDFG to run on the CPU, for
    verification purposes."""

    sdfg = SDFG(name)

    for k, v in chain.constants.items():
        sdfg.add_constant(k, v["value"], dace.data.Scalar(v["data_type"]))

    (dimensions_to_skip, shape, vector_length, parameters, iterators,
     memcopy_indices, memcopy_accesses) = _generate_init(chain)

    prev_state = sdfg.add_state("init")

    # Throw vectorization in the bin for the reference code
    vector_length = 1

    shape = tuple(map(int, shape))

    input_shapes = {}  # Maps inputs to their shape tuple

    for node in chain.graph.nodes():
        if isinstance(node, Input) or isinstance(node, Output):
            if isinstance(node, Input):
                for output in node.outputs.values():
                    pars = tuple(
                        output["input_dims"]
                    ) if "input_dims" in output and output[
                        "input_dims"] is not None else tuple(parameters)
                    arr_shape = tuple(s for s, p in zip(shape, parameters)
                                      if p in pars)
                    input_shapes[node.name] = arr_shape
                    break
                else:
                    raise ValueError("No outputs found for input node.")
            else:
                arr_shape = shape
            if len(arr_shape) > 0:
                try:
                    sdfg.add_array(node.name, arr_shape, node.data_type)
                except NameError:
                    sdfg.data(
                        node.name).access = dace.dtypes.AccessType.ReadWrite
            else:
                sdfg.add_symbol(node.name, node.data_type)

    for link in chain.graph.edges(data=True):
        name = link[0].name
        if name not in sdfg.arrays and name not in sdfg.symbols:
            sdfg.add_array(name, shape, link[0].data_type, transient=True)
            input_shapes[name] = tuple(shape)

    input_iterators = {
        k: tuple("0:{}".format(s) for s in v)
        for k, v in input_shapes.items()
    }

    # Enforce dependencies via topological sort
    for node in nx.topological_sort(chain.graph):

        if not isinstance(node, Kernel):
            continue

        state = sdfg.add_state(node.name)
        sdfg.add_edge(prev_state, state, dace.InterstateEdge())

        (stencil_node, input_to_connector,
         output_to_connector) = _generate_stencil(node, chain, shape,
                                                  dimensions_to_skip)
        stencil_node.implementation = "CPU"

        for field, connector in input_to_connector.items():

            if len(input_iterators[field]) == 0:
                continue  # Scalar variable

            # Outer memory read
            read_node = state.add_read(field)
            state.add_memlet_path(read_node,
                                  stencil_node,
                                  dst_conn=connector,
                                  memlet=Memlet.simple(
                                      field,
                                      ", ".join(input_iterators[field])))

        for _, connector in output_to_connector.items():

            # Outer write
            write_node = state.add_write(node.name)
            state.add_memlet_path(stencil_node,
                                  write_node,
                                  src_conn=connector,
                                  memlet=Memlet.simple(
                                      node.name, ", ".join(
                                          "0:{}".format(s) for s in shape)))

        prev_state = state

    return sdfg
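
# A minimal usage sketch (not from the original source), assuming `chain` is
# the stencil chain object consumed above (e.g. parsed by the surrounding
# package from a JSON description):
reference_sdfg = generate_reference("my_stencil_reference", chain)
reference_sdfg.validate()
reference_sdfg.save("reference.sdfg")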