def _create_einsum_internal(sdfg: SDFG,
                            state: SDFGState,
                            einsum_string: str,
                            *arrays: str,
                            dtype: Optional[dtypes.typeclass] = None,
                            optimize: bool = False,
                            output: Optional[str] = None,
                            nodes: Optional[Dict[str, AccessNode]] = None):
    """ Lowers an einsum expression into SDFG constructs inside ``state``.

        Depending on the expression, the einsum is realized either as a
        "pure" mapped tasklet with write-conflict resolution, or as a
        (batched) GEMM via a nested SDFG. If ``optimize`` is set, the
        contraction is first split into pairwise contractions using
        ``opt_einsum`` and this function recurses on each pair.

        :param sdfg: The SDFG to add arrays/states to.
        :param state: The state in which the einsum is created.
        :param einsum_string: The einsum expression (e.g., ``'ij,jk->ik'``).
        :param arrays: Names of the input arrays, one per einsum operand.
        :param dtype: Output data type; inferred from the first input if None.
        :param optimize: If True, use ``opt_einsum`` to find a contraction
                         path (requires all dimensions to be non-symbolic).
        :param output: Existing output array name, or None to create a
                       temporary transient.
        :param nodes: Optional mapping from array name to existing access
                      node to reuse instead of adding new read nodes.
        :return: A 2-tuple of (output array name, output access node).
        :raises ValueError: On operand-count or dimension mismatches.
        :raises ImportError: If ``optimize`` is set but ``opt_einsum`` is
                             not installed.
    """
    # Infer shapes and strides of input/output arrays
    einsum = EinsumParser(einsum_string)

    if len(einsum.inputs) != len(arrays):
        raise ValueError('Invalid number of arrays for einsum expression')

    # Get shapes from arrays and verify dimensionality.
    # chardict maps each subscript character to its (consistent) dimension.
    chardict = {}
    for inp, inpname in zip(einsum.inputs, arrays):
        inparr = sdfg.arrays[inpname]
        if len(inp) != len(inparr.shape):
            raise ValueError('Dimensionality mismatch in input "%s"' % inpname)
        for char, shp in zip(inp, inparr.shape):
            if char in chardict and shp != chardict[char]:
                raise ValueError('Dimension mismatch in einsum expression')
            chardict[char] = shp

    if optimize:
        # Try to import opt_einsum
        try:
            import opt_einsum as oe
        except (ModuleNotFoundError, NameError, ImportError):
            raise ImportError('To optimize einsum expressions, please install '
                              'the "opt_einsum" package.')

        # opt_einsum builds concrete dummy views, so every dimension must be
        # a concrete (non-symbolic) size.
        for char, shp in chardict.items():
            if symbolic.issymbolic(shp):
                raise ValueError('Einsum optimization cannot be performed '
                                 'on symbolically-sized array dimension "%s" '
                                 'for subscript character "%s"' % (shp, char))

        # Create optimal contraction path
        # noinspection PyTypeChecker
        _, path_info = oe.contract_path(
            einsum_string, *oe.helpers.build_views(einsum_string, chardict))

        input_nodes = nodes or {arr: state.add_read(arr) for arr in arrays}
        result_node = None

        # Follow path and create a chain of operation SDFG states.
        # Each step contracts a pair of operands; the intermediate result
        # replaces the pair in the operand list for the next step.
        for pair, nonfree, expr, after, blas in path_info.contraction_list:
            result, result_node = _create_einsum_internal(sdfg,
                                                          state,
                                                          expr,
                                                          arrays[pair[0]],
                                                          arrays[pair[1]],
                                                          dtype=dtype,
                                                          optimize=False,
                                                          output=None,
                                                          nodes=input_nodes)
            arrays = ([a for i, a in enumerate(arrays) if i not in pair] +
                      [result])
            input_nodes[result] = result_node

        return arrays[0], result_node
        # END of einsum optimization

    input_nodes = nodes or {arr: state.add_read(arr) for arr in arrays}

    # Get output shape from chardict, or [1] for a scalar output
    output_shape = list(map(lambda k: chardict[k], einsum.output)) or [1]
    output_index = ','.join(o for o in einsum.output) or '0'

    if output is None:
        dtype = dtype or sdfg.arrays[arrays[0]].dtype
        output, odesc = sdfg.add_temp_transient(output_shape, dtype)
        # Freshly-created transients must be zero-initialized before the
        # accumulating einsum map runs.
        to_init = True
    else:
        odesc = sdfg.arrays[output]
        dtype = dtype or odesc.dtype
        to_init = False
    # NOTE(review): `odesc` is assigned in both branches but never used
    # below — confirm whether it can be dropped.

    if not einsum.is_bmm():
        # Fall back to "pure" SDFG einsum with conflict resolution
        c = state.add_write(output)

        # Add state before this one to initialize the output value
        if to_init:
            init_state = sdfg.add_state_before(state)
            if len(einsum.output) > 0:
                init_state.add_mapped_tasklet(
                    'einsum_reset',
                    {k: '0:%s' % chardict[k]
                     for k in einsum.output}, {},
                    'out_%s = 0' % output,
                    {'out_%s' % output: Memlet.simple(output, output_index)},
                    external_edges=True)
            else:  # Scalar output
                t = init_state.add_tasklet('einsum_reset', set(),
                                           {'out_%s' % output},
                                           'out_%s = 0' % output)
                onode = init_state.add_write(output)
                init_state.add_edge(t, 'out_%s' % output, onode, None,
                                    Memlet.simple(output, '0'))

        # Pure einsum map: iterate over all subscript characters and
        # accumulate products into the output with a sum WCR.
        state.add_mapped_tasklet(
            'einsum', {k: '0:%s' % v
                       for k, v in chardict.items()}, {
                           'inp_%s' % arr: Memlet.simple(arr, ','.join(inp))
                           for inp, arr in zip(einsum.inputs, arrays)
                       },
            'out_%s = %s' % (output, ' * '.join('inp_%s' % arr
                                                for arr in arrays)),
            {
                'out_%s' % output:
                Memlet.simple(output, output_index, wcr_str='lambda a,b: a+b')
            },
            input_nodes=input_nodes,
            output_nodes={output: c},
            external_edges=True)
    else:
        # Represent einsum as a GEMM or batched GEMM (using library nodes)
        a_shape = sdfg.arrays[arrays[0]].shape
        b_shape = sdfg.arrays[arrays[1]].shape
        c_shape = output_shape

        a = input_nodes[arrays[0]]
        b = input_nodes[arrays[1]]
        c = state.add_write(output)

        # Compute GEMM dimensions and strides. BATCH/M/K/N are the (flattened)
        # batch, row, contraction, and column extents; the s* entries are the
        # corresponding flattened strides derived from trailing dimensions.
        strides = dict(
            BATCH=prod([c_shape[dim] for dim in einsum.c_batch]),
            M=prod([a_shape[dim] for dim in einsum.a_only]),
            K=prod([a_shape[dim] for dim in einsum.a_sum]),
            N=prod([b_shape[dim] for dim in einsum.b_only]),
            sAM=prod(a_shape[einsum.a_only[-1] + 1:]) if einsum.a_only else 1,
            sAK=prod(a_shape[einsum.a_sum[-1] + 1:]) if einsum.a_sum else 1,
            sAB=prod(a_shape[einsum.a_batch[-1] +
                             1:]) if einsum.a_batch else 1,
            sBK=prod(b_shape[einsum.b_sum[-1] + 1:]) if einsum.b_sum else 1,
            sBN=prod(b_shape[einsum.b_only[-1] + 1:]) if einsum.b_only else 1,
            sBB=prod(b_shape[einsum.b_batch[-1] +
                             1:]) if einsum.b_batch else 1,
            sCM=prod(c_shape[einsum.c_a_only[-1] +
                             1:]) if einsum.c_a_only else 1,
            sCN=prod(c_shape[einsum.c_b_only[-1] +
                             1:]) if einsum.c_b_only else 1,
            sCB=prod(c_shape[einsum.c_batch[-1] +
                             1:]) if einsum.c_batch else 1)

        # Complement strides to make matrices as necessary
        # (vector operands are promoted so the GEMM kernel sees 2D/3D shapes)
        if len(a_shape) == 1 and len(einsum.a_sum) == 1:
            strides['sAK'] = 1
            strides['sAB'] = strides['sAM'] = strides['K']

        if len(b_shape) == 1 and len(einsum.b_sum) == 1:
            strides['sBN'] = 1
            strides['sBK'] = 1
            strides['sBB'] = strides['K']

        if len(c_shape) == 1 and len(einsum.a_sum) == len(einsum.b_sum):
            strides['sCN'] = 1
            strides['sCB'] = strides['sCM'] = strides['N']

        # Create nested SDFG for GEMM
        nsdfg = create_batch_gemm_sdfg(dtype, strides)

        nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'X', 'Y'}, {'Z'},
                                           strides)
        state.add_edge(a, None, nsdfg_node, 'X',
                       Memlet.from_array(a.data, a.desc(sdfg)))
        state.add_edge(b, None, nsdfg_node, 'Y',
                       Memlet.from_array(b.data, b.desc(sdfg)))
        state.add_edge(nsdfg_node, 'Z', c, None,
                       Memlet.from_array(c.data, c.desc(sdfg)))

    return output, c
def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator):
    """ Queries all code generation targets in this SDFG and all nested SDFGs,
        as well as instrumentation providers, and stores them in the frame
        code generator.

        :param sdfg: The (top-level) SDFG to traverse recursively.
        :param frame: The frame code generator whose ``targets`` set and
                      dispatcher instrumentation table are populated
                      (mutated in place).
    """
    disp = frame._dispatcher
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    # "No instrumentation" maps to no provider at all
    disp.instrumentation[dtypes.InstrumentationType.No_Instrumentation] = None
    disp.instrumentation[
        dtypes.DataInstrumentationType.No_Instrumentation] = None
    for node, parent in sdfg.all_nodes_recursive():
        # Query nodes and scopes
        if isinstance(node, SDFGState):
            frame.targets.add(disp.get_state_dispatcher(parent, node))
        elif isinstance(node, dace.nodes.EntryNode):
            frame.targets.add(disp.get_scope_dispatcher(node.schedule))
        elif isinstance(node, dace.nodes.Node):
            state: SDFGState = parent
            nsdfg = state.parent
            frame.targets.add(disp.get_node_dispatcher(nsdfg, state, node))

        # Array allocation
        if isinstance(node, dace.nodes.AccessNode):
            state: SDFGState = parent
            nsdfg = state.parent
            desc = node.desc(nsdfg)
            frame.targets.add(disp.get_array_dispatcher(desc.storage))

        # Copies and memlets - via access nodes and tasklets
        # To avoid duplicate checks, only look at outgoing edges of access
        # nodes and tasklets
        if isinstance(node, (dace.nodes.AccessNode, dace.nodes.Tasklet)):
            state: SDFGState = parent
            for e in state.out_edges(node):
                if e.data.is_empty():
                    continue
                mtree = state.memlet_tree(e)
                if mtree.downwards:
                    # Rooted at src_node: each leaf edge is a separate copy
                    for leaf_e in mtree.leaves():
                        dst_node = leaf_e.dst
                        if leaf_e.data.is_empty():
                            continue
                        tgt = disp.get_copy_dispatcher(node, dst_node, leaf_e,
                                                       state.parent, state)
                        if tgt is not None:
                            frame.targets.add(tgt)
                else:
                    # Rooted at dst_node: one copy to the tree root's sink
                    dst_node = mtree.root().edge.dst
                    tgt = disp.get_copy_dispatcher(node, dst_node, e,
                                                  state.parent, state)
                    if tgt is not None:
                        frame.targets.add(tgt)

        # Instrumentation-related query: states/nodes expose `instrument`
        # directly; consume/map scopes expose it on their `consume`/`map`
        # objects.
        if hasattr(node, 'instrument'):
            disp.instrumentation[node.instrument] = provider_mapping[
                node.instrument]
        elif hasattr(node, 'consume'):
            disp.instrumentation[node.consume.instrument] = provider_mapping[
                node.consume.instrument]
        elif hasattr(node, 'map'):
            disp.instrumentation[node.map.instrument] = provider_mapping[
                node.map.instrument]

    # Query instrumentation provider of SDFG
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        disp.instrumentation[sdfg.instrument] = provider_mapping[
            sdfg.instrument]
def apply(self, sdfg: sd.SDFG):
    """ Peels ``self.count`` iterations off a detected for-loop.

        If ``self.begin`` is set, the first iterations are unrolled into
        states placed *before* the loop guard and the loop's initialization
        assignment is advanced accordingly. Otherwise, the last iterations
        are unrolled into states placed *after* the loop exit and the loop
        condition is tightened via ``self._modify_cond``.

        :param sdfg: The SDFG containing the matched loop pattern
                     (mutated in place).
    """
    ####################################################################
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    condition_edge = sdfg.edges_between(guard, begin)[0]
    not_condition_edge = sdfg.edges_between(guard, after_state)[0]
    itervar, rng, loop_struct = find_for_loop(sdfg, guard, begin)

    # Get loop states (all states reachable from `begin` without passing
    # through the guard)
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_state = loop_struct[1]
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    ####################################################################
    # Transform

    if self.begin:
        # If begin, change initialization assignment and prepend states before
        # guard
        init_edges = []
        before_states = loop_struct[0]
        for before_state in before_states:
            init_edge = sdfg.edges_between(before_state, guard)[0]
            # Advance the initial value past the peeled iterations
            init_edge.data.assignments[itervar] = str(rng[0] +
                                                      self.count * rng[2])
            init_edges.append(init_edge)
        append_states = before_states

        # Add `count` states, each with instantiated iteration variable
        for i in range(self.count):
            # Instantiate loop states with iterate value; sanitize symbols
            # that are illegal in state names
            state_name: str = 'start_' + itervar + str(i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                rng[0] + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            for append_state in append_states:
                sdfg.add_edge(append_state, new_states[first_id],
                              sd.InterstateEdge())
            append_states = [new_states[last_id]]

        # Reconnect edge to guard state from last peeled iteration
        for append_state in append_states:
            if append_state not in before_states:
                for init_edge in init_edges:
                    sdfg.remove_edge(init_edge)
                sdfg.add_edge(append_state, guard, init_edges[0].data)
    else:
        # If not begin, change the loop condition and append the peeled
        # iterations after the loop exit state
        itervar_sym = pystr_to_symbolic(itervar)
        condition_edge.data.condition = CodeBlock(
            self._modify_cond(condition_edge.data.condition, itervar, rng[2]))
        not_condition_edge.data.condition = CodeBlock(
            self._modify_cond(not_condition_edge.data.condition, itervar,
                              rng[2]))
        prepend_state = after_state

        # Add `count` states, each with instantiated iteration variable
        for i in reversed(range(self.count)):
            # Instantiate loop states with iterate value; sanitize symbols
            # that are illegal in state names
            state_name: str = 'end_' + itervar + str(-i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                itervar_sym + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            sdfg.add_edge(new_states[last_id], prepend_state,
                          sd.InterstateEdge())
            prepend_state = new_states[first_id]

        # Reconnect edge to guard state from last peeled iteration
        if prepend_state != after_state:
            sdfg.remove_edge(not_condition_edge)
            sdfg.add_edge(guard, prepend_state, not_condition_edge.data)
def apply_to(cls,
             sdfg: SDFG,
             *where: Union[nd.Node, SDFGState, gr.SubgraphView],
             verify: bool = True,
             **options: Any):
    """ Applies this transformation to a given subgraph, defined by a set
        of nodes.

        Raises an error if arguments are invalid or the transformation is
        not applicable. The target subgraph may be given either as a
        ``SubgraphView`` object, or directly as a list of ``Node`` /
        ``SDFGState`` objects. Transformation properties are passed as
        keyword arguments. For example, applying ``SubgraphFusion`` on a
        subgraph of three nodes can be done in one of two ways:

        ```
        # Subgraph
        SubgraphFusion.apply_to(
            sdfg, SubgraphView(state, [node_a, node_b, node_c]))

        # Simplified API: list of nodes
        SubgraphFusion.apply_to(sdfg, node_a, node_b, node_c)
        ```

        :param sdfg: The SDFG to apply the transformation to.
        :param where: A set of nodes in the SDFG/state, or a subgraph
                      thereof.
        :param verify: Check that `can_be_applied` returns True before
                       applying.
        :param options: A set of parameters to use for applying the
                        transformation.
    """
    # A single positional argument may be a node list or a ready-made view
    subgraph = None
    if len(where) == 1:
        sole = where[0]
        if isinstance(sole, (list, tuple)):
            where = sole
        elif isinstance(sole, gr.SubgraphView):
            subgraph = sole

    if len(where) == 0:
        raise ValueError('At least one node is required')

    if subgraph is None:
        # Deduce the parent graph (SDFG or state) from the first node
        probe = where[0]
        if isinstance(probe, SDFGState):
            graph, state_id = sdfg, -1
        elif isinstance(probe, nd.Node):
            graph = next(s for s in sdfg.nodes() if probe in s.nodes())
            state_id = sdfg.node_id(graph)
        else:
            raise TypeError('Invalid node type "%s"' % type(probe).__name__)

        # Construct subgraph and instantiate transformation
        subgraph = gr.SubgraphView(graph, where)
        instance = cls(subgraph, sdfg.sdfg_id, state_id)
    else:
        # Construct instance from subgraph directly
        instance = cls(subgraph)

    # Set transformation properties from keyword arguments
    for prop_name, prop_value in options.items():
        if prop_name not in cls.__properties__:
            raise ValueError('Property "%s" not found in transformation' %
                             prop_name)
        setattr(instance, prop_name, prop_value)

    if verify and not cls.can_be_applied(sdfg, subgraph):
        raise ValueError('Transformation cannot be applied on the '
                         'given subgraph ("can_be_applied" failed)')

    # Apply to SDFG
    return instance.apply(sdfg)
def generate_code(sdfg, validate=True) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use
        :param validate: If True, validates the SDFG before generating the
                         code.
        :return: List of code objects that correspond to files to compile.
    """
    from dace.codegen.targets.target import TargetCodeGenerator  # Avoid import loop

    # Before compiling, validate SDFG correctness
    if validate:
        sdfg.validate()

    # Optional round-trip serialization self-test (save -> load -> save)
    if Config.get_bool('testing', 'serialization'):
        from dace.sdfg import SDFG
        import filecmp
        import shutil
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            sdfg.save(f'{tmp_dir}/test.sdfg')
            sdfg2 = SDFG.from_file(f'{tmp_dir}/test.sdfg')
            sdfg2.save(f'{tmp_dir}/test2.sdfg')
            print('Testing SDFG serialization...')
            if not filecmp.cmp(f'{tmp_dir}/test.sdfg',
                               f'{tmp_dir}/test2.sdfg'):
                # Keep both files around for inspection on failure
                shutil.move(f"{tmp_dir}/test.sdfg", "test.sdfg")
                shutil.move(f"{tmp_dir}/test2.sdfg", "test2.sdfg")
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')

        # Run with the deserialized version
        # NOTE: This means that all subsequent modifications to `sdfg`
        # are not reflected outside of this function (e.g., library
        # node expansion).
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_types.infer_connector_types(sdfg)

    # Set default storage/schedule types in SDFG
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    # Recursively expand library nodes that have not yet been expanded
    sdfg.expand_library_nodes()

    # After expansion, run another pass of connector/type inference
    infer_types.infer_connector_types(sdfg)
    infer_types.set_default_schedule_and_storage_types(sdfg, None)

    frame = framecode.DaCeCodeGenerator(sdfg)

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Query all code generation targets and instrumentation providers in SDFG
    _get_codegen_targets(sdfg, frame)

    # Preprocess SDFG
    for target in frame.targets:
        target.preprocess(sdfg)

    # Instantiate instrumentation providers (the table held classes or None;
    # replace each class with an instance)
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # NOTE: THE SDFG IS ASSUMED TO BE FROZEN (not change) FROM THIS POINT
    # ONWARDS

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments,
                   sdfg=sdfg)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # Ensure that no new targets were dynamically added
    assert frame._dispatcher.used_targets == (frame.targets - {frame})

    # add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg, frame),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       target_type='../../include',
                       linkable=False)
    target_objects.append(dummy)

    # Collect extra code objects contributed by used environments
    for env in dace.library.get_environments_and_dependencies(
            used_environments):
        if hasattr(env, "codeobjects"):
            target_objects.extend(env.codeobjects)

    # add a dummy main function to show how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg, frame),
                       'cpp',
                       cpu.CPUCodeGen,
                       'SampleMain',
                       target_type='../../sample',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG, partial_width=16):
    '''
    Expands a Reduce library node into a nested SDFG that performs an
    FPGA-oriented partial reduction: partial results are accumulated into a
    small circular buffer of size ``partial_width`` (hiding the reduction
    operator's latency), then collapsed in a final unrolled pass.

    :param node: the node to expand
    :param state: the state in which the node is in
    :param sdfg: the SDFG in which the node is in
    :param partial_width: Width of the inner reduction buffer. Must be
                          larger than the latency of the reduction
                          operation on the given data type
    '''
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes: None means "reduce over all input dimensions"
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    if input_data.dtype.veclen > 1:
        raise NotImplementedError(
            'Vectorization currently not implemented for FPGA expansion of Reduce.'
        )

    nstate = nsdfg.add_state()

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        all_axis = False
        # Interleave input and output axes to match input memlet:
        # reduced dimensions get inner-map indices (_iN), the rest get
        # outer-map indices (_oN)
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append(f'_i{ictr}')
                ictr += 1
            else:
                input_subset.append(f'_o{octr}')
                octr += 1

        # NOTE(review): `output_size` is assigned but never used below —
        # confirm it can be dropped.
        output_size = outedge.data.subset.size()

        ome, omx = nstate.add_map(
            'reduce_output', {
                f'_o{i}': f'0:{symstr(sz)}'
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm_idx = ','.join([f'_o{i}' for i in range(output_dims)])
        outm = dace.Memlet(f'_out[{outm_idx}]')
        inm_idx = ','.join(input_subset)
        inmm = dace.Memlet(f'_in[{inm_idx}]')
    else:
        all_axis = True
        ome, omx = None, None
        outm = dace.Memlet('_out[0]')
        inm_idx = ','.join([f'_i{i}' for i in range(len(axes))])
        inmm = dace.Memlet(f'_in[{inm_idx}]')

    # Add inner map, which corresponds to the range to reduce
    r = nstate.add_read('_in')
    # NOTE(review): '_out' is created via add_read here but is used below as
    # the final write target of the result memlet path — confirm whether
    # this should be add_write.
    w = nstate.add_read('_out')

    # TODO support vectorization
    buffer_name = 'partial_results'
    nsdfg.add_array(buffer_name, (partial_width, ),
                    input_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    buffer = nstate.add_access(buffer_name)
    buffer_write = nstate.add_write(buffer_name)

    # Initialize explicitly partial results, as the inner map could run for a
    # number of iteration < partial_width
    init_me, init_mx = nstate.add_map(
        'partial_results_init', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)
    init_tasklet = nstate.add_tasklet('init_pr', {}, {'pr_out'},
                                      f'pr_out = {node.identity}')

    nstate.add_memlet_path(init_me, init_tasklet, memlet=dace.Memlet())
    nstate.add_memlet_path(init_tasklet,
                           init_mx,
                           buffer,
                           src_conn='pr_out',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))
    if not all_axis:
        nstate.add_memlet_path(ome, init_me, memlet=dace.Memlet())

    ime, imx = nstate.add_map(
        'reduce_values', {
            f'_i{i}': f'0:{symstr(inedge.data.subset.size()[axis])}'
            for i, axis in enumerate(sorted(axes))
        })

    # Accumulate over partial results
    redtype = detect_reduction_type(node.wcr)
    if redtype not in ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    else:
        reduction_expr = ExpandReduceFPGAPartialReduction._REDUCTION_TYPE_EXPR[
            redtype]

    # generate flatten index considering inner map: will be used for indexing
    # into partial results
    ranges_size = ime.range.size()
    inner_index = '+'.join(
        [f'_i{i} * {ranges_size[i + 1]}' for i in range(len(axes) - 1)])
    inner_op = ' + ' if len(axes) > 1 else ''
    inner_index = inner_index + f'{inner_op}_i{(len(axes) - 1)}'

    partial_reduce_tasklet = nstate.add_tasklet(
        'partial_reduce', {'data_in', 'buffer_in'}, {'buffer_out'}, f'''\
prev = buffer_in
buffer_out = {reduction_expr}''')

    if not all_axis:
        # Connect input and partial sums
        nstate.add_memlet_path(r,
                               ome,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    else:
        nstate.add_memlet_path(r,
                               ime,
                               partial_reduce_tasklet,
                               dst_conn='data_in',
                               memlet=inmm)
    # Circular-buffer access: flattened inner index modulo partial_width
    nstate.add_memlet_path(
        buffer,
        ime,
        partial_reduce_tasklet,
        dst_conn='buffer_in',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))
    nstate.add_memlet_path(
        partial_reduce_tasklet,
        imx,
        buffer_write,
        src_conn='buffer_out',
        memlet=dace.Memlet(f'{buffer_name}[({inner_index})%{partial_width}]'))

    # Then perform reduction on partial results
    reduce_entry, reduce_exit = nstate.add_map(
        'reduce', {'i': f'0:{partial_width}'},
        schedule=dtypes.ScheduleType.FPGA_Device,
        unroll=True)

    # First unrolled step seeds the accumulator with the identity value
    reduce_tasklet = nstate.add_tasklet(
        'reduce', {'reduce_in', 'data_in'}, {'reduce_out'}, f'''\
prev = reduce_in if i > 0 else {node.identity}
reduce_out = {reduction_expr}''')
    nstate.add_memlet_path(buffer_write,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='data_in',
                           memlet=dace.Memlet(f'{buffer_name}[i]'))

    reduce_name = 'reduce_result'
    nsdfg.add_array(reduce_name, (1, ),
                    output_data.dtype,
                    transient=True,
                    storage=dtypes.StorageType.FPGA_Local)
    reduce_read = nstate.add_access(reduce_name)
    reduce_access = nstate.add_access(reduce_name)

    if not all_axis:
        nstate.add_memlet_path(ome, reduce_read, memlet=dace.Memlet())

    nstate.add_memlet_path(reduce_read,
                           reduce_entry,
                           reduce_tasklet,
                           dst_conn='reduce_in',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))
    nstate.add_memlet_path(reduce_tasklet,
                           reduce_exit,
                           reduce_access,
                           src_conn='reduce_out',
                           memlet=dace.Memlet(f'{reduce_name}[0]'))

    if not all_axis:
        # Write out the result
        nstate.add_memlet_path(reduce_access, omx, w, memlet=outm)
    else:
        nstate.add_memlet_path(reduce_access, w, memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    nsdfg.validate()
    return nsdfg
def apply_to(cls,
             sdfg: SDFG,
             options: Optional[Dict[str, Any]] = None,
             expr_index: int = 0,
             verify: bool = True,
             strict: bool = False,
             save: bool = True,
             **where: Union[nd.Node, SDFGState]):
    """ Applies this transformation to a given subgraph, defined by a set
        of nodes.

        Raises an error if arguments are invalid or the transformation is
        not applicable. The subgraph is defined by the ``where`` keyword
        arguments, whose keys are taken from the ``PatternNode`` fields of
        the transformation. For example, applying ``MapCollapse`` on two
        maps can be performed as follows:

        ```
        MapCollapse.apply_to(sdfg, outer_map_entry=map_a,
                             inner_map_entry=map_b)
        ```

        :param sdfg: The SDFG to apply the transformation to.
        :param options: A set of parameters to use for applying the
                        transformation.
        :param expr_index: The pattern expression index to try to match
                           with.
        :param verify: Check that `can_be_applied` returns True before
                       applying.
        :param strict: Apply transformation in strict mode.
        :param save: Save transformation as part of the SDFG file. Set to
                     False if composing transformations.
        :param where: A dictionary of node names (from the transformation)
                      to nodes in the SDFG or a single state.
    """
    if len(where) == 0:
        raise ValueError('At least one node is required')
    options = options or {}

    # Check that all keyword arguments are nodes and if interstate or not.
    # Any one argument suffices, since all must live in the same graph.
    probe = next(iter(where.values()))

    if isinstance(probe, SDFGState):
        graph, state_id = sdfg, -1
    elif isinstance(probe, nd.Node):
        graph = next(s for s in sdfg.nodes() if probe in s.nodes())
        state_id = sdfg.node_id(graph)
    else:
        raise TypeError('Invalid node type "%s"' % type(probe).__name__)

    # Check that all nodes in the pattern are set
    pattern_nodes = cls.expressions()[expr_index].nodes()
    name_to_pnode = {
        pname: pval
        for pname, pval in cls._get_pattern_nodes().items()
        if pval in pattern_nodes
    }
    required = set(name_to_pnode)
    missing = required - set(where.keys())
    if len(missing) > 0:
        raise ValueError('Missing nodes for transformation subgraph: %s' %
                         missing)

    # Construct subgraph and instantiate transformation
    subgraph = {
        name_to_pnode[name]: graph.node_id(where[name])
        for name in required
    }
    instance = cls(sdfg.sdfg_id, state_id, subgraph, expr_index)

    # Set transformation properties from the options dictionary
    for prop_name, prop_value in options.items():
        if prop_name not in cls.__properties__:
            raise ValueError('Property "%s" not found in transformation' %
                             prop_name)
        setattr(instance, prop_name, prop_value)

    if verify and not cls.can_be_applied(
            graph, subgraph, expr_index, sdfg, strict=strict):
        raise ValueError('Transformation cannot be applied on the '
                         'given subgraph ("can_be_applied" failed)')

    # Apply to SDFG
    return instance.apply_pattern(sdfg, append=save)
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """ Expands a Reduce library node into a CUDA block-wide reduction
        tasklet using CUB's ``BlockReduce``.

        Requires a full reduction (all axes), register-storage input and
        output, a specified identity value, and a constant, statically
        known thread-block size (must be called from within a GPU kernel).

        :param node: The Reduce node to expand.
        :param state: The state containing the node.
        :param sdfg: The SDFG containing the state (global CUDA code is
                     appended to it).
        :return: The CPP tasklet replacing the library node.
        :raises ValueError: If any of the above requirements is violated.
    """
    node.validate(sdfg, state)
    input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(input_edge.data.subset)
    input_data = sdfg.arrays[input_edge.data.data]
    output_data = sdfg.arrays[output_edge.data.data]

    # Setup all locations in which code will be written
    cuda_globalcode = CodeIOStream()
    localcode = CodeIOStream()

    # Try to autodetect reduction type
    redtype = detect_reduction_type(node.wcr)

    node_id = state.node_id(node)
    state_id = sdfg.node_id(state)
    # Unique identifier for generated symbols (functor/typedef names)
    idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                           state=state_id,
                                           node=node_id)

    # Obtain some SDFG-related information
    input_memlet = input_edge.data
    output_memlet = output_edge.data

    output_type = 'dace::vec<%s, %s>' % (
        sdfg.arrays[output_memlet.data].dtype.ctype, output_memlet.veclen)

    if node.identity is None:
        raise ValueError('For device reduce nodes, initial value must be '
                         'specified')

    # Create a functor or use an existing one for reduction
    if redtype == dtypes.ReductionType.Custom:
        body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
        cuda_globalcode.write(
            """
struct __reduce_{id} {{
    template <typename T>
    DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
        {contents}
    }}
}};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id,
            node_id)
        reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
    elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
        reduce_op = ''
    else:
        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                      (credtype, output_type)) + ', ' + symstr(node.identity))
    # NOTE(review): `reduce_op` does not appear to be used later in this
    # function (only `redop` is) — confirm whether it is dead code here.

    # Try to obtain the number of threads in the block, or use the default
    # configuration
    block_threads = devicelevel_block_size(sdfg, state, node)
    if block_threads is not None:
        block_threads = functools.reduce(lambda a, b: a * b, block_threads, 1)

    # Checks
    if block_threads is None:
        raise ValueError('Block-wide GPU reduction must occur within'
                         ' a GPU kernel')
    if issymbolic(block_threads, sdfg.constants):
        raise ValueError('Block size has to be constant for block-wide '
                         'reduction (got %s)' % str(block_threads))
    if (node.axes is not None and len(node.axes) < input_dims):
        raise ValueError(
            'Only full reduction is supported for block-wide reduce,'
            ' please use the pure expansion')
    if (input_data.storage != dtypes.StorageType.Register
            or output_data.storage != dtypes.StorageType.Register):
        raise ValueError(
            'Block-wise reduction only supports GPU register inputs '
            'and outputs')
    if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES:
        raise ValueError('%s block reduction not supported' % redtype)

    credtype = 'dace::ReductionType::' + str(
        redtype)[str(redtype).find('.') + 1:]
    if redtype == dtypes.ReductionType.Custom:
        redop = '__reduce_%s()' % idstr
    else:
        redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)

    # Allocate shared memory for block reduce
    localcode.write("""
    typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id};
    __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id};
        """.format(id=idstr,
                   type=output_data.dtype.ctype,
                   numthreads=block_threads))

    # NOTE(review): `input` is computed here but the format call below
    # passes `input=input_memlet.data` instead — confirm which is intended.
    input = (input_memlet.data + ' + ' +
             cpp_array_expr(sdfg, input_memlet, with_brackets=False))
    output = cpp_array_expr(sdfg, output_memlet)
    localcode.write("""
        {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop});
        """.format(id=idstr,
                   redop=redop,
                   input=input_memlet.data,
                   output=output))

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce', {'_in'}, {'_out'},
                               localcode.getvalue(),
                               language=dace.Language.CPP)

    # Add the rest of the code
    sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')

    # Rename outer connectors and add to node
    input_edge._dst_conn = '_in'
    output_edge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    # HACK: Workaround to avoid issues with code generator inferring reads
    # and writes when it shouldn't.
    input_edge.data.num_accesses = dtypes.DYNAMIC
    output_edge.data.num_accesses = dtypes.DYNAMIC

    return tnode
def apply(self, sdfg: SDFG):
    """Inline a multi-state nested SDFG node into its parent SDFG.

    The nested SDFG's states are added directly to the parent state machine,
    replacing the state that contained the nested SDFG node. Metadata
    (global/init/exit code, environments, constants), transients, and
    interstate assignments are merged into the parent, renaming on clashes.

    :param sdfg: The parent SDFG to inline into.
    :return: The list of states that were inlined (the nested SDFG's nodes).
    """
    outer_state: SDFGState = sdfg.nodes()[self.state_id]
    nsdfg_node = self.nested_sdfg(sdfg)
    nsdfg: SDFG = nsdfg_node.sdfg

    # Propagate the nested node's schedule into the nested SDFG before
    # merging, so storage/schedule defaults are resolved consistently
    if nsdfg_node.schedule is not dtypes.ScheduleType.Default:
        infer_types.set_default_schedule_and_storage_types(
            nsdfg, nsdfg_node.schedule)

    #######################################################
    # Collect and update top-level SDFG metadata

    # Global/init/exit code
    for loc, code in nsdfg.global_code.items():
        sdfg.append_global_code(code.code, loc)
    for loc, code in nsdfg.init_code.items():
        sdfg.append_init_code(code.code, loc)
    for loc, code in nsdfg.exit_code.items():
        sdfg.append_exit_code(code.code, loc)

    # Environments: every code node inherits the nested node's environments
    for nstate in nsdfg.nodes():
        for node in nstate.nodes():
            if isinstance(node, nodes.CodeNode):
                node.environments |= nsdfg_node.environments

    # Constants: merge, warning (not failing) on conflicting values
    for cstname, cstval in nsdfg.constants.items():
        if cstname in sdfg.constants:
            if cstval != sdfg.constants[cstname]:
                warnings.warn('Constant value mismatch for "%s" while '
                              'inlining SDFG. Inner = %s != %s = outer' %
                              (cstname, cstval, sdfg.constants[cstname]))
        else:
            sdfg.add_constant(cstname, cstval)

    # Find original source/destination edges (there is only one edge per
    # connector, according to match)
    inputs: Dict[str, MultiConnectorEdge] = {}
    outputs: Dict[str, MultiConnectorEdge] = {}
    input_set: Dict[str, str] = {}
    output_set: Dict[str, str] = {}
    for e in outer_state.in_edges(nsdfg_node):
        inputs[e.dst_conn] = e
        input_set[e.data.data] = e.dst_conn
    for e in outer_state.out_edges(nsdfg_node):
        outputs[e.src_conn] = e
        output_set[e.data.data] = e.src_conn

    # Replace symbols using invocation symbol mapping
    # Two-step replacement (N -> __dacesym_N --> map[N]) to avoid clashes
    symbolic.safe_replace(nsdfg_node.symbol_mapping, nsdfg.replace_dict)

    # NOTE(review): the original carries a large commented-out block here
    # that detected access nodes requiring reshapes (shape/stride mismatch
    # between inner array and outer memlet); it is disabled and elided —
    # recover from version control if view-based reshaping is revived.

    # Mapping from nested transient name to top-level name
    transients: Dict[str, str] = {}

    # All transients become transients of the parent (if data already
    # exists, find new name)
    for nstate in nsdfg.nodes():
        for node in nstate.nodes():
            if isinstance(node, nodes.AccessNode):
                datadesc = nsdfg.arrays[node.data]
                if node.data not in transients and datadesc.transient:
                    new_name = node.data
                    if (new_name in sdfg.arrays or new_name in sdfg.symbols
                            or new_name in sdfg.constants):
                        # Prefix with the nested SDFG label to disambiguate
                        new_name = f'{nsdfg.label}_{node.data}'

                    name = sdfg.add_datadesc(new_name,
                                             datadesc,
                                             find_new_name=True)
                    transients[node.data] = name

        # All transients of edges between code nodes are also added to parent
        for edge in nstate.edges():
            if (isinstance(edge.src, nodes.CodeNode)
                    and isinstance(edge.dst, nodes.CodeNode)):
                if edge.data.data is not None:
                    datadesc = nsdfg.arrays[edge.data.data]
                    if edge.data.data not in transients and datadesc.transient:
                        new_name = edge.data.data
                        if (new_name in sdfg.arrays
                                or new_name in sdfg.symbols
                                or new_name in sdfg.constants):
                            new_name = f'{nsdfg.label}_{edge.data.data}'

                        name = sdfg.add_datadesc(new_name,
                                                 datadesc,
                                                 find_new_name=True)
                        transients[edge.data.data] = name

    #######################################################
    # Replace data on inlined SDFG nodes/edges

    # Replace data names with their top-level counterparts
    repldict = {}
    repldict.update(transients)
    repldict.update({
        k: v.data.data
        for k, v in itertools.chain(inputs.items(), outputs.items())
    })

    symbolic.safe_replace(repldict, nsdfg.replace_dict, value_as_string=True)

    # NOTE(review): two further commented-out blocks elided here — adding
    # views for reshaped data and inserting extra access nodes for view
    # in/out nodes. Both are disabled in the original.

    # Make unique names for states
    statenames = set(s.label for s in sdfg.nodes())
    for nstate in nsdfg.nodes():
        if nstate.label in statenames:
            newname = data.find_new_name(nstate.label, statenames)
            statenames.add(newname)
            nstate.set_label(newname)

    #######################################################
    # Collect and modify interstate edges as necessary

    outer_assignments = set()
    for e in sdfg.edges():
        outer_assignments |= e.data.assignments.keys()

    inner_assignments = set()
    for e in nsdfg.edges():
        inner_assignments |= e.data.assignments.keys()

    # Rename inner interstate assignments that would collide with outer ones
    assignments_to_replace = inner_assignments & outer_assignments
    sym_replacements: Dict[str, str] = {}
    allnames = set(sdfg.symbols.keys()) | set(sdfg.arrays.keys())
    for assign in assignments_to_replace:
        newname = data.find_new_name(assign, allnames)
        allnames.add(newname)
        sym_replacements[assign] = newname
    nsdfg.replace_dict(sym_replacements)

    #######################################################
    # Add nested SDFG states into top-level SDFG

    outer_start_state = sdfg.start_state

    sdfg.add_nodes_from(nsdfg.nodes())
    for ise in nsdfg.edges():
        sdfg.add_edge(ise.src, ise.dst, ise.data)

    #######################################################
    # Reconnect inlined SDFG

    source = nsdfg.start_state
    sinks = nsdfg.sink_nodes()

    # Reconnect state machine: predecessors feed the nested start state,
    # successors hang off every nested sink state
    for e in sdfg.in_edges(outer_state):
        sdfg.add_edge(e.src, source, e.data)
    for e in sdfg.out_edges(outer_state):
        for sink in sinks:
            sdfg.add_edge(sink, e.dst, e.data)

    # Modify start state as necessary
    if outer_start_state is outer_state:
        sdfg.start_state = sdfg.node_id(source)

    # TODO: Modify memlets by offsetting
    # NOTE(review): a large commented-out section elided here — rerouting
    # memlet paths of input/output access nodes through the outer edges
    # (unsqueeze), plus reshape handling. Disabled in the original.

    # Replace nested SDFG parents with new SDFG
    for nstate in nsdfg.nodes():
        nstate.parent = sdfg
        for node in nstate.nodes():
            if isinstance(node, nodes.NestedSDFG):
                node.sdfg.parent_sdfg = sdfg
                node.sdfg.parent_nsdfg_node = node

    #######################################################
    # Remove nested SDFG and state
    sdfg.remove_node(outer_state)

    return nsdfg.nodes()
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce node to a device-wide GPU reduction using CUB.

    Generates a call to ``cub::DeviceReduce`` (full reduction) or
    ``cub::DeviceSegmentedReduce`` (reduction over the last axes), together
    with the host/CUDA glue code (storage query, allocation, free), and
    replaces the library node with a CPP tasklet that invokes it.

    :param node: The Reduce library node being expanded.
    :param state: The state containing the node.
    :param sdfg: The SDFG containing the state.
    :return: A Tasklet node that calls the generated reduction function.
    :raises ValueError: If no identity value is given, or data is not in
        GPU-global/pinned memory.
    :raises NotImplementedError: For reductions over non-trailing axes.
    """
    node.validate(sdfg, state)
    input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(input_edge.data.subset)
    output_dims = len(output_edge.data.subset)
    input_data = sdfg.arrays[input_edge.data.data]
    output_data = sdfg.arrays[output_edge.data.data]

    # Setup all locations in which code will be written
    cuda_globalcode = CodeIOStream()
    cuda_initcode = CodeIOStream()
    cuda_exitcode = CodeIOStream()
    host_globalcode = CodeIOStream()
    host_localcode = CodeIOStream()

    # Try to autodetect reduction type
    redtype = detect_reduction_type(node.wcr)

    # Unique identifier for the generated symbols (per SDFG/state/node)
    node_id = state.node_id(node)
    state_id = sdfg.node_id(state)
    idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                           state=state_id,
                                           node=node_id)

    output_memlet = output_edge.data
    output_type = 'dace::vec<%s, %s>' % (
        sdfg.arrays[output_memlet.data].dtype.ctype, output_memlet.veclen)

    # CUB's Reduce overloads used here require an initial value
    if node.identity is None:
        raise ValueError('For device reduce nodes, initial value must be '
                         'specified')

    # Create a functor or use an existing one for reduction
    if redtype == dtypes.ReductionType.Custom:
        body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
        cuda_globalcode.write(
            """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
            state_id, node_id)
        reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
    elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
        # Sum/Min/Max etc. map to dedicated CUB entry points; no functor
        reduce_op = ''
    else:
        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                      (credtype, output_type)) + ', ' +
                     symstr(node.identity))

    # Obtain some SDFG-related information
    input_memlet = input_edge.data
    reduce_shape = input_memlet.subset.bounding_box_size()
    num_items = ' * '.join(symstr(s) for s in reduce_shape)
    # Pointer expressions offset to the start of the accessed region
    input = (input_memlet.data + ' + ' +
             cpp_array_expr(sdfg, input_memlet, with_brackets=False))
    output = (output_memlet.data + ' + ' +
              cpp_array_expr(sdfg, output_memlet, with_brackets=False))

    input_dims = input_memlet.subset.dims()
    output_dims = output_memlet.subset.data_dims()

    reduce_all_axes = (node.axes is None or len(node.axes) == input_dims)
    if reduce_all_axes:
        reduce_last_axes = False
    else:
        # True iff the reduced axes are exactly the trailing ones
        reduce_last_axes = sorted(node.axes) == list(
            range(input_dims - len(node.axes), input_dims))

    if (not reduce_all_axes) and (not reduce_last_axes):
        raise NotImplementedError(
            'Multiple axis reductions not supported on GPUs. Please use '
            'the pure expansion or make reduce axes the last in the array.')

    # Verify that data is on the GPU
    if input_data.storage not in [
            dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
    ]:
        raise ValueError('Input of GPU reduction must either reside '
                         ' in global GPU memory or pinned CPU memory')
    if output_data.storage not in [
            dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
    ]:
        raise ValueError('Output of GPU reduction must either reside '
                         ' in global GPU memory or pinned CPU memory')

    # Determine reduction type (specialized CUB kernel name, or generic)
    kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype]
             if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else
             'Reduce')

    # Create temp memory for this GPU (CUB requires caller-owned storage)
    cuda_globalcode.write(
        """
        void *__cub_storage_{sdfg}_{state}_{node} = NULL;
        size_t __cub_ssize_{sdfg}_{state}_{node} = 0;
    """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg,
        state_id, node)

    if reduce_all_axes:
        reduce_type = 'DeviceReduce'
        reduce_range = num_items
        reduce_range_def = 'size_t num_items'
        reduce_range_use = 'num_items'
        reduce_range_call = num_items
    elif reduce_last_axes:
        # Segmented reduction: leading axes form segments, trailing axes
        # are reduced within each segment
        num_reduce_axes = len(node.axes)
        not_reduce_axes = reduce_shape[:-num_reduce_axes]
        reduce_axes = reduce_shape[-num_reduce_axes:]

        num_segments = ' * '.join([symstr(s) for s in not_reduce_axes])
        segment_size = ' * '.join([symstr(s) for s in reduce_axes])

        reduce_type = 'DeviceSegmentedReduce'
        iterator = 'dace::stridedIterator({size})'.format(size=segment_size)
        reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments,
                                                      it=iterator)
        reduce_range_def = 'size_t num_segments, size_t segment_size'
        iterator_use = 'dace::stridedIterator(segment_size)'
        reduce_range_use = 'num_segments, {it}, {it} + 1'.format(
            it=iterator_use)
        reduce_range_call = '%s, %s' % (num_segments, segment_size)

    # Call CUB to get the storage size, allocate and free it
    cuda_initcode.write(
        """
        cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node},
                                    ({intype}*)nullptr, ({outtype}*)nullptr,
                                    {reduce_range}{redop});
        cudaMalloc(&__cub_storage_{sdfg}_{state}_{node},
                   __cub_ssize_{sdfg}_{state}_{node});
    """.format(sdfg=sdfg.name,
               state=state_id,
               node=node_id,
               reduce_type=reduce_type,
               reduce_range=reduce_range,
               redop=reduce_op,
               intype=input_data.dtype.ctype,
               outtype=output_data.dtype.ctype,
               kname=kname), sdfg, state_id, node)

    cuda_exitcode.write(
        'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format(
            sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
        node)

    # Write reduction function definition
    cuda_globalcode.write("""
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output,
                                      {reduce_range_def},
                                      cudaStream_t stream);
void __dace_reduce_{id}({intype} *input, {outtype} *output,
                        {reduce_range_def}, cudaStream_t stream)
{{
    cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id},
                                input, output, {reduce_range_use}{redop},
                                stream);
}}
""".format(id=idstr,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           reduce_type=reduce_type,
           reduce_range_def=reduce_range_def,
           reduce_range_use=reduce_range_use,
           kname=kname,
           redop=reduce_op))

    # Write reduction function definition in caller file
    host_globalcode.write(
        """
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output,
                                      {reduce_range_def},
                                      cudaStream_t stream);
""".format(id=idstr,
           reduce_range_def=reduce_range_def,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype), sdfg, state_id, node)

    # Call reduction function where necessary
    host_localcode.write(
        '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);'
        .format(id=idstr,
                input=input,
                output=output,
                reduce_range_call=reduce_range_call))

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce', {'_in'}, {'_out'},
                               host_localcode.getvalue(),
                               language=dace.Language.CPP)

    # Add the rest of the code
    sdfg.append_global_code(host_globalcode.getvalue())
    sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')
    sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda')
    sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda')

    # Rename outer connectors and add to node
    input_edge._dst_conn = '_in'
    output_edge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    # HACK: Workaround to avoid issues with code generator inferring reads
    # and writes when it shouldn't.
    input_edge.data.num_accesses = dtypes.DYNAMIC
    output_edge.data.num_accesses = dtypes.DYNAMIC

    return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce node to a pure (library-free) nested SDFG.

    Builds a nested SDFG containing an optional initialization state (when
    an identity value is given), an optional outer map over the output
    (non-reduced) dimensions, an inner map over the reduced dimensions, and
    an identity tasklet whose output memlet performs the reduction via
    write-conflict resolution (WCR).

    :param node: The Reduce library node being expanded.
    :param state: The state containing the node.
    :param sdfg: The SDFG containing the state.
    :return: The nested SDFG implementing the reduction.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes: None means "reduce over all dimensions"
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)

    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    # If identity is defined, add an initialization state
    if node.identity is not None:
        init_state = nsdfg.add_state()
        nstate = nsdfg.add_state()
        nsdfg.add_edge(init_state, nstate, dace.InterstateEdge())

        # Add initialization as a map that sets every output element to
        # the identity value
        init_state.add_mapped_tasklet(
            'reduce_init', {
                '_o%d' % i: '0:%s' % symstr(d)
                for i, d in enumerate(outedge.data.subset.size())
            }, {},
            'out = %s' % node.identity, {
                'out':
                dace.Memlet.simple(
                    '_out',
                    ','.join(['_o%d' % i for i in range(output_dims)]))
            },
            external_edges=True)
    else:
        nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet:
        # reduced dims use inner-map indices (_i*), kept dims use outer
        # map indices (_o*)
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = dace.Memlet.simple(
            '_out',
            ','.join(['_o%d' % i for i in range(output_dims)]),
            wcr_str=node.wcr)
        inmm = dace.Memlet.simple('_in', ','.join(input_subset))
    else:
        # Full reduction: scalar output, no outer map
        ome, omx = None, None
        outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = dace.Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction; the actual reduction happens via
    # the WCR on the output memlet
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    # FIX: '_out' is the write target of the memlet paths below; the
    # original used add_read here despite naming the node `w`
    w = nstate.add_write('_out')
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return nsdfg
from dace.sdfg import SDFG
from dace.memlet import Memlet

# Constructs an SDFG with two consecutive tasklets
# NOTE(review): this script appears truncated — SDFG construction stops
# after adding a single edge; the remaining edges, validation and
# execution are missing. Confirm against the original test file.
if __name__ == '__main__':
    print('Multidimensional offset and stride test')

    # Externals (parameters, symbols)
    N = dp.symbol('N')
    N.set(20)
    # NOTE(review): `input` and `output` shadow Python builtins; harmless
    # in a script but worth renaming.
    input = dp.ndarray([N, N], dp.float32)
    output = dp.ndarray([4, 3], dp.float32)
    input[:] = (np.random.rand(N.get(), N.get()) * 5).astype(dp.float32.type)
    output[:] = dp.float32(0)

    # Construct SDFG
    mysdfg = SDFG('offset_stride')
    state = mysdfg.add_state()
    # 'A' is a 6x6 window into the NxN array, offset by (2, 3)
    A_ = state.add_array('A', [6, 6],
                         dp.float32,
                         offset=[2, 3],
                         strides=[N, 1],
                         total_size=N * N)
    # 'B' uses negative offsets, shifting indices by (+1, +1)
    B_ = state.add_array('B', [3, 2],
                         dp.float32,
                         offset=[-1, -1],
                         strides=[3, 1],
                         total_size=12)
    map_entry, map_exit = state.add_map('mymap', [('i', '1:4'), ('j', '1:3')])
    tasklet = state.add_tasklet('mytasklet', {'a'}, {'b'}, 'b = a')
    state.add_edge(map_entry, None, tasklet, 'a', Memlet.simple(A_, 'i,j'))
def state_parent_tree(sdfg: SDFG) -> Dict[SDFGState, SDFGState]:
    """
    Computes an upward-pointing tree of each state, pointing to the "parent
    state" it belongs to (in terms of structured control flow). More
    formally, each state is either mapped to its immediate dominator with
    out degree > 1, one state upwards if the state occurs after a loop, or
    the start state if no such states exist.

    :param sdfg: The SDFG to analyze.
    :return: A dictionary that maps each state to a parent state, or None
             if the root (start) state.
    """
    idom = nx.immediate_dominators(sdfg.nx, sdfg.start_state)
    alldoms = all_dominators(sdfg, idom)
    loopexits: Dict[SDFGState, SDFGState] = {}

    # First, annotate loops: for each loop guard, find the loop exit state
    for state in sdfg:
        loopexits[state] = None
    for cycle in sdfg.find_cycles():
        for v in cycle:
            if loopexits[v] is not None:
                continue

            # Natural loops = one edge leads back to loop, another leads out
            in_edges = sdfg.in_edges(v)
            out_edges = sdfg.out_edges(v)

            # A loop guard has two or more incoming edges (1 increment and
            # n init, all identical), and exactly two outgoing edges (loop
            # and exit loop).
            if len(in_edges) < 2 or len(out_edges) != 2:
                continue

            # The outgoing edges must be negations of one another.
            if out_edges[0].data.condition_sympy() != (sp.Not(
                    out_edges[1].data.condition_sympy())):
                continue

            # Make sure the entire cycle is dominated by this node. If not,
            # we're looking at a guard for a nested cycle, which we ignore
            # for this cycle.
            if any(v not in alldoms[u] for u in cycle if u is not v):
                continue

            # Identify which successor stays in the cycle (loop body) and
            # which one leaves it (loop exit)
            loop_state = None
            exit_state = None
            if out_edges[0].dst in cycle and out_edges[1].dst not in cycle:
                loop_state = out_edges[0].dst
                exit_state = out_edges[1].dst
            elif out_edges[1].dst in cycle and out_edges[0].dst not in cycle:
                loop_state = out_edges[1].dst
                exit_state = out_edges[0].dst
            if loop_state is None or exit_state is None:
                continue
            loopexits[v] = exit_state

    # Get dominators
    parents: Dict[SDFGState, SDFGState] = {}
    step_up: Set[SDFGState] = set()
    for state in sdfg.nodes():
        curdom = idom[state]
        if curdom == state:
            # Start state: it is its own immediate dominator
            parents[state] = None
            continue

        # Walk up the dominator chain to the nearest branching dominator
        while curdom != idom[curdom]:
            if sdfg.out_degree(curdom) > 1:
                break
            curdom = idom[curdom]

        if sdfg.out_degree(curdom) == 2 and loopexits[curdom] is not None:
            p = state
            while p != curdom and p != loopexits[curdom]:
                p = idom[p]
            if p == loopexits[curdom]:
                # Dominated by loop exit: do one more step up
                step_up.add(state)

        parents[state] = curdom

    # Step up (deferred so parents[] is fully populated first)
    for state in step_up:
        if parents[state] is not None:
            parents[state] = parents[parents[state]]

    return parents
def _stateorder_topological_sort(
        sdfg: SDFG,
        start: SDFGState,
        ptree: Dict[SDFGState, SDFGState],
        branch_merges: Dict[SDFGState, SDFGState],
        stop: SDFGState = None) -> Iterator[SDFGState]:
    """
    Helper function for ``stateorder_topological_sort``.

    :param sdfg: SDFG.
    :param start: Starting state for traversal.
    :param ptree: State parent tree (computed from ``state_parent_tree``).
    :param branch_merges: Dictionary mapping from branch state to its merge
                          state.
    :param stop: Stopping state to not traverse through (merge state of a
                 branch or guard state of a loop).
    :return: Generator that yields states in state-order from ``start`` to
             ``stop``.
    """
    # Traverse states in custom order
    visited = set()
    if stop is not None:
        visited.add(stop)
    stack = [start]
    while stack:
        node = stack.pop()
        if node in visited:
            continue
        yield node
        # FIX: mark the yielded state as visited; without this, a state
        # pushed onto the stack by multiple predecessors (unstructured
        # control flow) would be yielded more than once. The recursive
        # branches below already maintain `visited` for their yields.
        visited.add(node)

        oe = sdfg.out_edges(node)
        if len(oe) == 0:  # End state
            continue
        elif len(oe) == 1:  # No traversal change
            stack.append(oe[0].dst)
            continue
        elif len(oe) == 2:  # Loop or branch
            # If loop, traverse body, then exit
            if ptree[oe[0].dst] == node and ptree[oe[1].dst] != node:
                for s in _stateorder_topological_sort(sdfg,
                                                      oe[0].dst,
                                                      ptree,
                                                      branch_merges,
                                                      stop=node):
                    yield s
                    visited.add(s)
                stack.append(oe[1].dst)
                continue
            elif ptree[oe[1].dst] == node and ptree[oe[0].dst] != node:
                for s in _stateorder_topological_sort(sdfg,
                                                      oe[1].dst,
                                                      ptree,
                                                      branch_merges,
                                                      stop=node):
                    yield s
                    visited.add(s)
                stack.append(oe[0].dst)
                continue
            # Otherwise, passthrough to branch

        # Branch: traverse each branch up to the merge state, then continue
        # from the merge state
        mergestate = branch_merges[node]
        for branch in oe:
            for s in _stateorder_topological_sort(sdfg,
                                                  branch.dst,
                                                  ptree,
                                                  branch_merges,
                                                  stop=mergestate):
                yield s
                visited.add(s)
        if mergestate != stop:
            stack.append(mergestate)
def apply(self, sdfg: sd.SDFG):
    """Convert a detected for-loop (guard/body/exit pattern) into a map.

    Wraps the loop body state's dataflow in a map over the loop's
    iteration range, then removes the loop's control-flow edges so the
    body connects directly to the exit state.

    :param sdfg: The SDFG containing the matched loop.
    """
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    body: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    itervar, (start, end, step), _ = find_for_loop(sdfg, guard, body)

    # `== True` guards against symbolic relations that are not booleans
    if (step < 0) == True:
        # If step is negative, we have to flip start and end to produce a
        # correct map with a positive increment
        start, end, step = end, start, -step

    # If necessary, make a nested SDFG with assignments
    isedge = sdfg.edges_between(guard, body)[0]
    symbols_to_remove = set()
    if len(isedge.data.assignments) > 0:
        nsdfg = helpers.nest_state_subgraph(
            sdfg, body, gr.SubgraphView(body, body.nodes()))
        # Forward free symbols of the guard->body assignments into the
        # nested SDFG, either as symbols or as read connectors
        for sym in isedge.data.free_symbols:
            if sym in nsdfg.symbol_mapping or sym in nsdfg.in_connectors:
                continue
            if sym in sdfg.symbols:
                nsdfg.symbol_mapping[sym] = symbolic.pystr_to_symbolic(sym)
                nsdfg.sdfg.add_symbol(sym, sdfg.symbols[sym])
            elif sym in sdfg.arrays:
                if sym in nsdfg.sdfg.arrays:
                    raise NotImplementedError
                rnode = body.add_read(sym)
                nsdfg.add_in_connector(sym)
                desc = copy.deepcopy(sdfg.arrays[sym])
                desc.transient = False
                nsdfg.sdfg.add_datadesc(sym, desc)
                body.add_edge(rnode, None, nsdfg, sym, memlet.Memlet(sym))

        # Move the assignments onto an init state inside the nested SDFG
        nstate = nsdfg.sdfg.node(0)
        init_state = nsdfg.sdfg.add_state_before(nstate)
        nisedge = nsdfg.sdfg.edges_between(init_state, nstate)[0]
        nisedge.data.assignments = isedge.data.assignments
        symbols_to_remove = set(nisedge.data.assignments.keys())
        for k in nisedge.data.assignments.keys():
            if k in nsdfg.symbol_mapping:
                del nsdfg.symbol_mapping[k]
        isedge.data.assignments = {}

    source_nodes = body.source_nodes()
    sink_nodes = body.sink_nodes()

    # NOTE: `map` and `exit` shadow builtins here (kept as in original)
    map = nodes.Map(body.label + "_map", [itervar], [(start, end, step)])
    entry = nodes.MapEntry(map)
    exit = nodes.MapExit(map)
    body.add_node(entry)
    body.add_node(exit)

    # If the map uses symbols from data containers, instantiate reads
    containers_to_read = entry.free_symbols & sdfg.arrays.keys()
    for rd in containers_to_read:
        # We are guaranteed that this is always a scalar, because
        # can_be_applied makes sure there are no sympy functions in each of
        # the loop expresions
        access_node = body.add_read(rd)
        body.add_memlet_path(access_node,
                             entry,
                             dst_conn=rd,
                             memlet=memlet.Memlet(rd))

    # Reroute all memlets through the entry and exit nodes
    for n in source_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.out_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(entry,
                                   e.dst,
                                   n,
                                   e.data,
                                   internal_connector=e.dst_conn)
        else:
            body.add_nedge(entry, n, memlet.Memlet())
    for n in sink_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.in_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(exit,
                                   e.src,
                                   n,
                                   e.data,
                                   internal_connector=e.src_conn)
        else:
            body.add_nedge(n, exit, memlet.Memlet())

    # Get rid of the loop exit condition edge
    after_edge = sdfg.edges_between(guard, after)[0]
    sdfg.remove_edge(after_edge)

    # Remove the assignment on the edge to the guard
    for e in sdfg.in_edges(guard):
        if itervar in e.data.assignments:
            del e.data.assignments[itervar]

    # Remove the condition on the entry edge
    condition_edge = sdfg.edges_between(guard, body)[0]
    condition_edge.data.condition = CodeBlock("1")

    # Get rid of backedge to guard
    sdfg.remove_edge(sdfg.edges_between(body, guard)[0])

    # Route body directly to after state, maintaining any other assignments
    # it might have had
    sdfg.add_edge(
        body, after,
        sd.InterstateEdge(assignments=after_edge.data.assignments))

    # If this had made the iteration variable a free symbol, we can remove
    # it from the SDFG symbols
    if itervar in sdfg.free_symbols:
        sdfg.remove_symbol(itervar)
    for sym in symbols_to_remove:
        if helpers.is_symbol_unused(sdfg, sym):
            sdfg.remove_symbol(sym)
def apply(self, sdfg: SDFG):
    """Vectorize the innermost dimension of a matched map and tasklet.

    Changes the innermost map's step to the vector length, converts
    tasklet connectors and adjacent memlets to vector types, optionally
    creates non-vectorized preamble/postamble maps for non-divisible
    ranges, and (if ``propagate_parent``) propagates the vector type
    through nested-SDFG boundaries to parent SDFGs.

    :param sdfg: The SDFG containing the matched map.
    """
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]]
    tasklet = graph.nodes()[self.subgraph[Vectorization._tasklet]]
    # Innermost map parameter is the vectorized one
    param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])

    # Create new vector size.
    vector_size = self.vector_len
    dim_from, dim_to, _ = map_entry.map.range[-1]

    # Determine whether to create preamble or postamble maps
    # (`== True`/`== False` guard against non-boolean symbolic relations)
    if self.preamble is not None:
        create_preamble = self.preamble
    else:
        create_preamble = not ((dim_from % vector_size == 0) == True
                               or dim_from == 0)
    if self.postamble is not None:
        create_postamble = self.postamble
    else:
        if isinstance(dim_to, symbolic.SymExpr):
            create_postamble = (((dim_to.approx + 1) %
                                 vector_size == 0) == False)
        else:
            create_postamble = (((dim_to + 1) % vector_size == 0) == False)

    # Determine new range for vectorized map
    if self.strided_map:
        # Iterate in element space with stride = vector length
        new_range = [dim_from, dim_to - vector_size + 1, vector_size]
    else:
        # Iterate in vector space (indices divided by vector length)
        new_range = [
            dim_from // vector_size, ((dim_to + 1) // vector_size) - 1, 1
        ]

    # Create preamble non-vectorized map (replacing the original map)
    if create_preamble:
        old_scope = graph.scope_subgraph(map_entry, True, True)
        new_scope: ScopeSubgraphView = replicate_scope(
            sdfg, graph, old_scope)
        # First aligned iteration value
        new_begin = dim_from + (vector_size - (dim_from % vector_size))
        map_entry.map.range[-1] = (dim_from, new_begin - 1, 1)
        # Replace map_entry with the replicated scope (so that the preamble
        # will usually come first in topological sort)
        map_entry = new_scope.entry
        tasklet = new_scope.nodes()[old_scope.nodes().index(tasklet)]
        new_range[0] = new_begin

    # Create postamble non-vectorized map covering the unaligned tail
    if create_postamble:
        new_scope: ScopeSubgraphView = replicate_scope(
            sdfg, graph, graph.scope_subgraph(map_entry, True, True))
        dim_to_ex = dim_to + 1
        new_scope.entry.map.range[-1] = (dim_to_ex -
                                         (dim_to_ex % vector_size), dim_to,
                                         1)

    # Change the step of the inner-most dimension.
    map_entry.map.range[-1] = tuple(new_range)

    # Vectorize connectors adjacent to the tasklet.
    for edge in graph.all_edges(tasklet):
        connectors = (tasklet.in_connectors
                      if edge.dst == tasklet else tasklet.out_connectors)
        conn = edge.dst_conn if edge.dst == tasklet else edge.src_conn

        if edge.data.data is None:  # Empty memlets
            continue
        desc = sdfg.arrays[edge.data.data]
        # Vectorize along the contiguous (stride-1) dimension
        contigidx = desc.strides.index(1)

        newlist = []

        lastindex = edge.data.subset[contigidx]
        if isinstance(lastindex, tuple):
            # Range subset: keep (begin, end, step) triples
            newlist = [(rb, re, rs) for rb, re, rs in edge.data.subset]
            symbols = set()
            for indd in lastindex:
                symbols.update(
                    symbolic.pystr_to_symbolic(indd).free_symbols)
        else:
            # Index subset: convert to degenerate ranges
            newlist = [(rb, rb, 1) for rb in edge.data.subset]
            symbols = symbolic.pystr_to_symbolic(lastindex).free_symbols

        # Only vectorize accesses indexed by the vectorized map parameter
        if str(param) not in map(str, symbols):
            continue

        # Vectorize connector, if not already vectorized
        oldtype = connectors[conn]
        if oldtype is None or oldtype.type is None:
            oldtype = desc.dtype
        if isinstance(oldtype, dtypes.vector):
            continue
        connectors[conn] = dtypes.vector(oldtype, vector_size)

        # Modify memlet subset to match vector length
        if self.strided_map:
            rb = newlist[contigidx][0]
            if self.propagate_parent:
                newlist[contigidx] = (rb / self.vector_len,
                                      rb / self.vector_len, 1)
            else:
                newlist[contigidx] = (rb, rb + self.vector_len - 1, 1)
        else:
            rb = newlist[contigidx][0]
            if self.propagate_parent:
                newlist[contigidx] = (rb, rb, 1)
            else:
                newlist[contigidx] = (self.vector_len * rb,
                                      self.vector_len * rb +
                                      self.vector_len - 1, 1)
        edge.data.subset = subsets.Range(newlist)
        edge.data.volume = vector_size

    # Vector length propagation using data descriptors, recursive traversal
    # outwards
    if self.propagate_parent:
        for edge in graph.all_edges(tasklet):
            cursdfg = sdfg
            curedge = edge
            while cursdfg is not None:
                arrname = curedge.data.data
                dtype = cursdfg.arrays[arrname].dtype

                # Change type and shape to vector
                if not isinstance(dtype, dtypes.vector):
                    cursdfg.arrays[arrname].dtype = dtypes.vector(
                        dtype, vector_size)
                new_shape = list(cursdfg.arrays[arrname].shape)
                contigidx = cursdfg.arrays[arrname].strides.index(1)
                new_shape[contigidx] /= vector_size
                try:
                    new_shape[contigidx] = int(new_shape[contigidx])
                except TypeError:
                    # Symbolic size: keep the symbolic expression
                    pass
                cursdfg.arrays[arrname].shape = new_shape

                propagation.propagate_memlets_sdfg(cursdfg)

                # Find matching edge in parent
                nsdfg = cursdfg.parent_nsdfg_node
                if nsdfg is None:
                    break
                tstate = cursdfg.parent
                curedge = ([
                    e for e in tstate.in_edges(nsdfg)
                    if e.dst_conn == arrname
                ] + [
                    e for e in tstate.out_edges(nsdfg)
                    if e.src_conn == arrname
                ])[0]
                cursdfg = cursdfg.parent_sdfg
def generate_code(sdfg) -> List[CodeObject]:
    """ Generates code as a list of code objects for a given SDFG.

        :param sdfg: The SDFG to use
        :return: List of code objects that correspond to files to compile.
        :raises RuntimeError: If the serialization self-test is enabled and
                              the round-tripped SDFG file does not match the
                              original.
    """
    # Before compiling, validate SDFG correctness
    sdfg.validate()

    if Config.get_bool('testing', 'serialization'):
        # Serialization self-test: save, reload, save again; the two files
        # must be byte-identical.
        from dace.sdfg import SDFG
        import filecmp
        sdfg.save('test.sdfg')
        sdfg2 = SDFG.from_file('test.sdfg')
        sdfg2.save('test2.sdfg')
        print('Testing SDFG serialization...')
        try:
            if not filecmp.cmp('test.sdfg', 'test2.sdfg'):
                raise RuntimeError(
                    'SDFG serialization failed - files do not match')
        finally:
            # Clean up the intermediate files even when the comparison fails;
            # previously a failed test leaked them into the working directory.
            os.remove('test.sdfg')
            os.remove('test2.sdfg')

        # Run with the deserialized version
        sdfg = sdfg2

    # Before generating the code, run type inference on the SDFG connectors
    infer_connector_types(sdfg)

    frame = framecode.DaCeCodeGenerator()

    # Instantiate CPU first (as it is used by the other code generators)
    # TODO: Refactor the parts used by other code generators out of CPU
    default_target = cpu.CPUCodeGen
    for k, v in target.TargetCodeGenerator.extensions().items():
        # If another target has already been registered as CPU, use it instead
        if v['name'] == 'cpu':
            default_target = k
    targets = {'cpu': default_target(frame, sdfg)}

    # Instantiate the rest of the targets
    targets.update({
        v['name']: k(frame, sdfg)
        for k, v in target.TargetCodeGenerator.extensions().items()
        if v['name'] not in targets
    })

    # Instantiate all instrumentation providers in SDFG. The instrumentation
    # type may live on the node itself, on its consume scope, or on its map
    # scope (checked in that order).
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    frame._dispatcher.instrumentation[
        dtypes.InstrumentationType.No_Instrumentation] = None
    for node, _ in sdfg.all_nodes_recursive():
        if hasattr(node, 'instrument'):
            frame._dispatcher.instrumentation[node.instrument] = \
                provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            frame._dispatcher.instrumentation[node.consume.instrument] = \
                provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            frame._dispatcher.instrumentation[node.map.instrument] = \
                provider_mapping[node.map.instrument]
    # Instantiate the provider classes (None stays None: no instrumentation)
    frame._dispatcher.instrumentation = {
        k: v() if v is not None else None
        for k, v in frame._dispatcher.instrumentation.items()
    }

    # Generate frame code (and the rest of the code)
    (global_code, frame_code, used_targets,
     used_environments) = frame.generate_code(sdfg, None)
    target_objects = [
        CodeObject(sdfg.name,
                   global_code + frame_code,
                   'cpp',
                   cpu.CPUCodeGen,
                   'Frame',
                   environments=used_environments)
    ]

    # Create code objects for each target
    for tgt in used_targets:
        target_objects.extend(tgt.get_generated_codeobjects())

    # add a header file for calling the SDFG
    dummy = CodeObject(sdfg.name,
                       generate_headers(sdfg),
                       'h',
                       cpu.CPUCodeGen,
                       'CallHeader',
                       linkable=False)
    target_objects.append(dummy)

    # add a dummy main function to show how to call the SDFG
    dummy = CodeObject(sdfg.name + "_main",
                       generate_dummy(sdfg),
                       'cpp',
                       cpu.CPUCodeGen,
                       'DummyMain',
                       linkable=False)
    target_objects.append(dummy)

    return target_objects
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """ Create a map around the BlockReduce node
        with in and out transients in registers
        and an if tasklet that redirects the output
        of thread 0 to a shared memory transient

        :param node: The Reduce library node to expand.
        :param state: The state containing the node.
        :param sdfg: The SDFG containing the state.
        :return: The result of re-expanding the node with the
                 'CUDA (block)' implementation (set at the end).
    """
    ### define some useful vars
    graph = state
    reduce_node = node
    # Single input and output edges of the reduce node are assumed
    in_edge = graph.in_edges(reduce_node)[0]
    out_edge = graph.out_edges(reduce_node)[0]

    # Axes being reduced over; they determine which input dimensions the
    # new map iterates over
    axes = reduce_node.axes

    ### add a map that encloses the reduce node
    # One map parameter 'i<dim>' per reduced dimension, ranging over that
    # dimension of the incoming memlet's subset
    (new_entry, new_exit) = graph.add_map(
        name='inner_reduce_block',
        ndrange={'i' + str(i): f'{rng[0]}:{rng[1]+1}:{rng[2]}'
                 for (i, rng) in enumerate(in_edge.data.subset)
                 if i in axes},
        schedule=dtypes.ScheduleType.Default)

    # NOTE(review): shadows the builtin 'map' and is unused below
    map = new_entry.map
    ExpandReduceCUDABlockAll.redirect_edge(graph, in_edge, new_dst=new_entry)
    ExpandReduceCUDABlockAll.redirect_edge(graph, out_edge, new_src=new_exit)

    # Input subset seen inside the map: non-reduced dimensions pass through;
    # reduced dimensions collapse to a single map-parameter index.
    # NOTE(review): params[0] is used for *every* reduced axis — verify this
    # is intended for multi-axis reductions.
    subset_in = subsets.Range([
        in_edge.data.subset[i] if i not in axes else
        (new_entry.map.params[0], new_entry.map.params[0], 1)
        for i in range(len(in_edge.data.subset))
    ])
    memlet_in = dace.Memlet(data=in_edge.data.data,
                            volume=1,
                            subset=subset_in)
    memlet_out = dcpy(out_edge.data)
    # Connect map entry -> reduce -> map exit
    graph.add_edge(u=new_entry,
                   u_connector=None,
                   v=reduce_node,
                   v_connector=None,
                   memlet=memlet_in)
    graph.add_edge(u=reduce_node,
                   u_connector=None,
                   v=new_exit,
                   v_connector=None,
                   memlet=memlet_out)

    ### add in and out local storage
    from dace.transformation.dataflow.local_storage import LocalStorage

    in_local_storage_subgraph = {
        LocalStorage.node_a: graph.nodes().index(new_entry),
        LocalStorage.node_b: graph.nodes().index(reduce_node)
    }
    out_local_storage_subgraph = {
        LocalStorage.node_a: graph.nodes().index(reduce_node),
        LocalStorage.node_b: graph.nodes().index(new_exit)
    }

    # Input-side local storage transient, placed in registers
    local_storage = LocalStorage(sdfg.sdfg_id, sdfg.nodes().index(state),
                                 in_local_storage_subgraph, 0)
    local_storage.array = in_edge.data.data
    local_storage.apply(sdfg)
    in_transient = local_storage._data_node
    sdfg.data(in_transient.data).storage = dtypes.StorageType.Register

    # Output-side local storage transient, also in registers
    local_storage = LocalStorage(sdfg.sdfg_id, sdfg.nodes().index(state),
                                 out_local_storage_subgraph, 0)
    local_storage.array = out_edge.data.data
    local_storage.apply(sdfg)
    out_transient = local_storage._data_node
    sdfg.data(out_transient.data).storage = dtypes.StorageType.Register

    # hack: swap edges as local_storage does not work correctly here
    # as subsets and data get assigned wrongly (should be swapped)
    # NOTE: If local_storage ever changes, this will not work any more
    e1 = graph.in_edges(out_transient)[0]
    e2 = graph.out_edges(out_transient)[0]
    e1.data.data = dcpy(e2.data.data)
    e1.data.subset = dcpy(e2.data.subset)

    ### add an if tasket and diverge
    # Build a tasklet whose code forwards the value only when every map
    # parameter equals zero (i.e. "thread 0" of the block)
    code = 'if '
    for (i, param) in enumerate(new_entry.map.params):
        # Produces e.g. 'i0== 0' — missing space, but valid Python
        code += (param + '== 0')
        # NOTE(review): joins on len(axes) while iterating map params —
        # relies on both having the same length; verify
        if i < len(axes) - 1:
            code += ' and '
    code += ':\n'
    code += '\tout=inp'
    tasklet_node = graph.add_tasklet(name='block_reduce_write',
                                     inputs=['inp'],
                                     outputs=['out'],
                                     code=code)

    # Route the output transient through the guard tasklet into the map exit
    edge_out_outtrans = graph.out_edges(out_transient)[0]
    edge_out_innerexit = graph.out_edges(new_exit)[0]
    ExpandReduceCUDABlockAll.redirect_edge(graph,
                                           edge_out_outtrans,
                                           new_dst=tasklet_node,
                                           new_dst_conn='inp')
    e = graph.add_edge(u=tasklet_node,
                       u_connector='out',
                       v=new_exit,
                       v_connector=None,
                       memlet=dcpy(edge_out_innerexit.data))
    # set dynamic with volume 0 FORNOW
    e.data.volume = 0
    e.data.dynamic = True

    ### set reduce_node axes to all (needed)
    reduce_node.axes = None

    # fill scope connectors, done.
    sdfg.fill_scope_connectors()

    # finally, change the implementation to cuda (block)
    # itself and expand again.
    reduce_node.implementation = 'CUDA (block)'
    sub_expansion = ExpandReduceCUDABlock(0, 0, {}, 0)
    return sub_expansion.expansion(node=node, state=state, sdfg=sdfg)