class MapTilingWithOverlap(MapTiling):
    """ Implements the orthogonal tiling transformation with overlap.

        Orthogonal tiling is a type of nested map fission that creates tiles
        in every dimension of the matched Map. The overlap can vary in each
        dimension and direction. It is added to each tile and the starting
        and end points of the outer map are adjusted to account for the
        overlap.
    """

    # Properties
    lower_overlap = ShapeProperty(dtype=tuple,
                                  default=None,
                                  desc="Lower overlap per dimension")
    upper_overlap = ShapeProperty(dtype=tuple,
                                  default=None,
                                  desc="Upper overlap per dimension")

    def apply(self, sdfg):
        if len(self.lower_overlap) == 0:
            return
        if len(self.upper_overlap) == 0:
            return

        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[self.map_entry]]

        # Tile the map
        self.tile_trivial = True
        super().apply(sdfg)
        tile_map_entry = graph.in_edges(map_entry)[0].src
        tile_map_exit = graph.exit_node(tile_map_entry)

        # Introduce overlap
        for lower_overlap, upper_overlap, param in zip(self.lower_overlap,
                                                       self.upper_overlap,
                                                       tile_map_entry.params):
            pystr = pystr_to_symbolic(param)
            lower_replace_dict = {pystr: pystr - lower_overlap}
            upper_replace_dict = {pystr: pystr + upper_overlap}

            # Extend the range of the inner map
            map_entry.range.ranges = [
                (r[0].subs(lower_replace_dict), r[1].subs(upper_replace_dict),
                 r[2]) for r in map_entry.range.ranges
            ]

            # Fix the memlets
            for edge in (graph.out_edges(tile_map_entry) +
                         graph.in_edges(tile_map_exit)):
                edge.data.subset.ranges = [
                    (r[0].subs(lower_replace_dict),
                     r[1].subs(upper_replace_dict), r[2])
                    for r in edge.data.subset.ranges
                ]

        # Reduce the range of the tile_map
        tile_map_entry.range.ranges = [
            (r[0] + lo, r[1] - uo, r[2]) for r, lo, uo in zip(
                tile_map_entry.range.ranges, self.lower_overlap,
                self.upper_overlap)
        ]
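# Usage sketch (illustrative, hedged): applying MapTilingWithOverlap to a
# simple element-wise program. Assumes `dace` is importable and that
# `SDFG.apply_transformations` accepts this transformation with the shown
# options; the program, symbol names (N, M) and tile sizes are examples only.
# `tile_sizes` comes from the inherited MapTiling properties, the overlaps
# from the properties defined above.
import dace

N, M = dace.symbol('N'), dace.symbol('M')

@dace.program
def example(A: dace.float64[N, M], B: dace.float64[N, M]):
    for i, j in dace.map[0:N, 0:M]:
        B[i, j] = A[i, j] * 2.0

sdfg = example.to_sdfg()
sdfg.apply_transformations(
    MapTilingWithOverlap,
    options=dict(tile_sizes=(32, 32), lower_overlap=(1, 1), upper_overlap=(1, 1)))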
class MapDimInterchange(pm.Transformation):
    """ Implements the map-dimension-interchange pattern.

        Map-dimension-interchange re-orders the dimensions of a map.
    """

    _map_entry = nodes.MapEntry(None)

    order = ShapeProperty()

    @staticmethod
    def expressions():
        return [nxutil.node_path_graph(MapDimInterchange._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ A candidate subgraph matches the map-dimension-interchange
            transformation when a map has at least two dimensions.
        """
        map_entry = graph.nodes()[candidate[MapDimInterchange._map_entry]]
        return map_entry.map.get_param_num() > 1

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = candidate[MapDimInterchange._map_entry]
        return str(map_entry)

    def apply(self, sdfg):
        """ Reorders the dimensions of the map by reordering the parameters
            and the range of the map as specified through the properties.
        """
        # Extract the map and its entry node.
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[MapDimInterchange._map_entry]]
        current_map = map_entry.map

        order = self.order
        if len(self.order) != current_map.get_param_num():
            # 'order' must be of the same length as the number of map
            # dimensions.
            return

        # Re-order the map dimensions
        current_map.params = [current_map.params[idx] for idx in order]
        current_map.range.reorder(order)

        return

    def __init__(self, *args, **kwargs):
        self.entry = nodes.EntryNode()
        self.tasklet = nodes.Tasklet('_')
        self.exit = nodes.ExitNode()
        self.pairs = None
        super().__init__(*args, **kwargs)

    def modifies_graph(self):
        return True
class MapDimShuffle(transformation.Transformation):
    """ Implements the map-dim shuffle transformation.

        MapDimShuffle takes a map and a list of params. It reorders the
        dimensions in the map such that it matches the list.
    """

    _map_entry = transformation.PatternNode(nodes.MapEntry)

    # Properties
    parameters = ShapeProperty(dtype=list,
                               default=None,
                               desc="Desired order of map parameters")

    @staticmethod
    def expressions():
        return [sdutil.node_path_graph(MapDimShuffle._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[MapDimShuffle._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg: SDFG):
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[self._map_entry]]

        if set(self.parameters) != set(map_entry.map.params):
            return

        map_entry.range.ranges = [
            r for list_param in self.parameters
            for map_param, r in zip(map_entry.map.params,
                                    map_entry.range.ranges)
            if list_param == map_param
        ]
        map_entry.map.params = self.parameters
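# Usage sketch (illustrative, hedged): making `j` the outermost map dimension
# of the `example` SDFG built in the earlier sketch. The `parameters` option
# must list exactly the existing map parameters, only in the desired order.
sdfg.apply_transformations(MapDimShuffle, options=dict(parameters=['j', 'i']))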
class MapTiling(pattern_matching.Transformation): """ Implements the orthogonal tiling transformation. Orthogonal tiling is a type of nested map fission that creates tiles in every dimension of the matched Map. """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) # Properties prefix = Property(dtype=str, default="tile", desc="Prefix for new range symbols") tile_sizes = ShapeProperty(dtype=tuple, default=(128, 128, 128), desc="Tile size per dimension") strides = ShapeProperty( dtype=tuple, default=tuple(), desc="Tile stride (enables overlapping tiles). If empty, matches tile") divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly") @staticmethod def annotates_memlets(): return True @staticmethod def expressions(): return [nxutil.node_path_graph(MapTiling._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): return True @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[MapTiling._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] tile_strides = self.tile_sizes if self.strides is not None and len(self.strides) == len(tile_strides): tile_strides = self.strides # Retrieve map entry and exit nodes. map_entry = graph.nodes()[self.subgraph[MapTiling._map_entry]] from dace.transformation.dataflow.map_collapse import MapCollapse from dace.transformation.dataflow.strip_mining import StripMining stripmine_subgraph = { StripMining._map_entry: self.subgraph[MapTiling._map_entry] } sdfg_id = sdfg.sdfg_list.index(sdfg) last_map_entry = None removed_maps = 0 original_schedule = map_entry.schedule for dim_idx in range(len(map_entry.map.params)): if dim_idx >= len(self.tile_sizes): tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[-1]) tile_stride = symbolic.pystr_to_symbolic(tile_strides[-1]) else: tile_size = symbolic.pystr_to_symbolic( self.tile_sizes[dim_idx]) tile_stride = symbolic.pystr_to_symbolic(tile_strides[dim_idx]) dim_idx -= removed_maps # If tile size is trivial, skip strip-mining map dimension if tile_size == map_entry.map.range.size()[dim_idx]: continue stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph, self.expr_index) # Special case: Tile size of 1 should be omitted from inner map if tile_size == 1 and tile_stride == 1: stripmine.dim_idx = dim_idx stripmine.new_dim_prefix = '' stripmine.tile_size = str(tile_size) stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = True stripmine.apply(sdfg) removed_maps += 1 else: stripmine.dim_idx = dim_idx stripmine.new_dim_prefix = self.prefix stripmine.tile_size = str(tile_size) stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = self.divides_evenly stripmine.apply(sdfg) # apply to the new map the schedule of the original one map_entry.schedule = original_schedule if last_map_entry: new_map_entry = graph.in_edges(map_entry)[0].src mapcollapse_subgraph = { MapCollapse._outer_map_entry: graph.node_id(last_map_entry), MapCollapse._inner_map_entry: graph.node_id(new_map_entry) } mapcollapse = MapCollapse(sdfg_id, self.state_id, mapcollapse_subgraph, 0) mapcollapse.apply(sdfg) last_map_entry = graph.in_edges(map_entry)[0].src
class Data(object):
    """ Data type descriptors that can be used as references to memory.
        Examples: Arrays, Streams, custom arrays (e.g., sparse matrices).
    """

    dtype = TypeClassProperty(default=dtypes.int32, choices=dtypes.Typeclasses)
    shape = ShapeProperty(default=[])
    transient = Property(dtype=bool, default=False)
    storage = EnumProperty(dtype=dtypes.StorageType,
                           desc="Storage location",
                           default=dtypes.StorageType.Default)
    lifetime = EnumProperty(dtype=dtypes.AllocationLifetime,
                            desc='Data allocation span',
                            default=dtypes.AllocationLifetime.Scope)
    location = DictProperty(
        key_type=str,
        value_type=symbolic.pystr_to_symbolic,
        desc='Full storage location identifier (e.g., rank, GPU ID)')
    debuginfo = DebugInfoProperty(allow_none=True)

    def __init__(self, dtype, shape, transient, storage, location, lifetime,
                 debuginfo):
        self.dtype = dtype
        self.shape = shape
        self.transient = transient
        self.storage = storage
        self.location = location if location is not None else {}
        self.lifetime = lifetime
        self.debuginfo = debuginfo
        self._validate()

    def validate(self):
        """ Validate the correctness of this object.
            Raises an exception on error. """
        self._validate()

    # Validation of this class is in a separate function, so that this
    # class can call `_validate()` without calling the subclasses'
    # `validate` function.
    def _validate(self):
        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol,
                                  symbolic.sympy.Basic)) for s in self.shape):
            raise TypeError('Shape must be a list or tuple of integer values '
                            'or symbols')
        return True

    def to_json(self):
        attrs = serialize.all_properties_to_json(self)
        retdict = {"type": type(self).__name__, "attributes": attrs}
        return retdict

    @property
    def toplevel(self):
        return self.lifetime is not dtypes.AllocationLifetime.Scope

    def copy(self):
        raise RuntimeError(
            'Data descriptors are unique and should not be copied')

    def is_equivalent(self, other):
        """ Check for equivalence (shape and type) of two data descriptors. """
        raise NotImplementedError

    def as_arg(self, with_types=True, for_call=False, name=None):
        """ Returns a string for a C++ function signature (e.g., `int *A`). """
        raise NotImplementedError

    @property
    def free_symbols(self) -> Set[symbolic.SymbolicType]:
        """ Returns a set of undefined symbols in this data descriptor. """
        result = set()
        for s in self.shape:
            if isinstance(s, sp.Basic):
                result |= set(s.free_symbols)
        return result

    def __repr__(self):
        return 'Abstract Data Container, DO NOT USE'

    @property
    def veclen(self):
        return self.dtype.veclen if hasattr(self.dtype, "veclen") else 1

    @property
    def ctype(self):
        return self.dtype.ctype
class Array(Data): """ Array/constant descriptor (dimensions, type and other properties). """ # Properties allow_conflicts = Property( dtype=bool, default=False, desc='If enabled, allows more than one ' 'memlet to write to the same memory location without conflict ' 'resolution.') strides = ShapeProperty( # element_type=symbolic.pystr_to_symbolic, desc='For each dimension, the number of elements to ' 'skip in order to obtain the next element in ' 'that dimension.') total_size = SymbolicProperty( default=1, desc='The total allocated size of the array. Can be used for' ' padding.') offset = ListProperty(element_type=symbolic.pystr_to_symbolic, desc='Initial offset to translate all indices by.') may_alias = Property(dtype=bool, default=False, desc='This pointer may alias with other pointers in ' 'the same function') alignment = Property(dtype=int, default=0, desc='Allocation alignment in bytes (0 uses ' 'compiler-default)') def __init__(self, dtype, shape, transient=False, allow_conflicts=False, storage=dtypes.StorageType.Default, location=None, strides=None, offset=None, may_alias=False, lifetime=dtypes.AllocationLifetime.Scope, alignment=0, debuginfo=None, total_size=None): super(Array, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) if shape is None: raise IndexError('Shape must not be None') self.allow_conflicts = allow_conflicts self.may_alias = may_alias self.alignment = alignment if strides is not None: self.strides = cp.copy(strides) else: self.strides = [_prod(shape[i + 1:]) for i in range(len(shape))] self.total_size = total_size or _prod(shape) if offset is not None: self.offset = cp.copy(offset) else: self.offset = [0] * len(shape) self.validate() def __repr__(self): return '%s (dtype=%s, shape=%s)' % (type(self).__name__, self.dtype, self.shape) def clone(self): return type(self)(self.dtype, self.shape, self.transient, self.allow_conflicts, self.storage, self.location, self.strides, self.offset, self.may_alias, self.lifetime, self.alignment, self.debuginfo, self.total_size) def to_json(self): attrs = serialize.all_properties_to_json(self) # Take care of symbolic expressions attrs['strides'] = list(map(str, attrs['strides'])) retdict = {"type": type(self).__name__, "attributes": attrs} return retdict @classmethod def from_json(cls, json_obj, context=None): # Create dummy object ret = cls(dtypes.int8, ()) serialize.set_properties_from_json(ret, json_obj, context=context) # TODO: This needs to be reworked (i.e. 
integrated into the list property) ret.strides = list(map(symbolic.pystr_to_symbolic, ret.strides)) # Check validity now ret.validate() return ret def validate(self): super(Array, self).validate() if len(self.strides) != len(self.shape): raise TypeError('Strides must be the same size as shape') if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.strides): raise TypeError('Strides must be a list or tuple of integer ' 'values or symbols') if len(self.offset) != len(self.shape): raise TypeError('Offset must be the same size as shape') def covers_range(self, rng): if len(rng) != len(self.shape): return False for s, (rb, re, rs) in zip(self.shape, rng): # Shape has to be positive if isinstance(s, sp.Basic): olds = s if 'positive' in s.assumptions0: s = sp.Symbol(str(s), **s.assumptions0) else: s = sp.Symbol(str(s), positive=True, **s.assumptions0) if isinstance(rb, sp.Basic): rb = rb.subs({olds: s}) if isinstance(re, sp.Basic): re = re.subs({olds: s}) if isinstance(rs, sp.Basic): rs = rs.subs({olds: s}) try: if rb < 0: # Negative offset return False except TypeError: # cannot determine truth value of Relational pass #print('WARNING: Cannot evaluate relational expression %s, assuming true.' % (rb > 0), # 'If this expression is false, please refine symbol definitions in the program.') try: if re > s: # Beyond shape return False except TypeError: # cannot determine truth value of Relational pass #print('WARNING: Cannot evaluate relational expression %s, assuming true.' % (re < s), # 'If this expression is false, please refine symbol definitions in the program.') return True # Checks for equivalent shape and type def is_equivalent(self, other): if not isinstance(other, type(self)): return False # Test type if self.dtype != other.dtype: return False # Test dimensionality if len(self.shape) != len(other.shape): return False # Test shape for dim, otherdim in zip(self.shape, other.shape): # Any other case (constant vs. constant), check for equality if otherdim != dim: return False return True def as_arg(self, with_types=True, for_call=False, name=None): arrname = name if not with_types or for_call: return arrname if self.may_alias: return str(self.dtype.ctype) + ' *' + arrname return str(self.dtype.ctype) + ' * __restrict__ ' + arrname def sizes(self): return [ d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape ] @property def free_symbols(self): result = super().free_symbols for s in self.strides: if isinstance(s, sp.Expr): result |= set(s.free_symbols) if isinstance(self.total_size, sp.Expr): result |= set(self.total_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): result |= set(o.free_symbols) return result
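# Illustrative sketch (hedged): constructing an Array descriptor with explicit
# column-major (FORTRAN-style) strides, using the constructor shown above.
# `N` and `M` are example symbols; the assertion reflects the check performed
# by `validate()`.
import dace
from dace import data, symbolic

N = symbolic.symbol('N')
M = symbolic.symbol('M')

desc = data.Array(dace.float64, (N, M), strides=(1, N), total_size=N * M)
assert len(desc.strides) == len(desc.shape)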
class OrthogonalTiling(pattern_matching.Transformation):
    """ Implements the orthogonal tiling transformation.

        Orthogonal tiling is a type of nested map fission that creates tiles
        in every dimension of the matched Map.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    # Properties
    prefix = Property(dtype=str,
                      default="tile",
                      desc="Prefix for new iterators")
    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")
    divides_evenly = Property(dtype=bool,
                              default=False,
                              desc="Tile size divides dimension length evenly")

    @staticmethod
    def annotates_memlets():
        return False

    @staticmethod
    def expressions():
        return [nxutil.node_path_graph(OrthogonalTiling._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[OrthogonalTiling._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        # Tile map.
        target_dim, new_dim, new_map = self.__stripmine(
            sdfg, graph, self.subgraph)
        return new_map

    def __stripmine(self, sdfg, graph, candidate):
        # Retrieve map entry and exit nodes.
        map_entry = graph.nodes()[candidate[OrthogonalTiling._map_entry]]
        map_exit = graph.exit_nodes(map_entry)[0]

        # Map subgraph
        map_subgraph = graph.scope_subgraph(map_entry)

        # Retrieve transformation properties.
        prefix = self.prefix
        tile_sizes = self.tile_sizes
        divides_evenly = self.divides_evenly

        new_param = []
        new_range = []

        for dim_idx in range(len(map_entry.map.params)):
            if dim_idx >= len(tile_sizes):
                tile_size = tile_sizes[-1]
            else:
                tile_size = tile_sizes[dim_idx]

            # Retrieve parameter and range of dimension to be strip-mined.
            target_dim = map_entry.map.params[dim_idx]
            td_from, td_to, td_step = map_entry.map.range[dim_idx]
            new_dim = prefix + '_' + target_dim

            # Basic values
            if divides_evenly:
                tile_num = '(%s + 1 - %s) / %s' % (symbolic.symstr(td_to),
                                                   symbolic.symstr(td_from),
                                                   str(tile_size))
            else:
                tile_num = 'int_ceil((%s + 1 - %s), %s)' % (
                    symbolic.symstr(td_to), symbolic.symstr(td_from),
                    str(tile_size))

            # Outer map values (over all tiles)
            nd_from = 0
            nd_to = symbolic.pystr_to_symbolic(str(tile_num) + ' - 1')
            nd_step = 1

            # Inner map values (over one tile)
            td_from_new = dace.symbolic.pystr_to_symbolic(td_from)
            td_to_new_exact = symbolic.pystr_to_symbolic(
                'min(%s + 1 - %s * %s, %s + %s) - 1' %
                (symbolic.symstr(td_to), str(new_dim), str(tile_size),
                 td_from_new, str(tile_size)))
            td_to_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s - 1' % (td_from_new, str(tile_size)))

            # Outer map (over all tiles)
            new_dim_range = (nd_from, nd_to, nd_step)
            new_param.append(new_dim)
            new_range.append(new_dim_range)

            # Inner map (over one tile)
            if divides_evenly:
                td_to_new = td_to_new_approx
            else:
                td_to_new = dace.symbolic.SymExpr(td_to_new_exact,
                                                  td_to_new_approx)
            map_entry.map.range[dim_idx] = (td_from_new, td_to_new, td_step)

            # Fix subgraph memlets
            target_dim = dace.symbolic.pystr_to_symbolic(target_dim)
            offset = dace.symbolic.pystr_to_symbolic(
                '%s * %s' % (new_dim, str(tile_size)))
            for _, _, _, _, memlet in map_subgraph.edges():
                old_subset = memlet.subset
                if isinstance(old_subset, dace.subsets.Indices):
                    new_indices = []
                    for idx in old_subset:
                        new_idx = idx.subs(target_dim, target_dim + offset)
                        new_indices.append(new_idx)
                    memlet.subset = dace.subsets.Indices(new_indices)
                elif isinstance(old_subset, dace.subsets.Range):
                    new_ranges = []
                    for i, old_range in enumerate(old_subset):
                        if len(old_range) == 3:
                            b, e, s, = old_range
                            t = old_subset.tile_sizes[i]
                        else:
                            raise ValueError('Range %s is invalid.' %
                                             old_range)
                        new_b = b.subs(target_dim, target_dim + offset)
                        new_e = e.subs(target_dim, target_dim + offset)
                        new_s = s.subs(target_dim, target_dim + offset)
                        new_t = t.subs(target_dim, target_dim + offset)
                        new_ranges.append((new_b, new_e, new_s, new_t))
                    memlet.subset = dace.subsets.Range(new_ranges)
                else:
                    raise NotImplementedError

        new_map = nodes.Map(prefix + '_' + map_entry.map.label, new_param,
                            subsets.Range(new_range))
        new_map_entry = nodes.MapEntry(new_map)
        new_exit = nodes.MapExit(new_map)

        # Make internal map's schedule to "not parallel"
        map_entry.map._schedule = dtypes.ScheduleType.Default

        # Redirect/create edges.
        new_in_edges = {}
        for _src, conn, _dest, _, memlet in graph.out_edges(map_entry):
            if not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar):
                new_subset = copy.deepcopy(memlet.subset)
                # new_subset = calc_set_image(map_entry.map.params,
                #                             map_entry.map.range,
                #                             memlet.subset, cont_or_strided)
                if memlet.data in new_in_edges:
                    src, src_conn, dest, dest_conn, new_memlet, num = \
                        new_in_edges[memlet.data]
                    new_memlet.subset = calc_set_union(
                        new_memlet.data, sdfg.arrays[new_memlet.data],
                        new_memlet.subset, new_subset)
                    new_memlet.num_accesses = new_memlet.num_elements()
                    new_in_edges.update({
                        memlet.data: (src, src_conn, dest, dest_conn,
                                      new_memlet, min(num, int(conn[4:])))
                    })
                else:
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    new_memlet.num_accesses = new_memlet.num_elements()
                    new_in_edges.update({
                        memlet.data: (new_map_entry, None, map_entry, None,
                                      new_memlet, int(conn[4:]))
                    })
        nxutil.change_edge_dest(graph, map_entry, new_map_entry)

        new_out_edges = {}
        for _src, conn, _dest, _, memlet in graph.in_edges(map_exit):
            if not isinstance(sdfg.arrays[memlet.data], dace.data.Scalar):
                new_subset = memlet.subset
                # new_subset = calc_set_image(map_entry.map.params,
                #                             map_entry.map.range,
                #                             memlet.subset, cont_or_strided)
                if memlet.data in new_out_edges:
                    src, src_conn, dest, dest_conn, new_memlet, num = \
                        new_out_edges[memlet.data]
                    new_memlet.subset = calc_set_union(
                        new_memlet.data, sdfg.arrays[new_memlet.data],
                        new_memlet.subset, new_subset)
                    new_memlet.num_accesses = new_memlet.num_elements()
                    new_out_edges.update({
                        memlet.data: (src, src_conn, dest, dest_conn,
                                      new_memlet, min(num, int(conn[4:])))
                    })
                else:
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    new_memlet.num_accesses = new_memlet.num_elements()
                    new_out_edges.update({
                        memlet.data: (map_exit, None, new_exit, None,
                                      new_memlet, int(conn[4:]))
                    })
        nxutil.change_edge_src(graph, map_exit, new_exit)

        # Connector related work follows
        # 1. Dictionary 'old_connector_number': 'new_connector_number'
        # 2. New node in/out connectors
        # 3. New edges
        in_conn_nums = []
        for _, e in new_in_edges.items():
            _, _, _, _, _, num = e
            in_conn_nums.append(num)
        in_conn = {}
        for i, num in enumerate(in_conn_nums):
            in_conn.update({num: i + 1})

        entry_in_connectors = set()
        entry_out_connectors = set()
        for i in range(len(in_conn_nums)):
            entry_in_connectors.add('IN_' + str(i + 1))
            entry_out_connectors.add('OUT_' + str(i + 1))
        new_map_entry.in_connectors = entry_in_connectors
        new_map_entry.out_connectors = entry_out_connectors

        for _, e in new_in_edges.items():
            src, _, dst, _, memlet, num = e
            graph.add_edge(src, 'OUT_' + str(in_conn[num]), dst,
                           'IN_' + str(in_conn[num]), memlet)

        out_conn_nums = []
        for _, e in new_out_edges.items():
            _, _, dst, _, _, num = e
            if dst is not new_exit:
                continue
            out_conn_nums.append(num)
        out_conn = {}
        for i, num in enumerate(out_conn_nums):
            out_conn.update({num: i + 1})

        exit_in_connectors = set()
        exit_out_connectors = set()
        for i in range(len(out_conn_nums)):
            exit_in_connectors.add('IN_' + str(i + 1))
            exit_out_connectors.add('OUT_' + str(i + 1))
        new_exit.in_connectors = exit_in_connectors
        new_exit.out_connectors = exit_out_connectors

        for _, e in new_out_edges.items():
            src, _, dst, _, memlet, num = e
            graph.add_edge(src, 'OUT_' + str(out_conn[num]), dst,
                           'IN_' + str(out_conn[num]), memlet)

        # Return strip-mined dimension.
        return target_dim, new_dim, new_map

    @staticmethod
    def __modify_edges(sdfg, graph, candidate, target_dim, new_dim):
        map_entry = graph.nodes()[candidate[OrthogonalTiling._map_entry]]

        processed = []
        for src, _dest, memlet, _scope in nxutil.traverse_sdfg_scope(
                graph, map_entry, True):
            if memlet in processed:
                continue
            processed.append(memlet)

            # Corner cases
            if isinstance(sdfg.arrays[memlet.data], dace.data.Stream):
                continue
            if memlet.wcr is not None:
                memlet.num_accesses = 1
                continue

            for i, dim in enumerate(memlet.subset):
                if isinstance(dim, tuple):
                    dim = tuple(
                        symbolic.pystr_to_symbolic(d).subs(
                            symbolic.pystr_to_symbolic(target_dim),
                            symbolic.pystr_to_symbolic(
                                '%s + %s' % (str(new_dim), str(target_dim))))
                        for d in dim)
                else:
                    dim = symbolic.pystr_to_symbolic(dim).subs(
                        symbolic.pystr_to_symbolic(target_dim),
                        symbolic.pystr_to_symbolic(
                            '%s + %s' % (str(new_dim), str(target_dim))))

                memlet.subset[i] = dim
        return
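# Worked example (illustrative) of the strip-mining arithmetic used in
# __stripmine above, for a dimension i in [0, 1023] with tile_size = 128 and
# divides_evenly = True:
td_from, td_to, tile_size = 0, 1023, 128
tile_num = (td_to + 1 - td_from) // tile_size        # 8 tiles
outer_range = (0, tile_num - 1, 1)                   # tile_i in [0, 7]
inner_range = (td_from, td_from + tile_size - 1, 1)  # i in [0, 127]
# Every memlet subset inside the map is then shifted by `tile_i * tile_size`.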
def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] expr_index = self.expr_index graph = sdfg.nodes()[self.state_id] tasklet = gnode(MapReduceFusion._tasklet) tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]] if expr_index == 0: # Reduce without outer map rmap_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_entry]] elif expr_index == 1: # Reduce with outer map rmap_out_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_out_entry]] rmap_out_exit = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_out_exit]] rmap_in_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_entry]] rmap_tasklet = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_tasklet]] if expr_index == 2: rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._reduce]] else: rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._rmap_in_cr]] out_array = gnode(MapReduceFusion._out_array) # Set nodes to remove according to the expression index nodes_to_remove = [in_array] if expr_index == 0: nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry)) elif expr_index == 1: nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_entry)) nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry)) nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_exit)) else: nodes_to_remove.append(gnode(MapReduceFusion._reduce)) # If no other edges lead to mapexit, remove it. Otherwise, keep # it and remove reduction incoming/outgoing edges if expr_index != 2 and len(graph.in_edges(tmap_exit)) == 1: nodes_to_remove.append(tmap_exit) memlet_edge = None for edge in graph.in_edges(tmap_exit): if edge.data.data == in_array.data: memlet_edge = edge break if memlet_edge is None: raise RuntimeError('Reduction memlet cannot be None') if expr_index == 0: # Reduce without outer map # Index order does not matter, merge as-is pass elif expr_index == 1: # Reduce with outer map tmap = tmap_exit.map perm_outer, perm_inner = MapReduceFusion.find_permutation( tmap, rmap_out_entry.map, rmap_in_entry.map, memlet_edge.data) # Split tasklet map into tmap_out -> tmap_in (according to # reduction) omap = nodes.Map( tmap.label + '_nonreduce', [p for i, p in enumerate(tmap.params) if i in perm_outer], [r for i, r in enumerate(tmap.range) if i in perm_outer], tmap.schedule, tmap.unroll, tmap.is_async) tmap.params = [ p for i, p in enumerate(tmap.params) if i in perm_inner ] tmap.range = [ r for i, r in enumerate(tmap.range) if i in perm_inner ] omap_entry = nodes.MapEntry(omap) omap_exit = rmap_out_exit rmap_out_exit.map = omap # Reconnect graph to new map tmap_entry = graph.entry_node(tmap_exit) tmap_in_edges = list(graph.in_edges(tmap_entry)) for e in tmap_in_edges: nxutil.change_edge_dest(graph, tmap_entry, omap_entry) for e in tmap_in_edges: graph.add_edge(omap_entry, e.src_conn, tmap_entry, e.dst_conn, copy.copy(e.data)) elif expr_index == 2: # Reduce node # Find correspondence between map indices and array outputs tmap = tmap_exit.map perm = MapReduceFusion.find_permutation_reduce( tmap, rmap_cr, graph, memlet_edge.data) output_subset = [tmap.params[d] for d in perm] if len(output_subset) == 0: # Output is a scalar output_subset = [0] array_edge = graph.out_edges(rmap_cr)[0] # Delete relevant edges and nodes graph.remove_edge(memlet_edge) graph.remove_nodes_from(nodes_to_remove) # Add new edges and nodes # From tasklet to map exit graph.add_edge( memlet_edge.src, memlet_edge.src_conn, memlet_edge.dst, 
memlet_edge.dst_conn, Memlet(out_array.data, memlet_edge.data.num_accesses, subsets.Indices(output_subset), memlet_edge.data.veclen, rmap_cr.wcr, rmap_cr.identity)) # From map exit to output array graph.add_edge( memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst, array_edge.dst_conn, Memlet(array_edge.data.data, array_edge.data.num_accesses, array_edge.data.subset, array_edge.data.veclen, rmap_cr.wcr, rmap_cr.identity)) return # Remove tmp array node prior to the others, so that a new one # can be created in its stead (see below) graph.remove_node(nodes_to_remove[0]) nodes_to_remove = nodes_to_remove[1:] # Create tasklet -> tmp -> tasklet connection tmp = graph.add_array( 'tmp', memlet_edge.data.subset.bounding_box_size(), sdfg.arrays[memlet_edge.data.data].dtype, transient=True) tasklet_tmp_memlet = copy.deepcopy(memlet_edge.data) tasklet_tmp_memlet.data = tmp.data tasklet_tmp_memlet.subset = ShapeProperty.to_string(tmp.shape) # Modify memlet to point to output array memlet_edge.data.data = out_array.data # Recover reduction axes from CR reduce subset reduce_cr_subset = graph.in_edges(rmap_tasklet)[0].data.subset reduce_axes = [] for ind, crvar in enumerate(reduce_cr_subset.indices): if '__i' in str(crvar): reduce_axes.append(ind) # Modify memlet access index by filtering out reduction axes if True: # expr_index == 0: newindices = [] for ind, ovar in enumerate(memlet_edge.data.subset.indices): if ind not in reduce_axes: newindices.append(ovar) if len(newindices) == 0: newindices = [0] memlet_edge.data.subset = subsets.Indices(newindices) graph.remove_edge(memlet_edge) graph.add_edge(memlet_edge.src, memlet_edge.src_conn, tmp, memlet_edge.dst_conn, tasklet_tmp_memlet) red_edges = list(graph.in_edges(rmap_tasklet)) if len(red_edges) != 1: raise RuntimeError('CR edge must be unique') tmp_tasklet_memlet = copy.deepcopy(tasklet_tmp_memlet) graph.add_edge(tmp, None, rmap_tasklet, red_edges[0].dst_conn, tmp_tasklet_memlet) for e in graph.edges_between(rmap_tasklet, rmap_cr): e.data.subset = memlet_edge.data.subset # Move output edges to point directly to CR node if expr_index == 1: # Set output memlet between CR node and outer reduction map to # contain the same subset as the one pointing to the CR node for e in graph.out_edges(rmap_cr): e.data.subset = memlet_edge.data.subset rmap_out = gnode(MapReduceFusion._rmap_out_exit) nxutil.change_edge_src(graph, rmap_out, omap_exit) # Remove nodes graph.remove_nodes_from(nodes_to_remove) # For unrelated outputs, connect original output to rmap_out if expr_index == 1 and tmap_exit not in nodes_to_remove: other_out_edges = list(graph.out_edges(tmap_exit)) for e in other_out_edges: graph.remove_edge(e) graph.add_edge(e.src, e.src_conn, omap_exit, None, e.data) graph.add_edge(omap_exit, None, e.dst, e.dst_conn, copy.copy(e.data))
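# Illustrative sketch (hedged) of the pattern this transformation targets: a
# map that writes a temporary, followed by a reduction of that temporary.
# After MapReduceFusion the intermediate array and the separate reduction are
# removed and the write-conflict resolution is attached to the map output.
# Assumes `dace.reduce` and `dace.define_local` behave as in the standard
# DaCe API; names (`mapreduce`, N) are examples only.
import dace

N = dace.symbol('N')

@dace.program
def mapreduce(A: dace.float64[N, N], out: dace.float64[N]):
    tmp = dace.define_local([N, N], dace.float64)
    for i, j in dace.map[0:N, 0:N]:
        tmp[i, j] = A[i, j] * A[i, j]
    dace.reduce(lambda a, b: a + b, tmp, out, axis=1, identity=0)

mr_sdfg = mapreduce.to_sdfg()
mr_sdfg.apply_transformations(MapReduceFusion)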
class SubArray(object): """ Sub-arrays describe subsets of Arrays (see `dace::data::Array`) for purposes of distributed communication. They are implemented with [MPI_Type_create_subarray](https://www.mpich.org/static/docs/v3.2/www3/MPI_Type_create_subarray.html). Sub-arrays can be also used for collective scatter/gather communication in a process-grid. The `shape`, `subshape`, and `dtype` properties correspond to the `array_of_sizes`, `array_of_subsizes`, and `oldtype` parameters of `MPI_Type_create_subarray`. The following properties are used for collective scatter/gather communication in a process-grid: The `pgrid` property is the name of the process-grid where the data will be distributed. The `correspondence` property matches the arrays dimensions to the process-grid's dimensions. For example, if one wants to distribute a matrix to a 2D process-grid, but tile the matrix rows over the grid's columns, then `correspondence = [1, 0]`. """ name = Property(dtype=str, desc="The type's name.") dtype = TypeClassProperty(default=dtypes.int32, choices=dtypes.Typeclasses) shape = ShapeProperty(default=[], desc="The array's shape.") subshape = ShapeProperty(default=[], desc="The sub-array's shape.") pgrid = Property( dtype=str, allow_none=True, default=None, desc="Name of the process-grid where the data are distributed.") correspondence = ListProperty( int, allow_none=True, default=None, desc="Correspondence of the array's indices to the process grid's " "indices.") def __init__(self, name: str, dtype: dtypes.typeclass, shape: ShapeType, subshape: ShapeType, pgrid: str = None, correspondence: Sequence[Integral] = None): self.name = name self.dtype = dtype self.shape = shape self.subshape = subshape self.pgrid = pgrid self.correspondence = correspondence or list(range(len(shape))) self._validate() def validate(self): """ Validate the correctness of this object. Raises an exception on error. """ self._validate() # Validation of this class is in a separate function, so that this # class can call `_validate()` without calling the subclasses' # `validate` function. def _validate(self): if any(not isinstance(s, (Integral, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape): raise TypeError( 'Shape must be a list or tuple of integer values or symbols') if any(not isinstance(s, (Integral, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.subshape): raise TypeError( 'Sub-shape must be a list or tuple of integer values or symbols' ) if any(not isinstance(i, Integral) for i in self.correspondence): raise TypeError( 'Correspondence must be a list or tuple of integer values') if len(self.shape) != len(self.subshape): raise ValueError( 'The dimensionality of the shape and sub-shape must match') if len(self.correspondence) != len(self.shape): raise ValueError( 'The dimensionality of the shape and correspondence list must match' ) return True def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} return retdict @classmethod def from_json(cls, json_obj, context=None): # Create dummy object ret = cls('tmp', dtypes.int8, [], [], 'tmp', []) serialize.set_properties_from_json(ret, json_obj, context=context) # Check validity now ret.validate() return ret def init_code(self): """ Outputs MPI allocation/initialization code for the sub-array MPI datatype ONLY if the process-grid is set. 
It is assumed that the following variables exist in the SDFG program's state: - MPI_Datatype {self.name} - int* {self.name}_counts - int* {self.name}_displs These variables are typically added to the program's state through a Tasklet, e.g., the Dummy MPI node (for more details, check the DaCe MPI library in `dace/libraries/mpi`). """ from dace.libraries.mpi import utils if self.pgrid: return f""" if (__state->{self.pgrid}_valid) {{ int sizes[{len(self.shape)}] = {{{', '.join([str(s) for s in self.shape])}}}; int subsizes[{len(self.shape)}] = {{{', '.join([str(s) for s in self.subshape])}}}; int corr[{len(self.shape)}] = {{{', '.join([str(i) for i in self.correspondence])}}}; int basic_stride = subsizes[{len(self.shape)} - 1]; int process_strides[{len(self.shape)}]; int block_strides[{len(self.shape)}]; int data_strides[{len(self.shape)}]; process_strides[{len(self.shape)} - 1] = 1; block_strides[{len(self.shape)} - 1] = subsizes[{len(self.shape)} - 1]; data_strides[{len(self.shape)} - 1] = 1; for (auto i = {len(self.shape)} - 2; i >= 0; --i) {{ block_strides[i] = block_strides[i+1] * subsizes[i]; process_strides[i] = process_strides[i+1] * __state->{self.pgrid}_dims[corr[i+1]]; data_strides[i] = block_strides[i] * process_strides[i] / basic_stride; }} MPI_Datatype type; int origin[{len(self.shape)}] = {{{','.join(['0'] * len(self.shape))}}}; MPI_Type_create_subarray({len(self.shape)}, sizes, subsizes, origin, MPI_ORDER_C, {utils.MPI_DDT(self.dtype.base_type)}, &type); MPI_Type_create_resized(type, 0, basic_stride*sizeof({self.dtype.ctype}), &__state->{self.name}); MPI_Type_commit(&__state->{self.name}); __state->{self.name}_counts = new int[__state->{self.pgrid}_size]; __state->{self.name}_displs = new int[__state->{self.pgrid}_size]; int block_id[{len(self.shape)}] = {{0}}; int displ = 0; for (auto i = 0; i < __state->{self.pgrid}_size; ++i) {{ __state->{self.name}_counts[i] = 1; __state->{self.name}_displs[i] = displ; int idx = {len(self.shape)} - 1; while (idx >= 0 && block_id[idx] + 1 >= __state->{self.pgrid}_dims[corr[idx]]) {{ block_id[idx] = 0; displ -= data_strides[idx] * (__state->{self.pgrid}_dims[corr[idx]] - 1); idx--; }} if (idx >= 0) {{ block_id[idx] += 1; displ += data_strides[idx]; }} else {{ assert(i == __state->{self.pgrid}_size - 1); }} }} }} """ else: return "" def exit_code(self): """ Outputs MPI deallocation code for the sub-array MPI datatype ONLY if the process-grid is set. """ if self.pgrid: return f""" if (__state->{self.pgrid}_valid) {{ delete[] __state->{self.name}_counts; delete[] __state->{self.name}_displs; MPI_Type_free(&__state->{self.name}); }} """ else: return ""
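# Illustrative sketch (hedged): describing the distribution of an 8x8 matrix
# over a 2x2 process grid named 'grid' (assumed to be defined elsewhere in
# the SDFG), so that each rank owns a 4x4 block. `correspondence` maps array
# dimensions to grid dimensions one-to-one here.
import dace

subarr = SubArray('subA', dace.int32,
                  shape=[8, 8], subshape=[4, 4],
                  pgrid='grid', correspondence=[0, 1])
print(subarr.init_code())  # emits the MPI datatype setup shown above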
class MapTiling(transformation.SingleStateTransformation):
    """ Implements the orthogonal tiling transformation.

        Orthogonal tiling is a type of nested map fission that creates tiles
        in every dimension of the matched Map.
    """

    map_entry = transformation.PatternNode(nodes.MapEntry)

    # Properties
    prefix = Property(dtype=str,
                      default="tile",
                      desc="Prefix for new range symbols")
    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")
    strides = ShapeProperty(
        dtype=tuple,
        default=tuple(),
        desc="Tile stride (enables overlapping tiles). If empty, matches tile")
    tile_offset = ShapeProperty(dtype=tuple,
                                default=None,
                                desc="Negative Stride offset per dimension",
                                allow_none=True)
    divides_evenly = Property(dtype=bool,
                              default=False,
                              desc="Tile size divides dimension length evenly")
    tile_trivial = Property(dtype=bool,
                            default=False,
                            desc="Tiles even if tile_size is 1")

    @staticmethod
    def annotates_memlets():
        return True

    @classmethod
    def expressions(cls):
        return [sdutil.node_path_graph(cls.map_entry)]

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        return True

    def apply(self, graph: SDFGState, sdfg: SDFG):
        tile_strides = self.tile_sizes
        if self.strides is not None and len(self.strides) == len(tile_strides):
            tile_strides = self.strides

        # Retrieve map entry and exit nodes.
        map_entry = self.map_entry

        from dace.transformation.dataflow.map_collapse import MapCollapse
        from dace.transformation.dataflow.strip_mining import StripMining
        stripmine_subgraph = {
            StripMining.map_entry: self.subgraph[MapTiling.map_entry]
        }
        sdfg_id = sdfg.sdfg_id
        last_map_entry = None
        removed_maps = 0

        original_schedule = map_entry.schedule

        for dim_idx in range(len(map_entry.map.params)):
            if dim_idx >= len(self.tile_sizes):
                tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[-1])
                tile_stride = symbolic.pystr_to_symbolic(tile_strides[-1])
            else:
                tile_size = symbolic.pystr_to_symbolic(
                    self.tile_sizes[dim_idx])
                tile_stride = symbolic.pystr_to_symbolic(tile_strides[dim_idx])

            # handle offsets
            if self.tile_offset and dim_idx >= len(self.tile_offset):
                offset = self.tile_offset[-1]
            elif self.tile_offset:
                offset = self.tile_offset[dim_idx]
            else:
                offset = 0

            dim_idx -= removed_maps
            # If tile size is trivial, skip strip-mining map dimension
            if tile_size == map_entry.map.range.size()[dim_idx]:
                continue

            stripmine = StripMining(sdfg, sdfg_id, self.state_id,
                                    stripmine_subgraph, self.expr_index)

            # Special case: Tile size of 1 should be omitted from inner map
            if tile_size == 1 and tile_stride == 1 and self.tile_trivial == False:
                stripmine.dim_idx = dim_idx
                stripmine.new_dim_prefix = ''
                stripmine.tile_size = str(tile_size)
                stripmine.tile_stride = str(tile_stride)
                stripmine.divides_evenly = True
                stripmine.tile_offset = str(offset)
                stripmine.apply(graph, sdfg)
                removed_maps += 1
            else:
                stripmine.dim_idx = dim_idx
                stripmine.new_dim_prefix = self.prefix
                stripmine.tile_size = str(tile_size)
                stripmine.tile_stride = str(tile_stride)
                stripmine.divides_evenly = self.divides_evenly
                stripmine.tile_offset = str(offset)
                stripmine.apply(graph, sdfg)

            # apply to the new map the schedule of the original one
            map_entry.schedule = original_schedule

            if last_map_entry:
                new_map_entry = graph.in_edges(map_entry)[0].src
                mapcollapse_subgraph = {
                    MapCollapse.outer_map_entry: graph.node_id(last_map_entry),
                    MapCollapse.inner_map_entry: graph.node_id(new_map_entry)
                }
                mapcollapse = MapCollapse(sdfg, sdfg_id, self.state_id,
                                          mapcollapse_subgraph, 0)
                mapcollapse.apply(graph, sdfg)
            last_map_entry = graph.in_edges(map_entry)[0].src
        return last_map_entry
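# Usage sketch (illustrative, hedged): tiling the maps of an SDFG with 64x64
# tiles; leaving `strides` empty makes the tile stride equal the tile size
# (non-overlapping tiles). Reuses the `sdfg` from the first sketch above.
sdfg.apply_transformations(MapTiling, options=dict(tile_sizes=(64, 64)))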
class Array(Data): """ Array data descriptor. This object represents a multi-dimensional data container in SDFGs that can be accessed and modified. The definition does not contain the actual array, but rather a description of how to construct it and how it should behave. The array definition is flexible in terms of data allocation, it allows arbitrary multidimensional, potentially symbolic shapes (e.g., an array with size ``N+1 x M`` will have ``shape=(N+1, M)``), of arbitrary data typeclasses (``dtype``). The physical data layout of the array is controlled by several properties: * The ``strides`` property determines the ordering and layout of the dimensions --- it specifies how many elements in memory are skipped whenever one element in that dimension is advanced. For example, the contiguous dimension always has a stride of ``1``; a C-style MxN array will have strides ``(N, 1)``, whereas a FORTRAN-style array of the same size will have ``(1, M)``. Strides can be larger than the shape, which allows post-padding of the contents of each dimension. * The ``start_offset`` property is a number of elements to pad the beginning of the memory buffer with. This is used to ensure that a specific index is aligned as a form of pre-padding (that element may not necessarily be the first element, e.g., in the case of halo or "ghost cells" in stencils). * The ``total_size`` property determines how large the total allocation size is. Normally, it is the product of the ``shape`` elements, but if pre- or post-padding is involved it may be larger. * ``alignment`` provides alignment guarantees (in bytes) of the first element in the allocated array. This is used by allocators in the code generator to ensure certain addresses are expected to be aligned, e.g., for vectorization. * Lastly, a property called ``offset`` controls the logical access of the array, i.e., what would be the first element's index after padding and alignment. This mimics a language feature prominent in scientific languages such as FORTRAN, where one could set an array to begin with 1, or any arbitrary index. By default this is set to zero. To summarize with an example, a two-dimensional array with pre- and post-padding looks as follows: .. code-block:: text [xxx][ |xx] [ |xx] [ |xx] [ |xx] --------------- [xxxxxxxxxxxxx] shape = (4, 10) strides = (12, 1) start_offset = 3 total_size = 63 [= 3 + 12 * 5] offset = (0, 0, 0) Notice that the last padded row does not appear in strides, but is a consequence of ``total_size`` being larger. Apart from memory layout, other properties of ``Array`` help the data-centric transformation infrastructure make decisions about the array. ``allow_conflicts`` states that warnings should not be printed if potential conflicted acceses (e.g., data races) occur. ``may_alias`` inhibits transformations that may assume that this array does not overlap with other arrays in the same context (e.g., function). """ # Properties allow_conflicts = Property(dtype=bool, default=False, desc='If enabled, allows more than one ' 'memlet to write to the same memory location without conflict ' 'resolution.') strides = ShapeProperty( # element_type=symbolic.pystr_to_symbolic, desc='For each dimension, the number of elements to ' 'skip in order to obtain the next element in ' 'that dimension.') total_size = SymbolicProperty(default=0, desc='The total allocated size of the array. 
Can be used for padding.') offset = ShapeProperty(desc='Initial offset to translate all indices by.') may_alias = Property(dtype=bool, default=False, desc='This pointer may alias with other pointers in the same function') alignment = Property(dtype=int, default=0, desc='Allocation alignment in bytes (0 uses compiler-default)') start_offset = Property(dtype=int, default=0, desc='Allocation offset elements for manual alignment (pre-padding)') def __init__(self, dtype, shape, transient=False, allow_conflicts=False, storage=dtypes.StorageType.Default, location=None, strides=None, offset=None, may_alias=False, lifetime=dtypes.AllocationLifetime.Scope, alignment=0, debuginfo=None, total_size=None, start_offset=None): super(Array, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) if shape is None: raise IndexError('Shape must not be None') self.allow_conflicts = allow_conflicts self.may_alias = may_alias self.alignment = alignment if start_offset is not None: self.start_offset = start_offset if strides is not None: self.strides = cp.copy(strides) else: self.strides = [_prod(shape[i + 1:]) for i in range(len(shape))] if strides is not None and shape is not None and total_size is None: # Compute the minimal total_size that could be used with strides and shape self.total_size = sum(((shp - 1) * s for shp, s in zip(shape, strides))) + 1 else: self.total_size = total_size or _prod(shape) if offset is not None: self.offset = cp.copy(offset) else: self.offset = [0] * len(shape) self.validate() def __repr__(self): return '%s (dtype=%s, shape=%s)' % (type(self).__name__, self.dtype, self.shape) def clone(self): return type(self)(self.dtype, self.shape, self.transient, self.allow_conflicts, self.storage, self.location, self.strides, self.offset, self.may_alias, self.lifetime, self.alignment, self.debuginfo, self.total_size, self.start_offset) def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} return retdict @classmethod def from_json(cls, json_obj, context=None): # Create dummy object ret = cls(dtypes.int8, ()) serialize.set_properties_from_json(ret, json_obj, context=context) # Default shape-related properties if not ret.offset: ret.offset = [0] * len(ret.shape) if not ret.strides: # Default strides are C-ordered ret.strides = [_prod(ret.shape[i + 1:]) for i in range(len(ret.shape))] if ret.total_size == 0: ret.total_size = _prod(ret.shape) # Check validity now ret.validate() return ret def validate(self): super(Array, self).validate() if len(self.strides) != len(self.shape): raise TypeError('Strides must be the same size as shape') if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.strides): raise TypeError('Strides must be a list or tuple of integer ' 'values or symbols') if len(self.offset) != len(self.shape): raise TypeError('Offset must be the same size as shape') def covers_range(self, rng): if len(rng) != len(self.shape): return False for s, (rb, re, rs) in zip(self.shape, rng): # Shape has to be positive if isinstance(s, sp.Basic): olds = s if 'positive' in s.assumptions0: s = sp.Symbol(str(s), **s.assumptions0) else: s = sp.Symbol(str(s), positive=True, **s.assumptions0) if isinstance(rb, sp.Basic): rb = rb.subs({olds: s}) if isinstance(re, sp.Basic): re = re.subs({olds: s}) if isinstance(rs, sp.Basic): rs = rs.subs({olds: s}) try: if rb < 0: # Negative offset return False except TypeError: # cannot determine truth value of Relational pass 
#print('WARNING: Cannot evaluate relational expression %s, assuming true.' % (rb > 0), # 'If this expression is false, please refine symbol definitions in the program.') try: if re > s: # Beyond shape return False except TypeError: # cannot determine truth value of Relational pass #print('WARNING: Cannot evaluate relational expression %s, assuming true.' % (re < s), # 'If this expression is false, please refine symbol definitions in the program.') return True # Checks for equivalent shape and type def is_equivalent(self, other): if not isinstance(other, Array): return False # Test type if self.dtype != other.dtype: return False # Test dimensionality if len(self.shape) != len(other.shape): return False # Test shape for dim, otherdim in zip(self.shape, other.shape): # Any other case (constant vs. constant), check for equality if otherdim != dim: return False return True def as_arg(self, with_types=True, for_call=False, name=None): arrname = name if not with_types or for_call: return arrname if self.may_alias: return str(self.dtype.ctype) + ' *' + arrname return str(self.dtype.ctype) + ' * __restrict__ ' + arrname def sizes(self): return [d.name if isinstance(d, symbolic.symbol) else str(d) for d in self.shape] @property def free_symbols(self): result = super().free_symbols for s in self.strides: if isinstance(s, sp.Expr): result |= set(s.free_symbols) if isinstance(self.total_size, sp.Expr): result |= set(self.total_size.free_symbols) for o in self.offset: if isinstance(o, sp.Expr): result |= set(o.free_symbols) return result
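# Numeric check (illustrative, hedged) of the padded-layout example in the
# Array docstring above: shape (4, 10), a row stride of 12 (two elements of
# post-padding per row), three elements of pre-padding, and one fully padded
# trailing row, so total_size = 3 + 12 * (4 + 1) = 63.
import dace
from dace import data

padded = data.Array(dace.float64, (4, 10),
                    strides=(12, 1), start_offset=3, total_size=63)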
class Data(object): """ Data type descriptors that can be used as references to memory. Examples: Arrays, Streams, custom arrays (e.g., sparse matrices). """ dtype = TypeClassProperty(default=dtypes.int32, choices=dtypes.Typeclasses) shape = ShapeProperty(default=[]) transient = Property(dtype=bool, default=False) storage = EnumProperty(dtype=dtypes.StorageType, desc="Storage location", default=dtypes.StorageType.Default) lifetime = EnumProperty(dtype=dtypes.AllocationLifetime, desc='Data allocation span', default=dtypes.AllocationLifetime.Scope) location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)') debuginfo = DebugInfoProperty(allow_none=True) def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo): self.dtype = dtype self.shape = shape self.transient = transient self.storage = storage self.location = location if location is not None else {} self.lifetime = lifetime self.debuginfo = debuginfo self._validate() def validate(self): """ Validate the correctness of this object. Raises an exception on error. """ self._validate() # Validation of this class is in a separate function, so that this # class can call `_validate()` without calling the subclasses' # `validate` function. def _validate(self): if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape): raise TypeError('Shape must be a list or tuple of integer values ' 'or symbols') return True def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} return retdict @property def toplevel(self): return self.lifetime is not dtypes.AllocationLifetime.Scope def copy(self): raise RuntimeError('Data descriptors are unique and should not be copied') def is_equivalent(self, other): """ Check for equivalence (shape and type) of two data descriptors. """ raise NotImplementedError def as_arg(self, with_types=True, for_call=False, name=None): """Returns a string for a C++ function signature (e.g., `int *A`). """ raise NotImplementedError @property def free_symbols(self) -> Set[symbolic.SymbolicType]: """ Returns a set of undefined symbols in this data descriptor. """ result = set() for s in self.shape: if isinstance(s, sp.Basic): result |= set(s.free_symbols) return result def __repr__(self): return 'Abstract Data Container, DO NOT USE' @property def veclen(self): return self.dtype.veclen if hasattr(self.dtype, "veclen") else 1 @property def ctype(self): return self.dtype.ctype def strides_from_layout( self, *dimensions: int, alignment: symbolic.SymbolicType = 1, only_first_aligned: bool = False, ) -> Tuple[Tuple[symbolic.SymbolicType], symbolic.SymbolicType]: """ Returns the absolute strides and total size of this data descriptor, according to the given dimension ordering and alignment. :param dimensions: A sequence of integers representing a permutation of the descriptor's dimensions. :param alignment: Padding (in elements) at the end, ensuring stride is a multiple of this number. 1 (default) means no padding. :param only_first_aligned: If True, only the first dimension is padded with ``alignment``. Otherwise all dimensions are. :return: A 2-tuple of (tuple of strides, total size). 
""" # Verify dimensions if tuple(sorted(dimensions)) != tuple(range(len(self.shape))): raise ValueError('Every dimension must be given and appear once.') if (alignment < 1) == True or (alignment < 0) == True: raise ValueError('Invalid alignment value') strides = [1] * len(dimensions) total_size = 1 first = True for dim in dimensions: strides[dim] = total_size if not only_first_aligned or first: dimsize = (((self.shape[dim] + alignment - 1) // alignment) * alignment) else: dimsize = self.shape[dim] total_size *= dimsize first = False return (tuple(strides), total_size) def set_strides_from_layout(self, *dimensions: int, alignment: symbolic.SymbolicType = 1, only_first_aligned: bool = False): """ Sets the absolute strides and total size of this data descriptor, according to the given dimension ordering and alignment. :param dimensions: A sequence of integers representing a permutation of the descriptor's dimensions. :param alignment: Padding (in elements) at the end, ensuring stride is a multiple of this number. 1 (default) means no padding. :param only_first_aligned: If True, only the first dimension is padded with ``alignment``. Otherwise all dimensions are. """ strides, totalsize = self.strides_from_layout(*dimensions, alignment=alignment, only_first_aligned=only_first_aligned) self.strides = strides self.total_size = totalsize
class Data(object):
    """ Data type descriptors that can be used as references to memory.
        Examples: Arrays, Streams, custom arrays (e.g., sparse matrices).
    """

    dtype = TypeClassProperty()
    shape = ShapeProperty()
    transient = Property(dtype=bool)
    storage = Property(dtype=dace.types.StorageType,
                       desc="Storage location",
                       enum=dace.types.StorageType,
                       default=dace.types.StorageType.Default,
                       from_string=lambda x: types.StorageType[x])
    location = Property(
        dtype=str,  # Dict[str, symbolic]
        desc='Full storage location identifier (e.g., rank, GPU ID)',
        default='')
    toplevel = Property(dtype=bool,
                        desc="Allocate array outside of state",
                        default=False)
    debuginfo = DebugInfoProperty()

    def __init__(self, dtype, shape, transient, storage, location, toplevel,
                 debuginfo):
        self.dtype = dtype
        self.shape = shape
        self.transient = transient
        self.storage = storage
        self.location = location
        self.toplevel = toplevel
        self.debuginfo = debuginfo
        self._validate()

    def validate(self):
        """ Validate the correctness of this object.
            Raises an exception on error. """
        self._validate()

    # Validation of this class is in a separate function, so that this
    # class can call `_validate()` without calling the subclasses'
    # `validate` function.
    def _validate(self):
        if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol,
                                  symbolic.sympy.Basic)) for s in self.shape):
            raise TypeError('Shape must be a list or tuple of integer values '
                            'or symbols')
        return True

    def copy(self):
        raise RuntimeError(
            'Data descriptors are unique and should not be copied')

    def is_equivalent(self, other):
        """ Check for equivalence (shape and type) of two data descriptors. """
        raise NotImplementedError

    def signature(self, with_types=True, for_call=False, name=None):
        """ Returns a string for a C++ function signature (e.g., `int *A`). """
        raise NotImplementedError

    def __repr__(self):
        return 'Abstract Data Container, DO NOT USE'
class CompositeFusion(transformation.SubgraphTransformation): """ MultiExpansion + SubgraphFusion in one Transformation Additional StencilTiling is also possible as a canonicalizing transformation before fusion. """ debug = Property(desc="Debug mode", dtype=bool, default=False) allow_expansion = Property(desc="Allow MultiExpansion first", dtype=bool, default=True) allow_tiling = Property(desc="Allow StencilTiling (after MultiExpansion)", dtype=bool, default=False) transient_allocation = EnumProperty( desc="Storage Location to push transients to that are " "fully contained within the subgraph.", dtype=dtypes.StorageType, default=dtypes.StorageType.Default) schedule_innermaps = Property(desc="Schedule of inner fused maps", dtype=dtypes.ScheduleType, default=None, allow_none=True) stencil_unroll_loops = Property( desc="Unroll inner stencil loops if they have size > 1", dtype=bool, default=False) stencil_strides = ShapeProperty(dtype=tuple, default=(1, ), desc="Stencil tile stride") expansion_split = Property( desc="Allow MultiExpansion to split up maps, if enabled", dtype=bool, default=True) def can_be_applied(self, sdfg: SDFG, subgraph: SubgraphView) -> bool: graph = subgraph.graph if self.allow_expansion == True: subgraph_fusion = SubgraphFusion(subgraph) if subgraph_fusion.can_be_applied(sdfg, subgraph): # try w/o copy first return True expansion = MultiExpansion(subgraph) expansion.permutation_only = not self.expansion_split if expansion.can_be_applied(sdfg, subgraph): # deepcopy graph_indices = [ i for (i, n) in enumerate(graph.nodes()) if n in subgraph ] sdfg_copy = SDFG.from_json(sdfg.to_json()) graph_copy = sdfg_copy.nodes()[sdfg.nodes().index(graph)] subgraph_copy = SubgraphView( graph_copy, [graph_copy.nodes()[i] for i in graph_indices]) ##sdfg_copy.apply_transformations(MultiExpansion, states=[graph]) #expansion = MultiExpansion(subgraph_copy) expansion.apply(sdfg_copy) subgraph_fusion = SubgraphFusion(subgraph_copy) if subgraph_fusion.can_be_applied(sdfg_copy, subgraph_copy): return True stencil_tiling = StencilTiling(subgraph_copy) if self.allow_tiling and stencil_tiling.can_be_applied( sdfg_copy, subgraph_copy): return True else: subgraph_fusion = SubgraphFusion(subgraph) if subgraph_fusion.can_be_applied(sdfg, subgraph): return True if self.allow_tiling == True: stencil_tiling = StencilTiling(subgraph) if stencil_tiling.can_be_applied(sdfg, subgraph): return True return False def apply(self, sdfg): subgraph = self.subgraph_view(sdfg) graph = subgraph.graph scope_dict = graph.scope_dict() map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph, scope_dict) first_entry = next(iter(map_entries)) if self.allow_expansion: expansion = MultiExpansion(subgraph, self.sdfg_id, self.state_id) expansion.permutation_only = not self.expansion_split if expansion.can_be_applied(sdfg, subgraph): expansion.apply(sdfg) sf = SubgraphFusion(subgraph, self.sdfg_id, self.state_id) if sf.can_be_applied(sdfg, self.subgraph_view(sdfg)): # set SubgraphFusion properties sf.debug = self.debug sf.transient_allocation = self.transient_allocation sf.schedule_innermaps = self.schedule_innermaps sf.apply(sdfg) self._global_map_entry = sf._global_map_entry return elif self.allow_tiling == True: st = StencilTiling(subgraph, self.sdfg_id, self.state_id) if st.can_be_applied(sdfg, self.subgraph_view(sdfg)): # set StencilTiling properties st.debug = self.debug st.unroll_loops = self.stencil_unroll_loops st.strides = self.stencil_strides st.apply(sdfg) # StencilTiling: update nodes new_entries = 
st._outer_entries subgraph = helpers.subgraph_from_maps(sdfg, graph, new_entries) sf = SubgraphFusion(subgraph, self.sdfg_id, self.state_id) # set SubgraphFusion properties sf.debug = self.debug sf.transient_allocation = self.transient_allocation sf.schedule_innermaps = self.schedule_innermaps sf.apply(sdfg) self._global_map_entry = sf._global_map_entry return warnings.warn("CompositeFusion::Apply did not perform as expected")
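# Hedged usage sketch for CompositeFusion (added example), mirroring how the
# transformation drives its own building blocks above: construct it on a
# SubgraphView, query `can_be_applied`, then `apply`. `sdfg` is a placeholder
# for an existing SDFG, and fusing every node of its first state is an
# assumption made only for illustration; the SubgraphView import path is also
# assumed.
from dace.sdfg.graph import SubgraphView

state = sdfg.nodes()[0]                        # assumed: the state holding the maps
subgraph = SubgraphView(state, state.nodes())  # assumed: consider all nodes for fusion

cf = CompositeFusion(subgraph)
cf.allow_expansion = True   # let MultiExpansion run first if it enables fusion
cf.allow_tiling = False     # skip the StencilTiling canonicalization step
cf.transient_allocation = dtypes.StorageType.Register
if cf.can_be_applied(sdfg, subgraph):
    cf.apply(sdfg)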
class StencilTiling(transformation.SubgraphTransformation): """ Operates on the top-level maps of the given subgraph. Applies orthogonal tiling to each of the maps with the given strides and extends the newly created inner tiles to account for data dependencies due to stencil patterns. For each map, all outgoing memlets to an array must cover the memlets that are incoming into a following child map. All maps must have the same map parameters in the same order. """ # Properties debug = Property(desc="Debug mode", dtype=bool, default=False) prefix = Property(dtype=str, default="stencil", desc="Prefix for new inner tiled range symbols") strides = ShapeProperty(dtype=tuple, default=(1, ), desc="Tile stride") schedule = Property(dtype=dace.dtypes.ScheduleType, default=dace.dtypes.ScheduleType.Default, desc="Schedule of inner maps (dace.dtypes.ScheduleType)") unroll_loops = Property(desc="Unroll inner loops if they have size > 1", dtype=bool, default=False) @staticmethod def coverage_dicts(sdfg, graph, map_entry, outer_range=True): ''' Returns a tuple of two dicts: the first maps every data container entering the map to its associated access range, the second maps every data container exiting the map to its associated access range. If outer_range is True, the outer map ranges are substituted into the min/max of the inner access ranges. ''' map_exit = graph.exit_node(map_entry) map = map_entry.map entry_coverage = {} exit_coverage = {} # create dicts with which we can replace all iteration # variables by their ranges map_min = { dace.symbol(param): e for param, e in zip(map.params, map.range.min_element()) } map_max = { dace.symbol(param): e for param, e in zip(map.params, map.range.max_element()) } # look at inner memlets at map entry for e in graph.out_edges(map_entry): if outer_range: # get subset min_element = [ m.subs(map_min) for m in e.data.subset.min_element() ] max_element = [ m.subs(map_max) for m in e.data.subset.max_element() ] # create range rng = subsets.Range( (min_e, max_e, 1) for min_e, max_e in zip(min_element, max_element)) else: rng = dcpy(e.data.subset) if e.data.data not in entry_coverage: entry_coverage[e.data.data] = rng else: old_coverage = entry_coverage[e.data.data] entry_coverage[e.data.data] = subsets.union(old_coverage, rng) # look at inner memlets at map exit for e in graph.in_edges(map_exit): if outer_range: # get subset min_element = [ m.subs(map_min) for m in e.data.subset.min_element() ] max_element = [ m.subs(map_max) for m in e.data.subset.max_element() ] # create range rng = subsets.Range( (min_e, max_e, 1) for min_e, max_e in zip(min_element, max_element)) else: rng = dcpy(e.data.subset) if e.data.data not in exit_coverage: exit_coverage[e.data.data] = rng else: old_coverage = exit_coverage[e.data.data] exit_coverage[e.data.data] = subsets.union(old_coverage, rng) # return both coverages as a tuple return (entry_coverage, exit_coverage) @staticmethod def topology(sdfg, graph, map_entries): # build dicts of parents and children for each map_entry and collect # the sink maps, i.e. map entries whose output is not consumed by # another map in the subgraph sink_maps = set() children_dict = defaultdict(set) parent_dict = defaultdict(set) map_exits = {graph.exit_node(entry): entry for entry in map_entries} for map_entry in map_entries: map_exit = graph.exit_node(map_entry) for e in graph.in_edges(map_entry): if isinstance(e.src, nodes.AccessNode): for ie in graph.in_edges(e.src): if ie.src in map_exits: other_entry = map_exits[ie.src] children_dict[other_entry].add(map_entry)
parent_dict[map_entry].add(other_entry) out_counter = 0 for e in graph.out_edges(map_exit): if isinstance(e.dst, nodes.AccessNode): for oe in graph.out_edges(e.dst): if oe.dst in map_entries: other_entry = oe.dst children_dict[map_entry].add(other_entry) parent_dict[other_entry].add(map_entry) out_counter += 1 if out_counter == 0: sink_maps.add(map_entry) return (children_dict, parent_dict, sink_maps) @staticmethod def can_be_applied(sdfg, subgraph) -> bool: # get highest scope maps graph = subgraph.graph map_entries = set( helpers.get_outermost_scope_maps(sdfg, graph, subgraph)) # 1.1: There has to be more than one outermost scope map entry if len(map_entries) <= 1: return False # 1.2: check basic constraints: # - all parameters have to be the same (this implies same length) # - no parameter permutations here as ambiguity is very high then # - same strides everywhere first_map = next(iter(map_entries)) params = dcpy(first_map.map.params) strides = first_map.map.range.strides() schedule = first_map.map.schedule for map_entry in map_entries: if map_entry.map.params != params: return False if map_entry.map.range.strides() != strides: return False if map_entry.map.schedule != schedule: return False # 1.3: check whether all map entries only differ by a const amount first_entry = next(iter(map_entries)) for map_entry in map_entries: for r1, r2 in zip(map_entry.map.range, first_entry.map.range): if len((r1[0] - r2[0]).free_symbols) > 0: return False if len((r1[1] - r2[1]).free_symbols) > 0: return False # get intermediate_nodes, out_nodes from SubgraphFusion Transformation node_config = SubgraphFusion.get_adjacent_nodes( sdfg, graph, map_entries) (_, intermediate_nodes, out_nodes) = node_config # 1.4: check topological feasibility if not SubgraphFusion.check_topo_feasibility( sdfg, graph, map_entries, intermediate_nodes, out_nodes): return False # 1.5 nodes that are both intermediate and out nodes # are not supported in StencilTiling if len(intermediate_nodes & out_nodes) > 0: return False # get coverages for every map entry coverages = {} memlets = {} for map_entry in map_entries: coverages[map_entry] = StencilTiling.coverage_dicts( sdfg, graph, map_entry) memlets[map_entry] = StencilTiling.coverage_dicts( sdfg, graph, map_entry, outer_range=False) # get DAG neighbours for each map dag_neighbors = StencilTiling.topology(sdfg, graph, map_entries) (children_dict, _, sink_maps) = dag_neighbors # 1.6: we now check coverage: # each outgoing coverage for a data memlet has to # be exactly equal to the union of incoming coverages # of all chidlren map memlets of this data # important: # 1. it has to be equal and not only cover it in order to # account for ranges too long # 2. we check coverages by map parameter and not by # array, this way it is even more general # 3. 
map parameter coverages are checked for each # (map_entry, children of this map_entry) - pair for map_entry in map_entries: # get coverage from current map_entry map_coverage = coverages[map_entry][1] # final mapping map_parameter -> coverage will be stored here param_parent_coverage = {p: None for p in map_entry.params} param_children_coverage = {p: None for p in map_entry.params} for child_entry in children_dict[map_entry]: # get mapping data_name -> coverage for (data_name, cov) in map_coverage.items(): parent_coverage = cov children_coverage = None if data_name in coverages[child_entry][0]: children_coverage = subsets.union( children_coverage, coverages[child_entry][0][data_name]) # extend mapping map_parameter -> coverage # by the previous mapping for i, (p_subset, c_subset) in enumerate( zip(parent_coverage, children_coverage)): # transform into subset p_subset = subsets.Range((p_subset, )) c_subset = subsets.Range((c_subset, )) # get associated parameter in memlet params1 = symbolic.symlist( memlets[map_entry][1][data_name][i]).keys() params2 = symbolic.symlist( memlets[child_entry][0][data_name][i]).keys() if params1 != params2: return False params = params1 if len(params) > 1: # this is not supported return False try: symbol = next(iter(params)) param_parent_coverage[symbol] = subsets.union( param_parent_coverage[symbol], p_subset) param_children_coverage[symbol] = subsets.union( param_children_coverage[symbol], c_subset) except StopIteration: # current dim has no symbol associated. # ignore and continue warnings.warn( f"In map {map_entry}, there is a " "dimension belonging to {data_name} " "that has no map parameter associated.") pass # 1.6: parameter mapping must be the same if param_parent_coverage != param_children_coverage: return False # 1.7: we want all sink maps to have the same range size assert len(sink_maps) > 0 first_sink_map = next(iter(sink_maps)) if not all([ map.range.size() == first_sink_map.range.size() for map in sink_maps ]): return False return True def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] subgraph = self.subgraph_view(sdfg) map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph) result = StencilTiling.topology(sdfg, graph, map_entries) (children_dict, parent_dict, sink_maps) = result # next up, calculate inferred ranges for each map # for each map entry, this contains a tuple of dicts: # each of those maps from data_name of the array to # inferred outer ranges. 
An inferred outer range is created # by taking the union of ranges of inner subsets corresponding # to that data and substituting this subset by the min / max of the # parametrized map boundaries # finally, from these outer ranges we can easily calculate # strides and tile sizes required for every map inferred_ranges = defaultdict(dict) # create array of reverse topologically sorted map entries # to iterate over topo_reversed = [] queue = set(sink_maps.copy()) while len(queue) > 0: element = next(e for e in queue if not children_dict[e] - set(topo_reversed)) topo_reversed.append(element) queue.remove(element) for parent in parent_dict[element]: queue.add(parent) # main loop # first get coverage dicts for each map entry # for each map, contains a tuple of two dicts # each of those two maps from data name to outer range coverage = {} for map_entry in map_entries: coverage[map_entry] = StencilTiling.coverage_dicts( sdfg, graph, map_entry, outer_range=True) # we have a mapping from data name to outer range # however we want a mapping from map parameters to outer ranges # for this we need to find out how all array dimensions map to # outer ranges variable_mapping = defaultdict(list) for map_entry in topo_reversed: map = map_entry.map # first find out variable mapping for e in itertools.chain( graph.out_edges(map_entry), graph.in_edges(graph.exit_node(map_entry))): mapping = [] for dim in e.data.subset: syms = set() for d in dim: syms |= symbolic.symlist(d).keys() if len(syms) > 1: raise NotImplementedError( "One incoming or outgoing stencil subset is indexed " "by multiple map parameters. " "This is not supported yet.") try: mapping.append(syms.pop()) except KeyError: # just append None if there is no map symbol in it. # we don't care for now. mapping.append(None) if e.data.data in variable_mapping: # assert that this is the same everywhere.
# else we might run into problems assert variable_mapping[e.data.data] == mapping else: variable_mapping[e.data.data] = mapping # now do mapping data name -> outer range # and from that infer mapping variable -> outer range local_ranges = {dn: None for dn in coverage[map_entry][1].keys()} for data_name, cov in coverage[map_entry][1].items(): local_ranges[data_name] = subsets.union( local_ranges[data_name], cov) # now look at proceeding maps # and union those subsets -> could be larger with stencil indent for child_map in children_dict[map_entry]: if data_name in coverage[child_map][0]: local_ranges[data_name] = subsets.union( local_ranges[data_name], coverage[child_map][0][data_name]) # final assignent: combine local_ranges and variable_mapping # together into inferred_ranges inferred_ranges[map_entry] = {p: None for p in map.params} for data_name, ranges in local_ranges.items(): for param, r in zip(variable_mapping[data_name], ranges): # create new range from this subset and assign rng = subsets.Range((r, )) if param: inferred_ranges[map_entry][param] = subsets.union( inferred_ranges[map_entry][param], rng) # get parameters -- should all be the same params = next(iter(map_entries)).map.params.copy() # define reference range as inferred range of one of the sink maps self.reference_range = inferred_ranges[next(iter(sink_maps))] if self.debug: print("StencilTiling::Reference Range", self.reference_range) # next up, search for the ranges that don't change invariant_dims = [] for idx, p in enumerate(params): different = False if self.reference_range[p] is None: invariant_dims.append(idx) warnings.warn( f"StencilTiling::No Stencil pattern detected for parameter {p}" ) continue for m in map_entries: if inferred_ranges[m][p] != self.reference_range[p]: different = True break if not different: invariant_dims.append(idx) warnings.warn( f"StencilTiling::No Stencil pattern detected for parameter {p}" ) # during stripmining, we will create new outer map entries # for easy access self._outer_entries = set() # with inferred_ranges constructed, we can begin to strip mine for map_entry in map_entries: # Retrieve map entry and exit nodes. map = map_entry.map stripmine_subgraph = { StripMining._map_entry: graph.nodes().index(map_entry) } sdfg_id = sdfg.sdfg_id last_map_entry = None original_schedule = map_entry.schedule self.tile_sizes = [] self.tile_offset_lower = [] self.tile_offset_upper = [] # strip mining each dimension where necessary removed_maps = 0 for dim_idx, param in enumerate(map_entry.map.params): # get current_node tile size if dim_idx >= len(self.strides): tile_stride = symbolic.pystr_to_symbolic(self.strides[-1]) else: tile_stride = symbolic.pystr_to_symbolic( self.strides[dim_idx]) trivial = False if dim_idx in invariant_dims: self.tile_sizes.append(tile_stride) self.tile_offset_lower.append(0) self.tile_offset_upper.append(0) else: target_range_current = inferred_ranges[map_entry][param] reference_range_current = self.reference_range[param] min_diff = symbolic.SymExpr(reference_range_current.min_element()[0] \ - target_range_current.min_element()[0]) max_diff = symbolic.SymExpr(target_range_current.max_element()[0] \ - reference_range_current.max_element()[0]) try: min_diff = symbolic.evaluate(min_diff, {}) max_diff = symbolic.evaluate(max_diff, {}) except TypeError: raise RuntimeError("Symbolic evaluation of map " "ranges failed. 
Please check " "your parameters and match.") self.tile_sizes.append(tile_stride + max_diff + min_diff) self.tile_offset_lower.append( symbolic.pystr_to_symbolic(str(min_diff))) self.tile_offset_upper.append( symbolic.pystr_to_symbolic(str(max_diff))) # get calculated parameters tile_size = self.tile_sizes[-1] dim_idx -= removed_maps # If map or tile sizes are trivial, skip strip-mining map dimension # special cases: # if tile size is trivial AND we have an invariant dimension, skip if tile_size == map.range.size()[dim_idx] and ( dim_idx + removed_maps) in invariant_dims: continue # trivial map: we just continue if map.range.size()[dim_idx] in [0, 1]: continue if tile_size == 1 and tile_stride == 1 and ( dim_idx + removed_maps) in invariant_dims: trivial = True removed_maps += 1 # indent all map ranges accordingly and then perform # strip mining on these. Offset inner maps accordingly afterwards range_tuple = (map.range[dim_idx][0] + self.tile_offset_lower[-1], map.range[dim_idx][1] - self.tile_offset_upper[-1], map.range[dim_idx][2]) map.range[dim_idx] = range_tuple stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph, 0) stripmine.tiling_type = 'ceilrange' stripmine.dim_idx = dim_idx stripmine.new_dim_prefix = self.prefix if not trivial else '' # use tile_stride for both -- we will extend # the inner tiles later stripmine.tile_size = str(tile_stride) stripmine.tile_stride = str(tile_stride) outer_map = stripmine.apply(sdfg) outer_map.schedule = original_schedule # apply to the new map the schedule of the original one map_entry.schedule = self.schedule # if tile stride is 1, we can make a nice simplification by just # taking the overapproximated inner range as inner range # this eliminates the min/max in the range which # enables loop unrolling if tile_stride == 1: map_entry.range[dim_idx] = tuple( symbolic.SymExpr(el._approx_expr) if isinstance( el, symbolic.SymExpr) else el for el in map_entry.range[dim_idx]) # in map_entry: enlarge tiles by upper and lower offset # doing it this way and not via stripmine strides ensures # that the max gets changed as well old_range = map_entry.range[dim_idx] map_entry.range[dim_idx] = ((old_range[0] - self.tile_offset_lower[-1]), (old_range[1] + self.tile_offset_upper[-1]), old_range[2]) # We have to propagate here for correct outer volume and subset sizes _propagate_node(graph, map_entry) _propagate_node(graph, graph.exit_node(map_entry)) # usual tiling pipeline if last_map_entry: new_map_entry = graph.in_edges(map_entry)[0].src mapcollapse_subgraph = { MapCollapse._outer_map_entry: graph.node_id(last_map_entry), MapCollapse._inner_map_entry: graph.node_id(new_map_entry) } mapcollapse = MapCollapse(sdfg_id, self.state_id, mapcollapse_subgraph, 0) mapcollapse.apply(sdfg) last_map_entry = graph.in_edges(map_entry)[0].src # add last instance of map entries to _outer_entries if last_map_entry: self._outer_entries.add(last_map_entry) # Map Unroll Feature: only unroll if conditions are met: # Only unroll if at least one of the inner map ranges is strictly larger than 1 # Only unroll if strides all are one if self.unroll_loops and all(s == 1 for s in self.strides) and any( s not in [0, 1] for s in map_entry.range.size()): l = len(map_entry.params) if l > 1: subgraph = { MapExpansion.map_entry: graph.nodes().index(map_entry) } trafo_expansion = MapExpansion(sdfg.sdfg_id, sdfg.nodes().index(graph), subgraph, 0) trafo_expansion.apply(sdfg) maps = [map_entry] for _ in range(l - 1): map_entry = graph.out_edges(map_entry)[0].dst maps.append(map_entry) 
for map in reversed(maps): # MapToForLoop subgraph = { MapToForLoop._map_entry: graph.nodes().index(map) } trafo_for_loop = MapToForLoop(sdfg.sdfg_id, sdfg.nodes().index(graph), subgraph, 0) trafo_for_loop.apply(sdfg) nsdfg = trafo_for_loop.nsdfg # LoopUnroll guard = trafo_for_loop.guard end = trafo_for_loop.after_state begin = next(e.dst for e in nsdfg.out_edges(guard) if e.dst != end) subgraph = { DetectLoop._loop_guard: nsdfg.nodes().index(guard), DetectLoop._loop_begin: nsdfg.nodes().index(begin), DetectLoop._exit_state: nsdfg.nodes().index(end) } transformation = LoopUnroll(0, 0, subgraph, 0) transformation.apply(nsdfg) elif self.unroll_loops: warnings.warn( "Did not unroll loops. Either all ranges are equal to " "one or range difference is symbolic.") self._outer_entries = list(self._outer_entries)
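# Hedged usage sketch for StencilTiling (added example), following the calling
# pattern CompositeFusion uses above: build the transformation on a
# SubgraphView, configure strides and unrolling, check applicability, then
# apply. `sdfg` is a placeholder for an existing SDFG; the tile stride and the
# SubgraphView import path are assumptions.
from dace.sdfg.graph import SubgraphView

state = sdfg.nodes()[0]
subgraph = SubgraphView(state, state.nodes())  # assumed: the stencil maps to tile

st = StencilTiling(subgraph)
st.strides = (8, 8)      # tile stride per map dimension (assumed value)
st.unroll_loops = False
if StencilTiling.can_be_applied(sdfg, subgraph):
    st.apply(sdfg)
    tile_entries = st._outer_entries  # outer (tile) map entries created by apply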
class BufferTiling(transformation.Transformation): """ Implements the buffer tiling transformation. BufferTiling tiles a buffer that is in between two maps, where the preceding map writes to the buffer and the succeeding map reads from it. It introduces additional computations in exchange for reduced memory footprint. Commonly used to make use of shared memory on GPUs. """ _map1_exit = nodes.MapExit(nodes.Map('', [], [])) _array = nodes.AccessNode('') _map2_entry = nodes.MapEntry(nodes.Map('', [], [])) tile_sizes = ShapeProperty(dtype=tuple, default=(128, 128, 128), desc="Tile size per dimension") # Returns a list of graphs that represent the pattern @staticmethod def expressions(): return [ sdutil.node_path_graph( BufferTiling._map1_exit, BufferTiling._array, BufferTiling._map2_entry, ) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]] map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]] for buf in graph.all_nodes_between(map1_exit, map2_entry): # Check that buffers are AccessNodes. if not isinstance(buf, nodes.AccessNode): return False # Check that buffers are transient. if not sdfg.arrays[buf.data].transient: return False # Check that buffers have exactly 1 input and 1 output edge. if graph.in_degree(buf) != 1: return False if graph.out_degree(buf) != 1: return False # Check that buffers are next to the maps. if graph.in_edges(buf)[0].src != map1_exit: return False if graph.out_edges(buf)[0].dst != map2_entry: return False # Check that the data consumed is provided. provided = graph.in_edges(buf)[0].data.subset consumed = graph.out_edges(buf)[0].data.subset if not provided.covers(consumed): return False # Check that buffers occur only once in this state. num_occurrences = len([ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.data == buf ]) if num_occurrences > 1: return False return True @staticmethod def match_to_str(graph, candidate): map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]] map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]] return " -> ".join(entry.map.label + ": " + str(entry.map.params) for entry in [map1_exit, map2_entry]) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] map1_exit = graph.nodes()[self.subgraph[self._map1_exit]] map1_entry = graph.entry_node(map1_exit) map2_entry = graph.nodes()[self.subgraph[self._map2_entry]] buffers = graph.all_nodes_between(map1_exit, map2_entry) # Situation: # -> map1_entry -> ... -> map1_exit -> buffers -> map2_entry -> ... 
lower_extents = tuple(b - a for a, b in zip( map1_entry.range.min_element(), map2_entry.range.min_element())) upper_extents = tuple(a - b for a, b in zip( map1_entry.range.max_element(), map2_entry.range.max_element())) # Tile the first map with overlap MapTilingWithOverlap.apply_to(sdfg, map_entry=map1_entry, options={ 'tile_sizes': self.tile_sizes, 'lower_overlap': lower_extents, 'upper_overlap': upper_extents }) tile_map1_exit = graph.out_edges(map1_exit)[0].dst tile_map1_entry = graph.entry_node(tile_map1_exit) tile_map1_entry.label = 'BufferTiling' # Tile the second map MapTiling.apply_to(sdfg, map_entry=map2_entry, options={ 'tile_sizes': self.tile_sizes, 'tile_trivial': True }) tile_map2_entry = graph.in_edges(map2_entry)[0].src # Fuse maps some_buffer = next( iter(buffers)) # some dummy to pass to MapFusion.apply_to() MapFusion.apply_to(sdfg, first_map_exit=tile_map1_exit, array=some_buffer, second_map_entry=tile_map2_entry) # Optimize the simple cases map1_entry.range.ranges = [ (r[0], r[0], r[2]) if l_ext == 0 and u_ext == 0 and ts == 1 else r for r, l_ext, u_ext, ts in zip(map1_entry.range.ranges, lower_extents, upper_extents, self.tile_sizes) ] map2_entry.range.ranges = [ (r[0], r[0], r[2]) if ts == 1 else r for r, ts in zip(map2_entry.range.ranges, self.tile_sizes) ] if any(ts == 1 for ts in self.tile_sizes): if any(r[0] == r[1] for r in map1_entry.map.range): TrivialMapElimination.apply_to(sdfg, _map_entry=map1_entry) if any(r[0] == r[1] for r in map2_entry.map.range): TrivialMapElimination.apply_to(sdfg, _map_entry=map2_entry)
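# Hedged usage sketch for BufferTiling (added example) via `apply_to`, the same
# entry point this code already uses for MapTilingWithOverlap, MapFusion, and
# TrivialMapElimination. `sdfg`, `producer_exit`, `buffer_node`, and
# `consumer_entry` are placeholders (assumptions) for an existing state in
# which a transient buffer sits between a producing and a consuming map.
BufferTiling.apply_to(sdfg,
                      _map1_exit=producer_exit,
                      _array=buffer_node,
                      _map2_entry=consumer_entry,
                      options={'tile_sizes': (32, 32, 32)})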
class ProcessGrid(object): """ Process-grids implement Cartesian topologies similar to the Cartesian communicators created with [MPI_Cart_create](https://www.mpich.org/static/docs/latest/www3/MPI_Cart_create.html) and [MPI_Cart_sub](https://www.mpich.org/static/docs/v3.2/www3/MPI_Cart_sub.html). The boolean property `is_subgrid` provides a switch between "parent" process-grids (equivalent to communicators created with `MPI_Cart_create`) and sub-grids (equivalent to communicators created with `MPI_Cart_sub`). If `is_subgrid` is false, a "parent" process-grid is created. The `shape` property is equivalent to the `dims` parameter of `MPI_Cart_create`. The other properties are ignored. All "parent" process-grids spawn out of `MPI_COMM_WORLD`, while their `periods` and `reorder` parameters are set to False. If `is_subgrid` is true, then the `parent_grid` is partitioned into lower-dimensional Cartesian sub-grids (for more details, see the documentation of `MPI_Cart_sub`). The `parent_grid` property is equivalent to the `comm` parameter of `MPI_Cart_sub`. The `color` property corresponds to the `remain_dims` parameter of `MPI_Cart_sub`, i.e., the i-th entry specifies whether the i-th dimension is kept in the sub-grid or is dropped. The following properties store information used in the redistribution of data: The `exact_grid` property is either None or the rank of an MPI process in the `parent_grid`. If set, then out of all the sub-grids created, only the one that contains this rank is used for collective communication. The `root` property is used to select the root rank for purposes of collective communication (by default 0). """ name = Property(dtype=str, desc="The process-grid's name.") is_subgrid = Property( dtype=bool, default=False, desc="If true, spawns sub-grids out of the parent process-grid.") shape = ShapeProperty(default=[], desc="The process-grid's shape.") parent_grid = Property( dtype=str, allow_none=True, default=None, desc="Name of the parent process-grid " "(mandatory if `is_subgrid` is true, otherwise ignored).") color = ListProperty( int, allow_none=True, default=None, desc= "The i-th entry specifies whether the i-th dimension is kept in the sub-grid or is " "dropped (mandatory if `is_subgrid` is true, otherwise ignored).") exact_grid = SymbolicProperty( allow_none=True, default=None, desc= "If set, then out of all the sub-grids created, only the one that contains the " "rank with id `exact_grid` will be utilized for collective communication " "(optional if `is_subgrid` is true, otherwise ignored).") root = SymbolicProperty(default=0, desc="The root rank for collective communication.") def __init__(self, name: str, is_subgrid: bool, shape: ShapeType = None, parent_grid: 'ProcessGrid' = None, color: Sequence[Union[Integral, bool]] = None, exact_grid: RankType = None, root: RankType = 0): self.name = name self.is_subgrid = is_subgrid if is_subgrid: self.parent_grid = parent_grid.name self.color = color self.exact_grid = exact_grid self.shape = [ parent_grid.shape[i] for i, remain in enumerate(color) if remain ] else: self.shape = shape self.root = root self._validate() def validate(self): """ Validate the correctness of this object. Raises an exception on error. """ self._validate() # Validation of this class is in a separate function, so that this # class can call `_validate()` without calling the subclasses' # `validate` function.
def _validate(self): if self.is_subgrid: if not self.parent_grid or len(self.parent_grid) == 0: raise ValueError( 'Sub-grid misses its corresponding parent process-grid') if any(not isinstance(s, (Integral, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape): raise TypeError( 'Shape must be a list or tuple of integer values or symbols') if self.color and any(c < 0 or c > 1 for c in self.color): raise ValueError( 'Color must have only logical true (1) or false (0) values.') return True def to_json(self): attrs = serialize.all_properties_to_json(self) retdict = {"type": type(self).__name__, "attributes": attrs} return retdict @classmethod def from_json(cls, json_obj, context=None): # Create dummy object ret = cls('tmp', False, []) serialize.set_properties_from_json(ret, json_obj, context=context) # Check validity now ret.validate() return ret def init_code(self): """ Outputs MPI allocation/initialization code for the process-grid. It is assumed that the following variables exist in the SDFG program's state: - MPI_Comm {self.name}_comm - MPI_Group {self.name}_group - int {self.name}_rank - int {self.name}_size - int* {self.name}_dims - int* {self.name}_remain - int* {self.name}_coords - bool {self.name})_valid These variables are typically added to the program's state through a Tasklet, e.g., the Dummy MPI node (for more details, check the DaCe MPI library in `dace/libraries/mpi`). """ if self.is_subgrid: tmp = "" for i, s in enumerate(self.shape): tmp += f"__state->{self.name}_dims[{i}] = {s};\n" tmp += f""" __state->{self.name}_valid = false; if (__state->{self.parent_grid}_valid) {{ int {self.name}_remain[{len(self.color)}] = {{{', '.join(['1' if c else '0' for c in self.color])}}}; MPI_Cart_sub(__state->{self.parent_grid}_comm, {self.name}_remain, &__state->{self.name}_comm); MPI_Comm_group(__state->{self.name}_comm, &__state->{self.name}_group); MPI_Comm_rank(__state->{self.name}_comm, &__state->{self.name}_rank); MPI_Comm_size(__state->{self.name}_comm, &__state->{self.name}_size); MPI_Cart_coords(__state->{self.name}_comm, __state->{self.name}_rank, {len(self.shape)}, __state->{self.name}_coords); """ if self.exact_grid is not None: tmp += f""" int ranks1[1] = {{{self.exact_grid}}}; int ranks2[1]; MPI_Group_translate_ranks(__state->{self.parent_grid}_group, 1, ranks1, __state->{self.name}_group, ranks2); __state->{self.name}_valid = (ranks2[0] != MPI_PROC_NULL && ranks2[0] != MPI_UNDEFINED); }} """ else: tmp += f""" __state->{self.name}_valid = true; }} """ return tmp else: tmp = "" for i, s in enumerate(self.shape): tmp += f"__state->{self.name}_dims[{i}] = {s};\n" tmp += f""" int {self.name}_periods[{len(self.shape)}] = {{0}}; MPI_Cart_create(MPI_COMM_WORLD, {len(self.shape)}, __state->{self.name}_dims, {self.name}_periods, 0, &__state->{self.name}_comm); if (__state->{self.name}_comm != MPI_COMM_NULL) {{ MPI_Comm_group(__state->{self.name}_comm, &__state->{self.name}_group); MPI_Comm_rank(__state->{self.name}_comm, &__state->{self.name}_rank); MPI_Comm_size(__state->{self.name}_comm, &__state->{self.name}_size); MPI_Cart_coords(__state->{self.name}_comm, __state->{self.name}_rank, {len(self.shape)}, __state->{self.name}_coords); __state->{self.name}_valid = true; }} else {{ __state->{self.name}_group = MPI_GROUP_NULL; __state->{self.name}_rank = MPI_PROC_NULL; __state->{self.name}_size = 0; __state->{self.name}_valid = false; }} """ return tmp def exit_code(self): """ Outputs MPI deallocation code for the process-grid. """ return f"""