def test_intersects_constant():
    """Check ``subsets.intersects`` on subsets with constant bounds."""
    rng_0_4 = subsets.Range([(0, 4, 1)])
    rng_3_4 = subsets.Range([(3, 4, 1)])
    rng_1_5 = subsets.Range([(1, 5, 1)])
    rng_5_7 = subsets.Range([(5, 7, 1)])
    idx_0 = subsets.Indices([0])
    idx_1 = subsets.Indices([1])
    idx_5 = subsets.Indices([5])

    # (left operand, right operand, expected result), checked in this order.
    cases = [
        (rng_0_4, rng_3_4, True),
        (rng_0_4, rng_1_5, True),
        (rng_0_4, rng_5_7, False),
        (idx_0, rng_0_4, True),
        (rng_0_4, idx_1, True),
        (rng_0_4, idx_5, False),
    ]
    for lhs, rhs, expected in cases:
        assert subsets.intersects(lhs, rhs) is expected
def cpp_ptr_expr(sdfg,
                 memlet,
                 defined_type,
                 offset=None,
                 relative_offset=True,
                 use_other_subset=False,
                 indices=None,
                 is_write=None):
    """
    Build a C++ pointer expression for the data referenced by a memlet.

    :param sdfg: Object whose ``arrays`` mapping holds the descriptor of
                 ``memlet.data``.
    :param memlet: The memlet to convert.
    :param defined_type: A ``DefinedType`` value; ``ArrayInterface`` and
                         ``Scalar`` change how the pointer name is formed.
    :param offset: Extra offset applied on top of the memlet subset, or the
                   absolute position itself if ``relative_offset`` is False.
    :param relative_offset: If False, ``offset`` is wrapped in
                            ``subsets.Indices`` and used instead of the
                            memlet subset.
    :param use_other_subset: Use ``memlet.other_subset`` rather than
                             ``memlet.subset``.
    :param indices: Either a ready-made offset string, or indices forwarded
                    to ``cpp_offset_expr``.
    :param is_write: Required when ``defined_type`` is ``ArrayInterface``.
    :return: The pointer name, followed by ``" + <offset>"`` unless the
             offset string is ``'0'``.
    :raises ValueError: If ``defined_type`` is ``ArrayInterface`` and
                        ``is_write`` is None.
    """
    chosen = memlet.other_subset if use_other_subset else memlet.subset
    if relative_offset:
        base_subset, extra_offset = chosen, offset
    else:
        base_subset, extra_offset = subsets.Indices(offset), None

    descriptor = sdfg.arrays[memlet.data]
    if isinstance(indices, str):
        # A string is taken verbatim as the offset expression.
        offset_expr = indices
    else:
        offset_expr = cpp_offset_expr(descriptor,
                                      base_subset,
                                      extra_offset,
                                      indices=indices)

    pointer = ptr(memlet.data, descriptor)
    if defined_type == DefinedType.ArrayInterface:
        if is_write is None:
            raise ValueError("is_write must be set for ArrayInterface.")
        pointer = array_interface_variable(pointer, is_write, None)
    if defined_type == DefinedType.Scalar:
        pointer = '&' + pointer

    if offset_expr != '0':
        return '%s + %s' % (pointer, offset_expr)
    return pointer
def cpp_ptr_expr(sdfg,
                 memlet,
                 offset=None,
                 relative_offset=True,
                 use_other_subset=False,
                 indices=None):
    """
    Build a C++ pointer expression for the data referenced by a memlet.

    :param sdfg: Object whose ``arrays`` mapping holds the descriptor of
                 ``memlet.data``.
    :param memlet: The memlet to convert.
    :param offset: Extra offset applied on top of the memlet subset, or the
                   absolute position itself if ``relative_offset`` is False.
    :param relative_offset: If False, ``offset`` is wrapped in
                            ``subsets.Indices`` and used instead of the
                            memlet subset.
    :param use_other_subset: Use ``memlet.other_subset`` rather than
                             ``memlet.subset``.
    :param indices: Either a ready-made offset string, or indices forwarded
                    to ``cpp_offset_expr``.
    :return: The data name (prefixed with ``&`` for ``data.Scalar``
             descriptors), followed by ``" + <offset>"`` unless the offset
             string is ``'0'``.
    """
    chosen = memlet.other_subset if use_other_subset else memlet.subset
    if relative_offset:
        base_subset, extra_offset = chosen, offset
    else:
        base_subset, extra_offset = subsets.Indices(offset), None

    if isinstance(indices, str):
        # A string is taken verbatim as the offset expression.
        offset_expr = indices
    else:
        offset_expr = cpp_offset_expr(sdfg.arrays[memlet.data],
                                      base_subset,
                                      extra_offset,
                                      indices=indices)

    pointer = memlet.data
    if isinstance(sdfg.arrays[pointer], data.Scalar):
        # Scalars are addressed through their address.
        pointer = '&' + pointer

    if offset_expr != '0':
        return '%s + %s' % (pointer, offset_expr)
    return pointer
def copy_expr(
    dispatcher,
    sdfg,
    dataname,
    memlet,
    offset=None,
    relative_offset=True,
    packed_types=False,
):
    """
    Build the C++ expression used to address ``dataname`` in a copy.

    :param dispatcher: Object whose ``defined_vars.get(dataname)`` yields a
                       pair; its first element is the ``DefinedType``.
    :param sdfg: Object whose ``arrays`` mapping holds the data descriptor.
    :param dataname: Name of the data container.
    :param memlet: Memlet whose subset is used when ``relative_offset`` is
                   True.
    :param offset: Extra offset (relative mode) or the absolute position
                   (non-relative mode); may be a ``subsets.Subset``.
    :param relative_offset: Selects between the two modes above.
    :param packed_types: Not read by this function.
    :return: A C++ expression string that depends on the defined type.
    :raises TypeError: If a non-zero offset is requested for a scalar or
                       stream.
    :raises NotImplementedError: For any other defined type.
    """
    descriptor = sdfg.arrays[dataname]

    # Pick the subset to start from and the extra offset applied on top.
    if relative_offset:
        start, shift = memlet.subset, offset
    elif offset is None:
        start, shift = None, None
    elif not isinstance(offset, subsets.Subset):
        start, shift = subsets.Indices(offset), None
    else:
        start, shift = offset, None

    if start is None:
        offset_cppstr = "0"
    else:
        offset_cppstr = cpp_offset_expr(descriptor, start, shift)

    cast = ""
    def_type, _ = dispatcher.defined_vars.get(dataname)
    has_offset = offset_cppstr != "0"

    if def_type in [DefinedType.Pointer, DefinedType.ArrayInterface]:
        suffix = " + {}".format(offset_cppstr) if has_offset else ""
        return "{}{}{}".format(cast, dataname, suffix)
    if def_type == DefinedType.StreamArray:
        return "{}[{}]".format(dataname, offset_cppstr)
    if def_type == DefinedType.FPGA_ShiftRegister:
        return dataname
    if def_type in [DefinedType.Scalar, DefinedType.Stream]:
        if has_offset:
            raise TypeError("Tried to offset address of scalar {}: {}".format(
                dataname, offset_cppstr))
        if def_type == DefinedType.Scalar:
            return "{}&{}".format(cast, dataname)
        return dataname

    raise NotImplementedError("copy_expr not implemented "
                              "for connector type: {}".format(def_type))
def _ndslice_to_subset(ndslice):
    """
    Convert a sequence of per-dimension slice entries into a subset object.

    Entries that are tuples are treated as ranges; anything else is treated
    as a single index.

    :param ndslice: Sequence with one entry per dimension. It is not
                    modified by this function.
    :return: ``subsets.Indices(ndslice)`` if no entry is a tuple, otherwise
             a ``subsets.Range`` in which every non-tuple entry ``x`` has
             been promoted to the range ``(x, x, 1)``.
    """
    is_tuple = [isinstance(dim, tuple) for dim in ndslice]
    if not any(is_tuple):
        return subsets.Indices(ndslice)
    if all(is_tuple):
        return subsets.Range(ndslice)

    # If a mix of ranges and indices is found, convert to range. A new list
    # is built so the caller's sequence is left untouched (and so immutable
    # sequences such as tuples are accepted as well).
    ranges = [
        dim if dim_is_tuple else (dim, dim, 1)
        for dim, dim_is_tuple in zip(ndslice, is_tuple)
    ]
    return subsets.Range(ranges)
def cpp_offset_expr(d: data.Data,
                    subset_in: subsets.Subset,
                    offset=None,
                    packed_veclen=1,
                    indices=None):
    """
    Produce a C++ expression that, added to a pointer, moves it to the
    start of the given subset (plus an optional extra offset).

    :param d: The data structure to use for sizes/strides.
    :param subset_in: The subset to offset by. A deep copy is modified; the
                      argument itself is left alone.
    :param offset: An additional list of offsets or a Subset object.
    :param packed_veclen: If packed types are targeted, specifies the
                          vector length that the final offset should be
                          divided by.
    :param indices: A tuple of indices to use for expression. When omitted
                    (or empty), one zero per stride of ``d`` is used.
    :return: A string in C++ syntax with the correct offset.
    """
    shifted = copy.deepcopy(subset_in)

    # First apply the caller-provided offset, if there is one.
    if offset is not None:
        if isinstance(offset, subsets.Subset):
            extra = offset
        else:
            extra = subsets.Indices(offset)
        shifted.offset(extra, False)

    # Then apply the offset carried by the data descriptor.
    shifted.offset(subsets.Indices(d.offset), False)

    # Evaluate the shifted subset at the requested position.
    if not indices:
        indices = [0] * len(d.strides)
    flat_index = shifted.at(indices, d.strides)
    if packed_veclen > 1:
        flat_index /= packed_veclen

    return sym2cpp(flat_index)
def pop_dims(subset, dims):
    """
    Remove the given dimensions from a subset and return what was removed.

    The input subset is not modified; its contents are deep-copied first.
    Dimensions are popped one after another in the order given by ``dims``,
    so each position refers to the sequence as it is after the previous
    pops.

    :param subset: A ``subsets.Indices`` object, or an object exposing
                   ``ranges`` and ``tile_sizes`` (handled as a range).
    :param dims: Iterable of positions to pop.
    :return: A tuple ``(new_subset, popped)``. For indices, ``popped``
             holds the removed index values; for ranges it holds
             ``(range, tile_size)`` pairs.
    """
    popped = []
    if isinstance(subset, subsets.Indices):
        # Copy the instance's index list (not the Indices class itself).
        indices = copy.deepcopy(subset.indices)
        for i in dims:
            popped.append(indices.pop(i))
        # Return the same (subset, popped) shape as the range branch.
        return subsets.Indices(indices), popped

    ranges = copy.deepcopy(subset.ranges)
    tsizes = copy.deepcopy(subset.tile_sizes)
    for i in dims:
        r = ranges.pop(i)
        t = tsizes.pop(i)
        popped.append((r, t))
    new_subset = subsets.Range(ranges)
    new_subset.tile_sizes = tsizes
    return new_subset, popped
def cpp_array_expr(sdfg,
                   memlet,
                   with_brackets=True,
                   offset=None,
                   relative_offset=True,
                   packed_veclen=1,
                   use_other_subset=False,
                   indices=None):
    """
    Turn the subset of a memlet into a C++ array access string.

    :param sdfg: Object whose ``arrays`` mapping holds the descriptor of
                 ``memlet.data``.
    :param memlet: The memlet to convert.
    :param with_brackets: If True, return ``name[offset]``; otherwise
                          return only the offset expression.
    :param offset: Extra offset applied on top of the memlet subset, or the
                   absolute position itself if ``relative_offset`` is False.
    :param relative_offset: If False, ``offset`` is wrapped in
                            ``subsets.Indices`` and used instead of the
                            memlet subset.
    :param packed_veclen: Forwarded to ``cpp_offset_expr``.
    :param use_other_subset: Use ``memlet.other_subset`` rather than
                             ``memlet.subset``.
    :param indices: Forwarded to ``cpp_offset_expr``.
    :return: The access string, or the bare offset expression.
    """
    chosen = memlet.other_subset if use_other_subset else memlet.subset
    if relative_offset:
        base_subset, extra_offset = chosen, offset
    else:
        base_subset, extra_offset = subsets.Indices(offset), None

    descriptor = sdfg.arrays[memlet.data]
    offset_expr = cpp_offset_expr(descriptor,
                                  base_subset,
                                  extra_offset,
                                  packed_veclen,
                                  indices=indices)

    if not with_brackets:
        return offset_expr
    return "%s[%s]" % (ptr(memlet.data, descriptor), offset_expr)
def compose_and_push_back(first, second, dims=None, popped=None):
    """
    Combine two subsets and optionally re-insert dimensions that had been
    removed beforehand.

    :param first: The outer subset. If it is a ``subsets.Indices`` it is
                  combined via ``first.new_offset(second, negative=False)``;
                  otherwise via ``first.compose(second)``.
    :param second: The subset to combine with ``first``.
    :param dims: Positions at which to re-insert the removed dimensions.
    :param popped: The removed entries, one per position in ``dims``: plain
                   index values when ``first`` is an ``Indices`` object,
                   ``(range, tile_size)`` pairs otherwise.
    :return: The combined subset. Dimensions are only re-inserted when both
             ``dims`` and ``popped`` are non-empty and of equal length.
    """
    if isinstance(first, subsets.Indices):
        subset = first.new_offset(second, negative=False)
    else:
        subset = first.compose(second)

    if dims and popped and len(dims) == len(popped):
        # Insert in reverse so each position is valid at insertion time.
        if isinstance(first, subsets.Indices):
            # The index list lives in the lowercase ``indices`` attribute.
            indices = subset.indices
            for d, p in zip(reversed(dims), reversed(popped)):
                indices.insert(d, p)
            subset = subsets.Indices(indices)
        else:
            ranges = subset.ranges
            tsizes = subset.tile_sizes
            for d, (r, t) in zip(reversed(dims), reversed(popped)):
                ranges.insert(d, r)
                tsizes.insert(d, t)
            subset = subsets.Range(ranges)
            subset.tile_sizes = tsizes

    return subset
def test_intersects_symbolic():
    """Check ``subsets.intersects`` on subsets with symbolic bounds."""
    N, M = dace.symbol('N', positive=True), dace.symbol('M', positive=True)

    rng1 = subsets.Range([(0, N - 1, 1), (0, M - 1, 1)])
    rng2 = subsets.Range([(0, 0, 1), (0, 0, 1)])
    rng3_1 = subsets.Range([(N, N, 1), (0, 1, 1)])
    rng3_2 = subsets.Range([(0, 1, 1), (M, M, 1)])
    rng4 = subsets.Range([(N, N, 1), (M, M, 1)])
    rng5 = subsets.Range([(0, 0, 1), (M, M, 1)])
    rng6 = subsets.Range([(0, N, 1), (0, M, 1)])
    rng7 = subsets.Range([(0, N - 1, 1), (N - 1, N, 1)])
    ind1 = subsets.Indices([0, 1])

    # (left operand, right operand, expected result), checked in this order.
    # The expected value None is asserted by identity, like True and False.
    cases = [
        (rng1, rng2, True),
        (rng1, rng3_1, False),
        (rng1, rng3_2, False),
        (rng1, rng4, False),
        (rng1, rng5, False),
        (rng6, rng1, True),
        (rng1, rng7, None),
        (rng7, rng1, None),
        (rng1, ind1, None),
        (ind1, rng1, None),
    ]
    for lhs, rhs, expected in cases:
        assert subsets.intersects(lhs, rhs) is expected
def copy_expr(
    dispatcher,
    sdfg,
    dataname,
    memlet,
    offset=None,
    relative_offset=True,
    packed_types=False,
):
    """
    Build the C++ expression used to address ``dataname`` in a copy.

    :param dispatcher: Object whose ``defined_vars.get(dataname)`` yields
                       the ``DefinedType`` of the container.
    :param sdfg: Object whose ``arrays`` mapping holds the data descriptor.
    :param dataname: Name of the data container.
    :param memlet: Memlet providing the subset (relative mode) and
                   ``veclen``.
    :param offset: Extra offset (relative mode) or the absolute position
                   (non-relative mode); may be a ``subsets.Subset``.
    :param relative_offset: Selects between the two modes above.
    :param packed_types: If True, ``memlet.veclen`` is passed to
                         ``cpp_offset_expr``; if False and the vector length
                         is not 1, the offset is divided in the emitted
                         expression and a ``dace::vec`` pointer cast is
                         prepended.
    :return: A C++ expression string that depends on the defined type.
    :raises TypeError: If a non-zero offset is requested for a scalar,
                       stream or stream view.
    :raises NotImplementedError: For any other defined type.
    """
    descriptor = sdfg.arrays[dataname]

    # Pick the subset to start from and the extra offset applied on top.
    if relative_offset:
        start, shift = memlet.subset, offset
    elif offset is None:
        start, shift = None, None
    elif not isinstance(offset, subsets.Subset):
        start, shift = subsets.Indices(offset), None
    else:
        start, shift = offset, None

    if start is None:
        offset_cppstr = "0"
    else:
        offset_cppstr = cpp_offset_expr(descriptor, start, shift,
                                        memlet.veclen if packed_types else 1)

    cast = ""
    if memlet.veclen != 1 and not packed_types:
        offset_cppstr = "(%s) / %s" % (offset_cppstr, sym2cpp(memlet.veclen))
        cast = "(dace::vec<%s, %s> *)" % (
            descriptor.dtype.ctype,
            sym2cpp(memlet.veclen),
        )

    def_type = dispatcher.defined_vars.get(dataname)
    has_offset = offset_cppstr != "0"

    if def_type == DefinedType.Pointer:
        suffix = " + {}".format(offset_cppstr) if has_offset else ""
        return "{}{}{}".format(cast, dataname, suffix)
    if def_type == DefinedType.ArrayView:
        suffix = " + {}".format(offset_cppstr) if has_offset else ""
        return "{}{}.ptr(){}".format(cast, dataname, suffix)
    if def_type == DefinedType.StreamArray:
        return "{}[{}]".format(dataname, offset_cppstr)
    if def_type in [
            DefinedType.Scalar, DefinedType.Stream, DefinedType.StreamView
    ]:
        if has_offset:
            raise TypeError("Tried to offset address of scalar {}: {}".format(
                dataname, offset_cppstr))
        if def_type == DefinedType.Scalar:
            return "{}&{}".format(cast, dataname)
        return dataname

    raise NotImplementedError("copy_expr not implemented "
                              "for connector type: {}".format(def_type))
def add_indirection_subgraph(sdfg, graph, src, dst, memlet):
    """ Replaces the specified edge in the specified graph with a subgraph that
        implements indirection without nested AST memlet objects.

        The function adds an "Indirection" tasklet, a transient container
        named ``'__' + memlet.local_name + '_value'`` and the edges
        ``src -> tasklet -> transient -> dst``.

        NOTE(review): no edge is removed inside this function; presumably
        the caller drops the original ``src -> dst`` edge. Confirm.

        :param sdfg: Object used to register the transient scalar/array.
        :param graph: Graph that receives the new edges.
        :param src: Source node of the indirected access.
        :param dst: Destination node; it is connected through the connector
                    ``memlet.local_name``.
        :param memlet: The ``astnodes._Memlet`` whose subset contains
                       user-function (array access) expressions.
        :raises TypeError: If ``memlet`` is not an ``astnodes._Memlet``.
    """
    if not isinstance(memlet, astnodes._Memlet):
        raise TypeError("Expected memlet to be astnodes._Memlet")

    # NOTE(review): these two sets are not used anywhere below.
    indirect_inputs = set()
    indirect_outputs = set()

    # Scheme for multi-array indirection:
    # 1. look for all arrays and accesses, create set of arrays+indices
    #    from which the index memlets will be constructed from
    # 2. each separate array creates a memlet, of which num_accesses = len(set)
    # 3. one indirection tasklet receives them all + original array and
    #    produces the right output index/range memlet
    #########################
    # Step 1
    # Maps a function (array) name to the list of distinct argument tuples
    # it is called with; the position in that list numbers the index input.
    accesses = OrderedDict()
    newsubset = dcpy(memlet.subset)
    for dimidx, dim in enumerate(memlet.subset):
        # Range/Index disambiguation
        direct_assignment = False
        if not isinstance(dim, tuple):
            dim = [dim]
            direct_assignment = True

        for i, r in enumerate(dim):
            for expr in sympy.preorder_traversal(r):
                if symbolic.is_sympy_userfunction(expr):
                    fname = expr.func.__name__
                    if fname not in accesses:
                        accesses[fname] = []

                    # Replace function with symbol (memlet local name to-be)
                    if expr.args in accesses[fname]:
                        aindex = accesses[fname].index(expr.args)
                        toreplace = 'index_' + fname + '_' + str(aindex)
                    else:
                        accesses[fname].append(expr.args)
                        toreplace = 'index_' + fname + '_' + str(
                            len(accesses[fname]) - 1)

                    # The substitution always starts from the original `r`,
                    # so each assignment overwrites the previous one for
                    # this entry of `newsubset`.
                    if direct_assignment:
                        newsubset[dimidx] = r.subs(expr, toreplace)
                    else:
                        newsubset[dimidx][i] = r.subs(expr, toreplace)
    #########################
    # Step 2
    ind_inputs = {'__ind_' + memlet.local_name}
    ind_outputs = {'lookup'}
    # Add accesses to inputs
    for arrname, arr_accesses in accesses.items():
        for i in range(len(arr_accesses)):
            ind_inputs.add('index_%s_%d' % (arrname, i))

    tasklet = nd.Tasklet("Indirection", ind_inputs, ind_outputs)

    # NOTE(review): this list is filled below but not read afterwards.
    input_index_memlets = []
    for arrname, arr_accesses in accesses.items():
        # NOTE(review): `arr` is not used afterwards; the lookup still
        # raises if `arrname` is missing from `memlet.otherdeps`.
        arr = memlet.otherdeps[arrname]
        for i, access in enumerate(arr_accesses):
            # Memlet to load the indirection index
            indexMemlet = Memlet(arrname, 1, sbs.Indices(list(access)), 1)
            input_index_memlets.append(indexMemlet)
            graph.add_edge(src, None, tasklet, "index_%s_%d" % (arrname, i),
                           indexMemlet)

    #########################
    # Step 3
    # Create new tasklet that will perform the indirection
    indirection_ast = ast.parse("lookup = {arr}[{index}]".format(
        arr='__ind_' + memlet.local_name,
        index=', '.join([symbolic.symstr(s) for s in newsubset])))
    # Conserve line number of original indirection code
    tasklet.code = ast.copy_location(indirection_ast.body[0], memlet.ast)

    # Create transient variable to trigger the indirected load
    if memlet.num_accesses == 1:
        storage = sdfg.add_scalar('__' + memlet.local_name + '_value',
                                  memlet.data.dtype,
                                  transient=True)
    else:
        storage = sdfg.add_array('__' + memlet.local_name + '_value',
                                 memlet.data.dtype,
                                 storage=types.StorageType.Default,
                                 transient=True,
                                 shape=memlet.bounding_box_size())
    # Range covering the whole transient, built from its shape.
    indirectRange = sbs.Range([(0, s - 1, 1) for s in storage.shape])
    dataNode = nd.AccessNode('__' + memlet.local_name + '_value')

    # Create memlet that depends on the full array that we look up in
    fullRange = sbs.Range([(0, s - 1, 1) for s in memlet.data.shape])
    fullMemlet = Memlet(memlet.dataname, memlet.num_accesses, fullRange,
                        memlet.veclen)
    graph.add_edge(src, None, tasklet, '__ind_' + memlet.local_name,
                   fullMemlet)

    # Memlet to store the final value into the transient, and to load it into
    # the tasklet that needs it
    indirectMemlet = Memlet('__' + memlet.local_name + '_value',
                            memlet.num_accesses, indirectRange, memlet.veclen)
    graph.add_edge(tasklet, 'lookup', dataNode, None, indirectMemlet)

    valueMemlet = Memlet('__' + memlet.local_name + '_value',
                         memlet.num_accesses, indirectRange, memlet.veclen)
    graph.add_edge(dataNode, None, dst, memlet.local_name, valueMemlet)
def expansion(node, parent_state, parent_sdfg):
    """
    Expand a redistribution node into a C++ tasklet.

    The generated code posts one ``MPI_Isend`` per entry of the
    ``<redistr>_sends`` state field, performs the local copies with
    ``dace::CopyNDDynamic``, receives with ``MPI_Recv`` and finally waits
    for the sends with ``MPI_Waitall``.

    :param node: The node being expanded. It provides ``validate``,
                 ``redistr``/``_redistr``, ``name`` and the connectors.
    :param parent_state: State passed on to ``node.validate``.
    :param parent_sdfg: Object passed on to ``node.validate``; its
                        ``rdistrarrays`` and ``subarrays`` mappings are
                        looked up as well.
    :return: A ``nodes.Tasklet`` in C++ carrying the generated code.
    """
    inp_buffer, out_buffer = node.validate(parent_sdfg, parent_state)
    redistr = parent_sdfg.rdistrarrays[node.redistr]
    array_a = parent_sdfg.subarrays[redistr.array_a]
    array_b = parent_sdfg.subarrays[redistr.array_b]

    # One symbol per buffer dimension; the generated code declares matching
    # C++ ints before the self-copy (see inp_repl/out_repl below).
    inp_symbols = [
        symbolic.symbol(f"__inp_s{i}") for i in range(len(inp_buffer.shape))
    ]
    out_symbols = [
        symbolic.symbol(f"__out_s{i}") for i in range(len(out_buffer.shape))
    ]
    inp_subset = subsets.Indices(inp_symbols)
    out_subset = subsets.Indices(out_symbols)
    inp_offset = cpp.cpp_offset_expr(inp_buffer, inp_subset)
    out_offset = cpp.cpp_offset_expr(out_buffer, out_subset)
    # The offsets are only embedded in the generated code; they are no
    # longer echoed to stdout (leftover debug output).

    inp_repl = "".join(
        f"int {s} = __state->{node.redistr}_self_src[__idx * {len(inp_buffer.shape)} + {i}];\n"
        for i, s in enumerate(inp_symbols))
    out_repl = "".join(
        f"int {s} = __state->{node.redistr}_self_dst[__idx * {len(out_buffer.shape)} + {i}];\n"
        for i, s in enumerate(out_symbols))

    # Per dimension: copy size, input stride, output stride.
    copy_args = ", ".join([
        f"__state->{node.redistr}_self_size[__idx * {len(inp_buffer.shape)} + {i}], {istride}, {ostride}"
        for i, (istride, ostride
                ) in enumerate(zip(inp_buffer.strides, out_buffer.strides))
    ])

    code = f"""
        int myrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
        MPI_Request* req = new MPI_Request[__state->{node._redistr}_sends];
        MPI_Status* status = new MPI_Status[__state->{node._redistr}_sends];
        MPI_Status recv_status;
        if (__state->{array_a.pgrid}_valid) {{
            for (auto __idx = 0; __idx < __state->{node._redistr}_sends; ++__idx) {{
                // printf("({redistr.array_a} -> {redistr.array_b}) I am rank %d and I send to %d\\n", myrank, __state->{node._redistr}_dst_ranks[__idx]);
                // fflush(stdout);
                MPI_Isend(_inp_buffer, 1, __state->{node._redistr}_send_types[__idx], __state->{node._redistr}_dst_ranks[__idx], 0, MPI_COMM_WORLD, &req[__idx]);
            }}
        }}
        if (__state->{array_b.pgrid}_valid) {{
            for (auto __idx = 0; __idx < __state->{node._redistr}_self_copies; ++__idx) {{
                // printf("({redistr.array_a} -> {redistr.array_b}) I am rank %d and I self-copy\\n", myrank);
                // fflush(stdout);
                {inp_repl}
                {out_repl}
                dace::CopyNDDynamic<{inp_buffer.dtype.ctype}, 1, false, {len(inp_buffer.shape)}>::Dynamic::Copy(
                    _inp_buffer + {inp_offset}, _out_buffer + {out_offset}, {copy_args}
                );
            }}
            for (auto __idx = 0; __idx < __state->{node._redistr}_recvs; ++__idx) {{
                // printf("({redistr.array_a} -> {redistr.array_b}) I am rank %d and I receive from %d\\n", myrank, __state->{node._redistr}_src_ranks[__idx]);
                // fflush(stdout);
                MPI_Recv(_out_buffer, 1, __state->{node._redistr}_recv_types[__idx], __state->{node._redistr}_src_ranks[__idx], 0, MPI_COMM_WORLD, &recv_status);
            }}
        }}
        if (__state->{array_a.pgrid}_valid) {{
            MPI_Waitall(__state->{node._redistr}_sends, req, status);
            delete[] req;
            delete[] status;
        }}
        // printf("I am rank %d and I finished the redistribution {redistr.array_a} -> {redistr.array_b}\\n", myrank);
        // fflush(stdout);
        """

    tasklet = nodes.Tasklet(node.name,
                            node.in_connectors,
                            node.out_connectors,
                            code,
                            language=dtypes.Language.CPP)
    return tasklet
def apply(self, sdfg):
    """
    Apply the map-reduce fusion to the matched subgraph.

    The behavior depends on ``self.expr_index``:

    * ``0`` -- reduce without outer map,
    * ``1`` -- reduce with outer map,
    * ``2`` -- reduce node; this case rewires the edges directly and
      returns early.

    For the other two cases the intermediate array is replaced by a new
    transient ``'tmp'`` array placed between the tasklet and the reduction
    tasklet, and the matched reduction nodes are removed.

    :param sdfg: The SDFG; the state is ``sdfg.nodes()[self.state_id]``.
    :raises RuntimeError: If no edge into the map exit carries the
                          intermediate array, or if the reduction tasklet
                          does not have exactly one incoming edge.
    """
    def gnode(nname):
        # Resolve a pattern node to the matched node of the state.
        return graph.nodes()[self.subgraph[nname]]

    expr_index = self.expr_index
    graph = sdfg.nodes()[self.state_id]
    # NOTE(review): `tasklet` is not used below.
    tasklet = gnode(MapReduceFusion._tasklet)
    tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
    in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
    if expr_index == 0:  # Reduce without outer map
        # NOTE(review): `rmap_entry` is not used below.
        rmap_entry = graph.nodes()[self.subgraph[
            MapReduceFusion._rmap_in_entry]]
    elif expr_index == 1:  # Reduce with outer map
        rmap_out_entry = graph.nodes()[self.subgraph[
            MapReduceFusion._rmap_out_entry]]
        rmap_out_exit = graph.nodes()[self.subgraph[
            MapReduceFusion._rmap_out_exit]]
        rmap_in_entry = graph.nodes()[self.subgraph[
            MapReduceFusion._rmap_in_entry]]
        rmap_tasklet = graph.nodes()[self.subgraph[
            MapReduceFusion._rmap_in_tasklet]]

    if expr_index == 2:
        rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
    else:
        rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._rmap_in_cr]]
    out_array = gnode(MapReduceFusion._out_array)

    # Set nodes to remove according to the expression index
    nodes_to_remove = [in_array]
    if expr_index == 0:
        nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
    elif expr_index == 1:
        nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_entry))
        nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry))
        nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_exit))
    else:
        nodes_to_remove.append(gnode(MapReduceFusion._reduce))

    # If no other edges lead to mapexit, remove it. Otherwise, keep
    # it and remove reduction incoming/outgoing edges
    if expr_index != 2 and len(graph.in_edges(tmap_exit)) == 1:
        nodes_to_remove.append(tmap_exit)

    # Find the edge into the map exit that writes the intermediate array.
    memlet_edge = None
    for edge in graph.in_edges(tmap_exit):
        if edge.data.data == in_array.data:
            memlet_edge = edge
            break
    if memlet_edge is None:
        raise RuntimeError('Reduction memlet cannot be None')

    if expr_index == 0:  # Reduce without outer map
        # Index order does not matter, merge as-is
        pass
    elif expr_index == 1:  # Reduce with outer map
        tmap = tmap_exit.map
        perm_outer, perm_inner = MapReduceFusion.find_permutation(
            tmap, rmap_out_entry.map, rmap_in_entry.map, memlet_edge.data)

        # Split tasklet map into tmap_out -> tmap_in (according to
        # reduction)
        omap = nodes.Map(
            tmap.label + '_nonreduce',
            [p for i, p in enumerate(tmap.params) if i in perm_outer],
            [r for i, r in enumerate(tmap.range) if i in perm_outer],
            tmap.schedule, tmap.unroll, tmap.is_async)
        tmap.params = [
            p for i, p in enumerate(tmap.params) if i in perm_inner
        ]
        tmap.range = [
            r for i, r in enumerate(tmap.range) if i in perm_inner
        ]
        omap_entry = nodes.MapEntry(omap)
        # The outer reduction map exit is reused as the exit of the new map.
        omap_exit = rmap_out_exit
        rmap_out_exit.map = omap

        # Reconnect graph to new map
        tmap_entry = graph.entry_node(tmap_exit)
        tmap_in_edges = list(graph.in_edges(tmap_entry))
        # NOTE(review): the loop variable `e` is not used in this call, so
        # the same call is repeated once per edge -- confirm this is
        # intended.
        for e in tmap_in_edges:
            nxutil.change_edge_dest(graph, tmap_entry, omap_entry)
        for e in tmap_in_edges:
            graph.add_edge(omap_entry, e.src_conn, tmap_entry, e.dst_conn,
                           copy.copy(e.data))
    elif expr_index == 2:  # Reduce node
        # Find correspondence between map indices and array outputs
        tmap = tmap_exit.map
        perm = MapReduceFusion.find_permutation_reduce(
            tmap, rmap_cr, graph, memlet_edge.data)

        output_subset = [tmap.params[d] for d in perm]
        if len(output_subset) == 0:  # Output is a scalar
            output_subset = [0]

        array_edge = graph.out_edges(rmap_cr)[0]

        # Delete relevant edges and nodes
        graph.remove_edge(memlet_edge)
        graph.remove_nodes_from(nodes_to_remove)

        # Add new edges and nodes
        # From tasklet to map exit
        graph.add_edge(
            memlet_edge.src, memlet_edge.src_conn, memlet_edge.dst,
            memlet_edge.dst_conn,
            Memlet(out_array.data, memlet_edge.data.num_accesses,
                   subsets.Indices(output_subset), memlet_edge.data.veclen,
                   rmap_cr.wcr, rmap_cr.identity))

        # From map exit to output array
        # The first three characters of the connector name are replaced by
        # 'OUT_' (presumably an 'IN_' prefix -- confirm).
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:],
            array_edge.dst, array_edge.dst_conn,
            Memlet(array_edge.data.data, array_edge.data.num_accesses,
                   array_edge.data.subset, array_edge.data.veclen,
                   rmap_cr.wcr, rmap_cr.identity))

        return

    # Remove tmp array node prior to the others, so that a new one
    # can be created in its stead (see below)
    graph.remove_node(nodes_to_remove[0])
    nodes_to_remove = nodes_to_remove[1:]

    # Create tasklet -> tmp -> tasklet connection
    tmp = graph.add_array(
        'tmp',
        memlet_edge.data.subset.bounding_box_size(),
        sdfg.arrays[memlet_edge.data.data].dtype,
        transient=True)
    tasklet_tmp_memlet = copy.deepcopy(memlet_edge.data)
    tasklet_tmp_memlet.data = tmp.data
    tasklet_tmp_memlet.subset = ShapeProperty.to_string(tmp.shape)

    # Modify memlet to point to output array
    memlet_edge.data.data = out_array.data

    # Recover reduction axes from CR reduce subset
    # NOTE(review): `rmap_tasklet` is only assigned above when
    # expr_index == 1; with expr_index == 0 this line is reached with the
    # name unbound -- confirm whether that path is ever taken.
    reduce_cr_subset = graph.in_edges(rmap_tasklet)[0].data.subset
    reduce_axes = []
    for ind, crvar in enumerate(reduce_cr_subset.indices):
        # An index whose string form contains '__i' marks a reduction axis.
        if '__i' in str(crvar):
            reduce_axes.append(ind)

    # Modify memlet access index by filtering out reduction axes
    if True:  # expr_index == 0:
        newindices = []
        for ind, ovar in enumerate(memlet_edge.data.subset.indices):
            if ind not in reduce_axes:
                newindices.append(ovar)
        if len(newindices) == 0:
            newindices = [0]

        memlet_edge.data.subset = subsets.Indices(newindices)

    # Route the tasklet output through the new transient array.
    graph.remove_edge(memlet_edge)
    graph.add_edge(memlet_edge.src, memlet_edge.src_conn, tmp,
                   memlet_edge.dst_conn, tasklet_tmp_memlet)

    red_edges = list(graph.in_edges(rmap_tasklet))
    if len(red_edges) != 1:
        raise RuntimeError('CR edge must be unique')

    tmp_tasklet_memlet = copy.deepcopy(tasklet_tmp_memlet)
    graph.add_edge(tmp, None, rmap_tasklet, red_edges[0].dst_conn,
                   tmp_tasklet_memlet)

    for e in graph.edges_between(rmap_tasklet, rmap_cr):
        e.data.subset = memlet_edge.data.subset

    # Move output edges to point directly to CR node
    if expr_index == 1:
        # Set output memlet between CR node and outer reduction map to
        # contain the same subset as the one pointing to the CR node
        for e in graph.out_edges(rmap_cr):
            e.data.subset = memlet_edge.data.subset

        rmap_out = gnode(MapReduceFusion._rmap_out_exit)
        nxutil.change_edge_src(graph, rmap_out, omap_exit)

    # Remove nodes
    graph.remove_nodes_from(nodes_to_remove)

    # For unrelated outputs, connect original output to rmap_out
    if expr_index == 1 and tmap_exit not in nodes_to_remove:
        other_out_edges = list(graph.out_edges(tmap_exit))
        for e in other_out_edges:
            graph.remove_edge(e)
            graph.add_edge(e.src, e.src_conn, omap_exit, None, e.data)
            graph.add_edge(omap_exit, None, e.dst, e.dst_conn,
                           copy.copy(e.data))
def apply(self, sdfg):
    """ The method creates two nested maps. The inner map ranges over the
        reduction axes, while the outer map ranges over the rest of the
        input dimensions. The inner map contains a trivial tasklet, while
        the outgoing edges copy the reduction WCR.

        The outer map is only created when at least one input dimension is
        not a reduction axis. The reduce node and its first incoming and
        outgoing edges are removed at the end.

        :param sdfg: The SDFG; the state is ``sdfg.nodes()[self.state_id]``.
        :raises NotImplementedError: If the reduce node has more than one
                                     distinct source or destination node.
    """
    graph = sdfg.nodes()[self.state_id]
    red_node = graph.nodes()[self.subgraph[ReduceExpansion._reduce]]

    # Collect the distinct source nodes and one memlet per source.
    inputs = []
    in_memlets = []
    for src, _, _, _, memlet in graph.in_edges(red_node):
        if src not in inputs:
            inputs.append(src)
            in_memlets.append(memlet)
    if len(inputs) > 1:
        raise NotImplementedError

    # Collect the distinct destination nodes and one memlet per destination.
    outputs = []
    out_memlets = []
    for _, _, dst, _, memlet in graph.out_edges(red_node):
        if dst not in outputs:
            outputs.append(dst)
            out_memlets.append(memlet)
    if len(outputs) > 1:
        raise NotImplementedError

    # Without explicit axes, every dimension of the input subset is reduced.
    axes = red_node.axes
    if axes is None:
        axes = tuple(i for i in range(in_memlets[0].subset.dims()))

    # Split the input dimensions: reduction axes go to the inner map, the
    # rest to the outer map. Map parameters are named "__dim_<idx>".
    outer_map_range = {}
    inner_map_range = {}
    for idx, r in enumerate(in_memlets[0].subset):
        if idx in axes:
            inner_map_range.update({
                "__dim_{}".format(str(idx)):
                subsets.Range.dim_to_string(r)
            })
        else:
            outer_map_range.update({
                "__dim_{}".format(str(idx)):
                subsets.Range.dim_to_string(r)
            })

    if len(outer_map_range) > 0:
        outer_map_entry, outer_map_exit = graph.add_map(
            'reduce_outer', outer_map_range, schedule=red_node.schedule)

    # The reduce node's schedule goes to the outermost map that exists; an
    # inner map nested in an outer one gets the Default schedule.
    inner_map_entry, inner_map_exit = graph.add_map(
        'reduce_inner',
        inner_map_range,
        schedule=(dtypes.ScheduleType.Default
                  if len(outer_map_range) > 0 else red_node.schedule))

    tasklet = graph.add_tasklet(name='red_tasklet',
                                inputs={'in_1'},
                                outputs={'out_1'},
                                code='out_1 = in_1')

    inner_map_entry.in_connectors = {'IN_1'}
    inner_map_entry.out_connectors = {'OUT_1'}

    # Edge from the input node into the outermost map entry.
    outer_in_memlet = dcpy(in_memlets[0])

    if len(outer_map_range) > 0:
        outer_map_entry.in_connectors = {'IN_1'}
        outer_map_entry.out_connectors = {'OUT_1'}
        graph.add_edge(inputs[0], None, outer_map_entry, 'IN_1',
                       outer_in_memlet)
    else:
        graph.add_edge(inputs[0], None, inner_map_entry, 'IN_1',
                       outer_in_memlet)

    # Memlet between the outer and inner map entries: reduction axes keep
    # their range, the other dimensions are pinned to the outer parameter.
    med_in_memlet = dcpy(in_memlets[0])
    med_in_range = []
    for idx, r in enumerate(med_in_memlet.subset):
        if idx in axes:
            med_in_range.append(r)
        else:
            med_in_range.append(("__dim_{}".format(str(idx)),
                                 "__dim_{}".format(str(idx)), 1))
    med_in_memlet.subset = subsets.Range(med_in_range)
    med_in_memlet.num_accesses = med_in_memlet.subset.num_elements()

    if len(outer_map_range) > 0:
        graph.add_edge(outer_map_entry, 'OUT_1', inner_map_entry, 'IN_1',
                       med_in_memlet)

    # Memlet into the tasklet: one index per dimension, each a map parameter.
    inner_in_memlet = dcpy(med_in_memlet)
    inner_in_idx = []
    for idx in range(len(inner_in_memlet.subset)):
        inner_in_idx.append("__dim_{}".format(str(idx)))
    inner_in_memlet.subset = subsets.Indices(inner_in_idx)
    inner_in_memlet.num_accesses = inner_in_memlet.subset.num_elements()
    graph.add_edge(inner_map_entry, 'OUT_1', tasklet, 'in_1', inner_in_memlet)

    inner_map_exit.in_connectors = {'IN_1'}
    inner_map_exit.out_connectors = {'OUT_1'}

    # Memlet out of the tasklet: the non-reduced indices of the tasklet
    # input, or index 0 if every dimension is reduced. It carries the WCR.
    inner_out_memlet = dcpy(out_memlets[0])
    inner_out_idx = []
    for idx, r in enumerate(inner_in_memlet.subset):
        if idx not in axes:
            inner_out_idx.append(r)
    if len(inner_out_idx) == 0:
        inner_out_idx = [0]
    inner_out_memlet.subset = subsets.Indices(inner_out_idx)
    inner_out_memlet.wcr = red_node.wcr
    inner_out_memlet.num_accesses = inner_out_memlet.subset.num_elements()
    graph.add_edge(tasklet, 'out_1', inner_map_exit, 'IN_1', inner_out_memlet)

    # Memlet to the output node, filtered by the same axis positions.
    # NOTE(review): the positions in `axes` index the input dimensions but
    # are applied here to the output memlet's subset -- confirm the two
    # line up for the supported cases.
    outer_out_memlet = dcpy(out_memlets[0])
    outer_out_range = []
    for idx, r in enumerate(outer_out_memlet.subset):
        if idx not in axes:
            outer_out_range.append(r)
    if len(outer_out_range) == 0:
        outer_out_range = [(0, 0, 1)]
    outer_out_memlet.subset = subsets.Range(outer_out_range)
    outer_out_memlet.wcr = red_node.wcr

    if len(outer_map_range) > 0:
        outer_map_exit.in_connectors = {'IN_1'}
        outer_map_exit.out_connectors = {'OUT_1'}
        med_out_memlet = dcpy(inner_out_memlet)
        med_out_memlet.num_accesses = med_out_memlet.subset.num_elements()
        graph.add_edge(inner_map_exit, 'OUT_1', outer_map_exit, 'IN_1',
                       med_out_memlet)
        graph.add_edge(outer_map_exit, 'OUT_1', outputs[0], None,
                       outer_out_memlet)
    else:
        graph.add_edge(inner_map_exit, 'OUT_1', outputs[0], None,
                       outer_out_memlet)

    # Drop the original reduce node together with its first in/out edge.
    graph.remove_edge(graph.in_edges(red_node)[0])
    graph.remove_edge(graph.out_edges(red_node)[0])
    graph.remove_node(red_node)
def apply(self, sdfg):
    """ Schedules the matched map (or reduce node) on the GPU and routes its
        non-GPU inputs and outputs through transient GPU-global clones.

        The method works in place on the state ``sdfg.nodes()[self.state_id]``
        and proceeds in the steps marked below:

        1. Set the schedule of the matched node to ``GPU_Device`` (and, if
           ``self.nested_seq`` is set for a map match, demote Default
           storage/schedules inside the scope to Register/Sequential).
        2. Collect the input/output access nodes whose storage is not one of
           the GPU storage types.
        3. Register one ``gpu_<name>[_<suffix>]`` transient per collected
           array and create an access node for it.
        4. Splice each clone node between the original access node and the
           matched node, rebasing memlet subsets onto the clone.
        5. For a map match, rename remaining memlets inside the scope to the
           clone names.

        :param sdfg: The SDFG that contains the matched state.
        :return: None; the SDFG and state are modified in place.
    """
    graph = sdfg.nodes()[self.state_id]
    # expr_index 0 is the map-entry pattern; any other index is treated as
    # the reduce-node pattern.
    if self.expr_index == 0:
        cnode = graph.nodes()[self.subgraph[
            GPUTransformLocalStorage._map_entry]]
        node_schedprop = cnode.map
        exit_nodes = graph.exit_nodes(cnode)
    else:
        cnode = graph.nodes()[self.subgraph[
            GPUTransformLocalStorage._reduce]]
        node_schedprop = cnode
        exit_nodes = [cnode]

    # Change schedule
    node_schedprop._schedule = dtypes.ScheduleType.GPU_Device
    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._maps_transformed += 1
    # If nested graph is designated as sequential, transform schedules and
    # storage from Default to Sequential/Register
    if self.nested_seq and self.expr_index == 0:
        for node in graph.scope_subgraph(cnode).nodes():
            if isinstance(node, nodes.AccessNode):
                arr = node.desc(sdfg)
                if arr.storage == dtypes.StorageType.Default:
                    arr.storage = dtypes.StorageType.Register
            elif isinstance(node, nodes.MapEntry):
                if node.map.schedule == dtypes.ScheduleType.Default:
                    node.map.schedule = dtypes.ScheduleType.Sequential

    # Arrays with one of these storage types are left untouched below.
    gpu_storage_types = [
        dtypes.StorageType.GPU_Global,
        dtypes.StorageType.GPU_Shared,
        dtypes.StorageType.GPU_Stack,
    ]

    #######################################################
    # Add GPU copies of CPU arrays (i.e., not already on GPU)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(list(graph.out_edges(enode)))
    # Sets of (access node, memlet) pairs for data not yet in GPU storage.
    in_arrays_to_clone = set()
    out_arrays_to_clone = set()
    for e in graph.in_edges(cnode):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            in_arrays_to_clone.add((data_node, e.data))
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in gpu_storage_types:
            out_arrays_to_clone.add((data_node, e.data))
    if Config.get_bool("debugprint"):
        GPUTransformLocalStorage._arrays_removed += len(
            in_arrays_to_clone) + len(out_arrays_to_clone)

    # Second, create a GPU clone of each array
    # TODO: Overapproximate union of memlets
    cloned_arrays = {}  # original data name -> clone data name
    in_cloned_arraynodes = {}  # original data name -> clone access node
    out_cloned_arraynodes = {}  # original data name -> clone access node
    for array_node, memlet in in_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        # For every dimension whose bounding-box size is exactly 1, append
        # the (sanitized) start expression of that dimension to the name.
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    # Keep identifier characters; spell out + - * / as
                    # p m t d; drop every other character.
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            # NOTE(review): bare except; presumably meant to skip sizes
            # that cannot be converted with int() - confirm and narrow.
            except:
                continue
        # NOTE(review): ``cloned_array`` is assigned in the first two
        # branches but never read afterwards in this method.
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            # Shape of the clone: integer where the overapproximated size
            # converts with int(), the overapproximated expression otherwise.
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                # NOTE(review): bare except; see note above.
                except:
                    full_shape.append(size)
            # Dimensions whose size is not the integer 1 are kept.
            actual_dims = [
                idx for idx, r in enumerate(full_shape)
                if not (isinstance(r, int) and r == 1)
            ]
            # If every dimension has size 1, keep the last one so the
            # clone is not zero-dimensional.
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(
                    name=cloned_name,
                    dtype=array.dtype,
                    shape=[full_shape[d] for d in actual_dims],
                    veclen=array.veclen,
                    buffer_size=array.buffer_size,
                    storage=dtypes.StorageType.GPU_Global,
                    transient=True,
                    offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        # Create a node of the same class as the original access node.
        cloned_node = type(array_node)(cloned_name)

        in_cloned_arraynodes[array_node.data] = cloned_node
    # Same procedure for outputs; the only differences are the ``setzero``
    # flag on the clone node and the dictionary the node is stored in.
    for array_node, memlet in out_arrays_to_clone:
        array = array_node.desc(sdfg)
        cloned_name = "gpu_" + array_node.data
        for i, r in enumerate(memlet.bounding_box_size()):
            size = symbolic.overapproximate(r)
            try:
                if int(size) == 1:
                    suffix = []
                    for c in str(memlet.subset[i][0]):
                        if c.isalpha() or c.isdigit() or c == "_":
                            suffix.append(c)
                        elif c == "+":
                            suffix.append("p")
                        elif c == "-":
                            suffix.append("m")
                        elif c == "*":
                            suffix.append("t")
                        elif c == "/":
                            suffix.append("d")
                    cloned_name += "_" + "".join(suffix)
            # NOTE(review): bare except; see the input loop above.
            except:
                continue
        if cloned_name in sdfg.arrays.keys():
            cloned_array = sdfg.arrays[cloned_name]
        elif array_node.data in cloned_arrays:
            cloned_array = cloned_arrays[array_node.data]
        else:
            full_shape = []
            for r in memlet.bounding_box_size():
                size = symbolic.overapproximate(r)
                try:
                    full_shape.append(int(size))
                # NOTE(review): bare except; see the input loop above.
                except:
                    full_shape.append(size)
            actual_dims = [
                idx for idx, r in enumerate(full_shape)
                if not (isinstance(r, int) and r == 1)
            ]
            if len(actual_dims) == 0:  # abort
                actual_dims = [len(full_shape) - 1]
            if isinstance(array, data.Scalar):
                sdfg.add_array(name=cloned_name,
                               shape=[1],
                               dtype=array.dtype,
                               transient=True,
                               storage=dtypes.StorageType.GPU_Global)
            elif isinstance(array, data.Stream):
                sdfg.add_stream(
                    name=cloned_name,
                    dtype=array.dtype,
                    shape=[full_shape[d] for d in actual_dims],
                    veclen=array.veclen,
                    buffer_size=array.buffer_size,
                    storage=dtypes.StorageType.GPU_Global,
                    transient=True,
                    offset=[array.offset[d] for d in actual_dims])
            else:
                sdfg.add_array(
                    name=cloned_name,
                    shape=[full_shape[d] for d in actual_dims],
                    dtype=array.dtype,
                    materialize_func=array.materialize_func,
                    transient=True,
                    storage=dtypes.StorageType.GPU_Global,
                    allow_conflicts=array.allow_conflicts,
                    strides=[array.strides[d] for d in actual_dims],
                    offset=[array.offset[d] for d in actual_dims],
                )
        cloned_arrays[array_node.data] = cloned_name
        cloned_node = type(array_node)(cloned_name)
        cloned_node.setzero = True

        out_cloned_arraynodes[array_node.data] = cloned_node

    # Third, connect the cloned arrays to the originals
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in graph.in_edges(cnode):
            if edge.data.data == array_name:
                # ``newmemlet`` will connect the clone node to the matched
                # node; the original edge is re-targeted to the clone below.
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    # offset[d]: start of the original subset in dim d.
                    # lost_dims: dimensions that collapse to one element
                    # and are dropped from the clone's subset.
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            # ``r`` is ``edge.data.subset[ind]``, so
                            # ``begin`` is the start minus itself.
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            # NOTE(review): newsubset[ind] is still None
                            # here, so this statement would raise
                            # TypeError if the branch is ever reached.
                            newsubset[ind] -= r[0]
                    # If all dimensions were lost, keep the last one so
                    # the subset is not empty.
                    if len(lost_dims) == len(edge.data.subset):
                        lost_dims.pop()
                        newmemlet.subset = type(
                            edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])

                graph.add_edge(node, None, edge.dst, edge.dst_conn,
                               newmemlet)

                # Walk forward from the matched node and rebase every
                # memlet that still refers to the original array.
                for e in graph.bfs_edges(edge.dst, reverse=False):
                    parent, _, _child, _, memlet = e
                    # Stop once the traversal leaves the scope
                    # (``in_scope`` is defined elsewhere in the file).
                    if parent != edge.dst and not in_scope(
                            graph, parent, edge.dst):
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[-1].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.ExitNode, forward=True):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        # Shift by the recorded offsets and drop the
                        # dimensions that were lost in the clone.
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])
                    memlet.data = node.data

                # Re-target the original edge: original node -> clone.
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(
                        node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(edge.src, edge.src_conn, node, None,
                               edge.data)
                graph.remove_edge(edge)

    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
        for edge in all_out_edges:
            if edge.data.data == array_name:
                # ``newmemlet`` will connect the exit node to the clone;
                # the original edge becomes clone -> original node below.
                newmemlet = copy.deepcopy(edge.data)
                newmemlet.data = node.data

                if is_scalar:
                    newmemlet.subset = sbs.Indices([0])
                else:
                    offset = []
                    lost_dims = []
                    lost_ranges = []
                    newsubset = [None] * len(edge.data.subset)
                    for ind, r in enumerate(edge.data.subset):
                        offset.append(r[0])
                        if isinstance(edge.data.subset[ind], tuple):
                            begin = edge.data.subset[ind][0] - r[0]
                            end = edge.data.subset[ind][1] - r[0]
                            step = edge.data.subset[ind][2]
                            if begin == end:
                                lost_dims.append(ind)
                                lost_ranges.append((begin, end, step))
                            else:
                                newsubset[ind] = (begin, end, step)
                        else:
                            # NOTE(review): newsubset[ind] is still None
                            # here; see the matching note for inputs.
                            newsubset[ind] -= r[0]
                    if len(lost_dims) == len(edge.data.subset):
                        lost_dims.pop()
                        newmemlet.subset = type(
                            edge.data.subset)([lost_ranges[-1]])
                    else:
                        newmemlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])

                graph.add_edge(edge.src, edge.src_conn, node, None,
                               newmemlet)

                # Walk backward from the exit node until the node that
                # ``scope_dict`` associates with it is reached.
                end_node = graph.scope_dict()[edge.src]
                for e in graph.bfs_edges(edge.src, reverse=True):
                    parent, _, _child, _, memlet = e
                    if parent == end_node:
                        break
                    if memlet.data != edge.data.data:
                        continue
                    path = graph.memlet_path(e)
                    if not isinstance(path[0].dst, nodes.CodeNode):
                        if in_path(path, e, nodes.EntryNode,
                                   forward=False):
                            if isinstance(parent, nodes.CodeNode):
                                # Output edge
                                break
                            else:
                                continue
                    if is_scalar:
                        memlet.subset = sbs.Indices([0])
                    else:
                        newsubset = [None] * len(memlet.subset)
                        for ind, r in enumerate(memlet.subset):
                            if ind in lost_dims:
                                continue
                            if isinstance(memlet.subset[ind], tuple):
                                begin = r[0] - offset[ind]
                                end = r[1] - offset[ind]
                                step = r[2]
                                newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] = (
                                    r - offset[ind],
                                    r - offset[ind],
                                    1,
                                )
                        memlet.subset = type(edge.data.subset)(
                            [r for r in newsubset if r is not None])
                    memlet.data = node.data

                # The clone -> original edge carries no write-conflict
                # resolution; the deep-copied ``newmemlet`` keeps its wcr.
                edge.data.wcr = None
                if self.fullcopy:
                    edge.data.subset = sbs.Range.from_array(
                        node.desc(sdfg))
                edge.data.other_subset = newmemlet.subset
                graph.add_edge(node, None, edge.dst, edge.dst_conn,
                               edge.data)
                graph.remove_edge(edge)

    # Fourth, replace memlet arrays as necessary
    if self.expr_index == 0:
        scope_subgraph = graph.scope_subgraph(cnode)
        for edge in scope_subgraph.edges():
            if edge.data.data is not None and edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]
def _element_counts_differ(src_subset, dst_subset):
    """ Returns a truthy value if the two subsets provably cover a different
        number of elements (both the approximate and the exact counts
        disagree). """
    src_expr = src_subset.num_elements()
    src_expr_exact = src_subset.num_elements_exact()
    dst_expr = dst_subset.num_elements()
    dst_expr_exact = dst_subset.num_elements_exact()
    return (src_expr != dst_expr
            and symbolic.inequal_symbols(src_expr_exact, dst_expr_exact))


def _fit_subset_to_rank(subset, ndim):
    """ Pads (with leading zero indices) or trims (leading dimensions)
        ``subset`` so that it has ``ndim`` dimensions.

        If the dimensionality had to change, the result is also rebased to
        start at 0 in every dimension. A subset that already has ``ndim``
        dimensions is returned untouched.

        :param subset: An ``Indices`` or ``Range`` object the caller owns
                       (it may be modified in place).
        :param ndim: Target number of dimensions.
        :return: The adjusted subset.
    """
    padding = ndim - len(subset)
    if padding == 0:
        return subset
    if padding > 0:
        # Fewer dimensions than the array: prepend single-element dimensions
        if isinstance(subset, subsets.Indices):
            subset = subsets.Indices([0] * padding + subset.indices)
        elif isinstance(subset, subsets.Range):
            subset = subsets.Range([(0, 0, 1)] * padding + subset.ranges)
    else:
        # More dimensions than the array: keep only the trailing ones
        if isinstance(subset, subsets.Indices):
            subset = subsets.Indices(subset.indices[-padding:])
        elif isinstance(subset, subsets.Range):
            subset = subsets.Range(subset.ranges[-padding:])
    # NOTE(review): rebasing only happens when the rank changed, as read
    # from the original statement nesting - confirm this is intended.
    subset.offset(subset, True)
    return subset


def _validate_subsets(
    edge: graph.MultiConnectorEdge,
    arrays: typing.Dict[str, data.Data],
    src_name: typing.Optional[str] = None,
    dst_name: typing.Optional[str] = None
) -> typing.Tuple[typing.Optional[subsets.Subset],
                  typing.Optional[subsets.Subset]]:
    """ Extracts and validates src and dst subsets from the edge.

        :param edge: Edge whose memlet provides ``src_subset``/``dst_subset``.
        :param arrays: Mapping from data name to data descriptor.
        :param src_name: Name of the source data; if not given, taken from
                         ``edge.src`` when that is an access node.
        :param dst_name: Name of the destination data; if not given, taken
                         from ``edge.dst`` when that is an access node.
        :return: A ``(src_subset, dst_subset)`` pair of deep copies. A
                 missing subset is inferred when the corresponding data name
                 is known; otherwise it is returned as-is (i.e., empty).
        :raise NotImplementedError: If neither data name or neither subset
                                    can be determined.
        :raise ValueError: If an inferred whole-array subset does not match
                           the element count of the other subset.
    """

    # Find src and dst names
    if not src_name and isinstance(edge.src, nodes.AccessNode):
        src_name = edge.src.data
    if not dst_name and isinstance(edge.dst, nodes.AccessNode):
        dst_name = edge.dst.data
    if not src_name and not dst_name:
        raise NotImplementedError

    # Find the src and dst subsets (deep-copy to allow manipulation)
    src_subset = copy.deepcopy(edge.data.src_subset)
    dst_subset = copy.deepcopy(edge.data.dst_subset)

    if not src_subset and not dst_subset:
        # NOTE: This should never happen
        raise NotImplementedError
    # NOTE: If any of the subsets is None, it means that we proceed in
    # experimental mode. The base case here is that we just copy the other
    # subset. However, if we can locate the other array, we check the
    # dimensionality of the subset and we pop or pad indices/ranges
    # accordingly. In that case, we also set the subset to start from 0 in
    # each dimension.
    if not src_subset:
        if src_name:
            desc = arrays[src_name]
            if isinstance(desc, data.View) or edge.data.data == dst_name:
                # The memlet describes the destination (or the source is a
                # view): assume the whole source array is accessed.
                src_subset = subsets.Range.from_array(desc)
                if _element_counts_differ(src_subset, dst_subset):
                    raise ValueError(
                        "Source subset is missing (dst_subset: {}, "
                        "src_shape: {})".format(dst_subset, desc.shape))
            else:
                src_subset = _fit_subset_to_rank(copy.deepcopy(dst_subset),
                                                 len(desc.shape))
    elif not dst_subset:
        if dst_name:
            desc = arrays[dst_name]
            if isinstance(desc, data.View) or edge.data.data == src_name:
                # Mirror of the case above for a missing destination subset.
                dst_subset = subsets.Range.from_array(desc)
                if _element_counts_differ(src_subset, dst_subset):
                    raise ValueError(
                        "Destination subset is missing (src_subset: {}, "
                        "dst_shape: {})".format(src_subset, desc.shape))
            else:
                dst_subset = _fit_subset_to_rank(copy.deepcopy(src_subset),
                                                 len(desc.shape))

    return src_subset, dst_subset
def apply(self, sdfg):
    """ Inserts a transient access node on the edge between ``node_a`` and
        ``node_b`` and redirects the memlet tree of that edge to it.

        The edge is chosen by ``self.array``; when that is unset, the first
        non-WCR memlet between the two nodes is used. If no edge carries the
        requested array, the first edge between the nodes is used instead
        and a warning is issued.

        :param sdfg: The SDFG that contains the matched state.
        :return: The newly created access node of the transient.
        :raise NameError: If there is no edge between the two nodes.
    """
    state = sdfg.nodes()[self.state_id]
    node_a = self.node_a(sdfg)
    node_b = self.node_b(sdfg)

    # The memlet tree to rewrite lies on the side of the inner scope
    scopes = state.scope_dict()
    propagate_forward = sd.scope_contains_scope(scopes, node_a, node_b)

    # Default to the first plain (non-WCR, non-empty) memlet's data
    array = self.array
    if array is None or len(array) == 0:
        plain_names = (e.data.data
                       for e in state.edges_between(node_a, node_b)
                       if e.data.data is not None and e.data.wcr is None)
        array = next(plain_names)

    # Locate the edge that carries the requested array
    original_edge = next(
        (e for e in state.edges_between(node_a, node_b)
         if array == e.data.data), None)
    if original_edge is None:
        # Fall back to whichever edge comes first, if any
        original_edge = next(iter(state.edges_between(node_a, node_b)),
                             None)
        if original_edge is not None:
            warnings.warn('Array %s not found! Using array %s instead.' %
                          (array, original_edge.data.data))
            array = original_edge.data.data
    if original_edge is None:
        raise NameError('Array %s not found!' % array)
    invariant_memlet = original_edge.data

    # Add transient array, sized by the memlet's bounding box
    transient_name = 'trans_' + invariant_memlet.data
    transient_shape = [
        symbolic.overapproximate(extent)
        for extent in invariant_memlet.bounding_box_size()
    ]
    new_data, _ = sdfg.add_array(transient_name,
                                 transient_shape,
                                 sdfg.arrays[invariant_memlet.data].dtype,
                                 transient=True,
                                 find_new_name=True)
    data_node = nodes.AccessNode(new_data)

    # Store as fields so that other transformations can use them
    self._local_name = new_data
    self._data_node = data_node

    to_data_mm = copy.deepcopy(invariant_memlet)
    from_data_mm = copy.deepcopy(invariant_memlet)
    offset = subsets.Indices([rng[0] for rng in invariant_memlet.subset])

    # Reconnect, assuming one edge to the access node. Both halves are
    # always added in the same order; only the rewritten half differs.
    state.remove_edge(original_edge)
    inbound = state.add_edge(node_a, original_edge.src_conn, data_node,
                             None, to_data_mm)
    outbound = state.add_edge(data_node, None, node_b,
                              original_edge.dst_conn, from_data_mm)
    new_edge = outbound if propagate_forward else inbound

    # Offset all edges in the memlet tree (including the new edge)
    for tree_edge in state.memlet_tree(new_edge):
        tree_edge.data.subset.offset(offset, True)
        tree_edge.data.data = new_data

    return data_node
def copy_expr(
    dispatcher,
    sdfg,
    dataname,
    memlet,
    is_write=None,  # None means "not specified"; see ArrayInterface below
    offset=None,
    relative_offset=True,
    packed_types=False,
):
    """ Builds the C++ expression used to address ``dataname`` in a copy.

        :param dispatcher: Object whose ``defined_vars.get(dataname)`` yields
                           the defined type of the data.
        :param sdfg: SDFG whose ``arrays`` contains ``dataname``.
        :param dataname: Name of the data container.
        :param memlet: Memlet whose subset is used when ``relative_offset``
                       is set.
        :param is_write: Whether the access is a write; must be given for
                         non-view data defined as an array interface.
        :param offset: With ``relative_offset``, passed along as the offset
                       of the memlet subset; otherwise used as the subset
                       itself (``None`` means no offset).
        :param relative_offset: Selects how ``offset`` is interpreted.
        :param packed_types: Not used by this function.
        :return: The C++ expression as a string.
        :raise ValueError: If ``is_write`` is needed but was not given.
        :raise TypeError: If a scalar or stream would need a nonzero offset.
        :raise NotImplementedError: For any other defined type.
    """
    datadesc = sdfg.arrays[dataname]

    # Decide which subset (and relative offset) feeds the offset expression
    if relative_offset:
        subset, rel_offset = memlet.subset, offset
    elif offset is None:
        subset, rel_offset = None, None
    elif isinstance(offset, subsets.Subset):
        subset, rel_offset = offset, None
    else:
        subset, rel_offset = subsets.Indices(offset), None

    if subset is None:
        offset_cppstr = "0"
    else:
        offset_cppstr = cpp_offset_expr(datadesc, subset, rel_offset)
    add_offset = offset_cppstr != "0"

    type_prefix = ""  # always empty; kept as a formatting placeholder
    expr = ptr(dataname, datadesc)
    def_type, _ = dispatcher.defined_vars.get(dataname)

    if def_type in (DefinedType.Pointer, DefinedType.ArrayInterface):
        # If this is a view, it has already been renamed
        needs_interface_name = (def_type == DefinedType.ArrayInterface
                                and not isinstance(datadesc, data.View))
        if needs_interface_name:
            if is_write is None:
                raise ValueError("is_write must be set for ArrayInterface.")
            expr = array_interface_variable(expr, is_write, dispatcher)
        suffix = ""
        if add_offset:
            suffix = " + {}".format(offset_cppstr)
        return "{}{}{}".format(type_prefix, expr, suffix)

    if def_type == DefinedType.StreamArray:
        return "{}[{}]".format(expr, offset_cppstr)

    if def_type == DefinedType.FPGA_ShiftRegister:
        return expr

    if def_type in (DefinedType.Scalar, DefinedType.Stream):
        if add_offset:
            raise TypeError("Tried to offset address of scalar {}: {}".format(
                dataname, offset_cppstr))
        if def_type != DefinedType.Scalar:
            # Streams are referred to by their plain name
            return dataname
        return "{}&{}".format(type_prefix, expr)

    raise NotImplementedError("copy_expr not implemented "
                              "for connector type: {}".format(def_type))