def index_to_ijk(self, index: List[int]):
    """
    Creates a string of the access (for variable name generation).
    :param index: access
    :return: created string
    """
    # current implementation only supports 3 dimensions (default)
    if len(index) == 3:
        """
        # v1:
        return "[i{},j{},k{}]".format(
            "" if index[0] == 0 else "+{}".format(index[0]),
            "" if index[1] == 0 else "+{}".format(index[1]),
            "" if index[2] == 0 else "+{}".format(index[2]))
        # v2:
        return "_{}_{}_{}".format(index[0], index[1], index[2])
        """
        # compute absolute index
        ind = stencilflow.convert_3d_to_1d(dimensions=self.dimensions,
                                           index=index)
        # return formatted string
        return "_{}".format(ind) if ind >= 0 else "_n{}".format(abs(ind))
    else:
        raise NotImplementedError(
            "Method index_to_ijk has not been implemented for |indices|!=3, here: |indices|={}"
            .format(len(index)))
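# Illustrative example (a sketch, assuming convert_3d_to_1d performs a row-major flattening with
# the last dimension varying fastest): with self.dimensions = [4, 8, 8], the access [0, 1, -1]
# flattens to 1*8 + (-1) = 7, so index_to_ijk([0, 1, -1]) returns "_7", while [0, -1, 0] flattens
# to -8 and returns "_n8" (the 'n' prefix keeps the generated variable name free of a minus sign).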
def set_up_dist_to_center(self):
    """
    Computes for all fields/channels the distance from the furthest field access to the center of
    the stencil ([0,0,0]).
    """
    for item in self.graph.accesses:
        furthest = max(self.graph.accesses[item])
        self.dist_to_center[item] = stencilflow.convert_3d_to_1d(
            dimensions=self.dimensions, index=furthest)
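# Illustrative example (a sketch, same flattening assumption as above): if field "a" has the
# accesses [[0, 1, 0], [0, 0, -1]] and self.dimensions = [4, 8, 8], the furthest access is
# [0, 1, 0] and dist_to_center["a"] = 1*8 + 0 = 8.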
def setup_internal_buffers(self) -> None:
    """
    Create and split the internal buffers according to the pipeline model (see paper example ref# TODO).
    :return: None
    """
    # remove duplicate accesses
    for item in self.graph.accesses:
        self.graph.accesses[item] = self.remove_duplicate_accesses(
            self.graph.accesses[item])
    # slice the internal buffer into chunks of accesses
    for buf_name in self.graph.buffer_size:
        # create an empty list and sort the accesses according to their relative position
        self.internal_buffer[buf_name]: List[BoundedQueue] = list()
        self.graph.accesses[buf_name].sort(reverse=True)
        # split according to the cases
        if len(self.graph.accesses[buf_name]) == 0:  # empty list
            pass
        elif len(self.graph.accesses[buf_name]) == 1:  # single-entry list
            # add an internal buffer of size one for fields that only have a single access
            self.internal_buffer[buf_name].append(
                BoundedQueue(name=buf_name, maxsize=1, collection=[None]))
        else:  # multi-entry list
            # iterate through all accesses and split them into buffers of the correct sizes
            itr = iter(self.graph.accesses[buf_name])
            pre = next(itr)
            for curr in itr:
                # calculate the size of the buffer between two consecutive accesses
                diff = abs(
                    stencilflow.convert_3d_to_1d(
                        index=stencilflow.list_subtract_cwise(pre, curr),
                        dimensions=self.dimensions))
                if diff == 0:  # two accesses to the same element
                    pass
                else:
                    self.internal_buffer[buf_name].append(
                        BoundedQueue(name=buf_name,
                                     maxsize=diff,
                                     collection=[None] * diff))
                pre = curr
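# Illustrative example (a sketch, same flattening assumption): for a 3-point stencil on field "a"
# with accesses [[0, 1, 0], [0, 0, 0], [0, -1, 0]] and self.dimensions = [4, 8, 8], the sorted
# access list yields two consecutive differences of [0, 1, 0], i.e. two internal buffers of
# maxsize 8 each, while a field with a single access gets one buffer of maxsize 1.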
def __init__(self,
             name: str,
             kernel_string: str,
             dimensions: List[int],
             data_type: dace.dtypes.typeclass,
             boundary_conditions: Dict[str, Dict[str, str]],
             raw_inputs,
             vectorization: int = 1,
             plot_graph: bool = False,
             verbose: bool = False) -> None:
    """
    :param name: name of the kernel
    :param kernel_string: mathematical expression representing the stencil computation
    :param dimensions: global dimensions / problem size (i.e. size of the input array)
    :param data_type: data type of the result produced by this kernel
    :param boundary_conditions: dictionary of the boundary condition for each input channel/field
    :param plot_graph: flag indicating whether the underlying graph should be drawn
    :param verbose: flag for console output logging
    """
    # initialize the superclass
    super().__init__(name, BoundedQueue(name="dummy", maxsize=0), data_type)
    # store arguments
    self.kernel_string: str = kernel_string  # raw kernel string input
    self.raw_inputs = raw_inputs
    self.dimensions: List[int] = dimensions  # input array dimensions [dimX, dimY, dimZ]
    self.boundary_conditions: Dict[str, Dict[str, str]] = boundary_conditions  # boundary_conditions[field_name]
    self.verbose = verbose
    self.vectorization = vectorization
    # read static parameters from config
    self.config: Dict = stencilflow.parse_json("kernel.config")
    self.calculator: Calculator = Calculator()
    # set simulator initial parameters
    self.all_available = False
    self.not_available = set()
    # analyze input
    self.graph: ComputeGraph = ComputeGraph(vectorization=vectorization,
                                            dimensions=dimensions,
                                            raw_inputs=raw_inputs)
    # generate the AST computation graph from the mathematical expression
    self.graph.generate_graph(kernel_string)
    # calculate the latency in the computation tree to find the critical path
    self.graph.calculate_latency()
    # sort out input nodes (field accesses and constant values) and output nodes
    self.graph.determine_inputs_outputs()
    self.graph.setup_internal_buffers()
    # set plot path (if plot is set to True)
    if plot_graph:
        self.graph.plot_graph(name + ".png")
    # init simulation-specific params
    self.var_map: Dict[str, float] = dict()  # mapping between variable names and their (current) value: var_map[var_name] = var_value
    self.read_success: bool = False  # flag indicating whether the read from all input nodes has been successful (=> ready to execute)
    self.exec_success: bool = False  # flag indicating whether the execution has been successful
    self.result: float = float('nan')  # execution result of the current iteration (see program counter)
    self.outputs: Dict[str, BoundedQueue] = dict()
    # output delay queue: for simulation of the calculation latency, fill it up with bubbles
    self.out_delay_queue: BoundedQueue = BoundedQueue(
        name="delay_output",
        maxsize=self.graph.max_latency + 1,
        collection=[None] * self.graph.max_latency)
    # setup internal buffer queues
    self.internal_buffer: Dict[str, BoundedQueue] = dict()
    self.setup_internal_buffers()
    # set_up_dist_to_center takes care of a (falsely) executing kernel when there is no field
    # access at [0,0,0]: all accesses might be out of bounds, s.t. a result is produced although
    # there should not be a result yet (see paper example ref# TODO)
    self.dist_to_center: Dict = dict()
    self.set_up_dist_to_center()
    self.center_reached = False
    # add performance metric fields
    self.max_del_buf_usage = dict()
    # for mean
    self.buf_usage_sum = dict()
    self.buf_usage_num = dict()
    self.init_metric = False
    self.PC_exec_start = stencilflow.convert_3d_to_1d(
        dimensions=self.dimensions, index=self.dimensions)  # upper bound
    self.PC_exec_end = 0  # lower bound
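# Minimal usage sketch (illustrative only; the kernel string, boundary-condition dictionary and
# input description are made-up values, and dace is assumed to be importable):
#
#     kernel = Kernel(name="res",
#                     kernel_string="res = a[i,j,k] + a[i,j+1,k]",
#                     dimensions=[4, 8, 8],
#                     data_type=dace.dtypes.float64,
#                     boundary_conditions={"a": {"type": "constant", "value": 0.0}},
#                     raw_inputs=inputs)  # input description as expected by ComputeGraph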
def iter_comp_tree(self,
                   node: BaseOperationNodeClass,
                   index_relative_to_center=True,
                   replace_negative_index=False,
                   python_syntax=False,
                   flatten_index=True,
                   output_dimensions=None) -> str:
    """
    Iterates through the computation tree in order to generate the kernel string (according to some
    properties, e.g. relative to center or replace negative index).
    :param node: current node in the tree
    :param index_relative_to_center: indicates whether the zero index should be at the center of
    the stencil or at the furthest element
    :param replace_negative_index: replace the negative sign '-' by 'n' in order to create variable
    names that are not being split up by the python expression parser (Calculator)
    :return: computation string of the subgraph
    """
    # get predecessor list
    pred = list(self.graph.graph.pred[node])
    # differentiate cases for each node type
    if isinstance(node, BinOp):  # binary operation
        # extract expression elements
        if len(pred) == 1:  # lhs == rhs
            lhs, rhs = pred[0], pred[0]
        else:
            lhs = pred[0]  # left hand side
            rhs = pred[1]  # right hand side
        # recursively compute the child strings
        lhs_str = self.iter_comp_tree(lhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        rhs_str = self.iter_comp_tree(rhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        # return formatted string
        return "({} {} {})".format(lhs_str, node.generate_op_sym(), rhs_str)
    elif isinstance(node, Call):  # function call
        # extract expression element
        expr = pred[0]
        # recursively compute the child string
        expr_str = self.iter_comp_tree(expr, index_relative_to_center,
                                       replace_negative_index, python_syntax,
                                       flatten_index, output_dimensions)
        # return formatted string
        return "{}({})".format(node.name, expr_str)
    elif isinstance(node, Name) or isinstance(node, Num):
        # return formatted string
        return str(node.name)  # variable name
    elif isinstance(node, Subscript):
        # compute correct indexing according to the flag
        if index_relative_to_center:
            dim_index = node.index
        else:
            dim_index = stencilflow.list_subtract_cwise(
                node.index, self.graph.max_index[node.name])
        # break down index from 3D (i.e. [X,Y,Z]) to 1D
        if flatten_index:
            # TODO
            if node.name in self.input_paths and self.inputs[
                    node.name]["input_dims"] is not None:
                ind = [
                    x if x in self.inputs[node.name]["input_dims"] else None
                    for x in stencilflow.ITERATORS
                ]
                num_dim = stencilflow.num_dims(ind)
                # dim_index = dim_index[len(self.dimensions) - num_dim:]
                new_ind, i = list(), 0
                for entry in ind:
                    if entry is None:
                        new_ind.append(None)
                    else:
                        new_ind.append(dim_index[i])
                        i += 1
                dim_index = dim_index  # list(map(lambda x, y: y if x is not None else None, ind, new_ind))
            word_index = stencilflow.convert_3d_to_1d(
                dimensions=self.dimensions, index=dim_index)
            # replace negative sign if the flag is set
            if replace_negative_index and word_index < 0:
                return node.name + "[" + "n" + str(abs(word_index)) + "]"
            else:
                return node.name + "[" + str(word_index) + "]"
        else:
            try:
                dim_index = [
                    dim_index[stencilflow.ITERATORS.index(i)]
                    for i in self.inputs[node.name]["input_dims"]
                ]
            except (KeyError, TypeError):
                pass  # input_dims not defined or is None
            if len(dim_index) > output_dimensions:
                for i in range(3 - output_dimensions):
                    if dim_index[i] != 0:
                        raise ValueError("Removed used index dimension")
                dim_index = dim_index[3 - output_dimensions:]
            return node.name + str(dim_index)
    elif isinstance(node, Ternary):  # ternary operator: true_expr if comp else false_expr
        # extract expression elements
        compare = [x for x in pred if type(x) == Compare][0]  # comparison
        lhs = [x for x in pred if type(x) != Compare][0]  # left hand side
        rhs = [x for x in pred if type(x) != Compare][1]  # right hand side
        # recursively compute the child strings
        compare_str = self.iter_comp_tree(compare, index_relative_to_center,
                                          replace_negative_index,
                                          python_syntax, flatten_index,
                                          output_dimensions)
        lhs_str = self.iter_comp_tree(lhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        rhs_str = self.iter_comp_tree(rhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        # return formatted string
        if python_syntax:
            return "(({}) if ({}) else ({}))".format(lhs_str, compare_str,
                                                     rhs_str)
        else:  # C++ ternary operator syntax
            return "(({}) ? ({}) : ({}))".format(compare_str, lhs_str, rhs_str)
    elif isinstance(node, Compare):  # comparison
        # extract expression elements
        lhs = pred[0]
        rhs = pred[1]
        # recursively compute the child strings
        lhs_str = self.iter_comp_tree(lhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        rhs_str = self.iter_comp_tree(rhs, index_relative_to_center,
                                      replace_negative_index, python_syntax,
                                      flatten_index, output_dimensions)
        # return formatted string
        return "{} {} {}".format(lhs_str, str(node.name), rhs_str)
    elif isinstance(node, UnaryOp):  # unary operations, e.g. negation
        # extract expression element
        expr = pred[0]
        # recursively compute the child string
        expr_str = self.iter_comp_tree(
            node=expr,
            index_relative_to_center=index_relative_to_center,
            replace_negative_index=replace_negative_index,
            python_syntax=python_syntax,
            flatten_index=flatten_index,
            output_dimensions=output_dimensions)
        # return formatted string
        return "({}{})".format(node.generate_op_sym(), expr_str)
    else:
        raise NotImplementedError(
            "iter_comp_tree is not implemented for node type {}".format(
                type(node)))
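# Illustrative example (a sketch; the expression and index values are made up): for the expression
# a[i,j,k] + a[i,j+1,k] with self.dimensions = [4, 8, 8], iterating the tree with
# index_relative_to_center=True, flatten_index=True and replace_negative_index=False would produce
# the string "(a[0] + a[8])"; a ternary sub-expression would be emitted in C++ form when
# python_syntax=False, e.g. "((a[0] > 0) ? (a[0]) : (a[8]))".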
def compute_critical_path(self) -> int:
    """
    Computes the max-latency critical path through the graph in scalar format.
    """
    return stencilflow.convert_3d_to_1d(
        index=self.compute_critical_path_dim(), dimensions=self.dimensions)
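# Illustrative example (a sketch, same flattening assumption as above): a per-dimension critical
# path of [0, 1, 2] with self.dimensions = [4, 8, 8] yields a scalar critical path of 1*8 + 2 = 10.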
def compute_delay_buffer(self) -> None:
    """
    Computes the delay buffer sizes in the graph by propagating all paths from the input arrays to
    the successors in topological order. Delay buffer entries should be of the format:

        kernel.input_paths: {
            "in1": [[a, b, c, pred1], [d, e, f, pred2], ...],
            "in2": [ ... ],
            ...
        }

    where inX are input arrays to the stencil chain and predY are the kernel predecessors/inputs.
    """
    # get topological order for a top-down walk-through of the graph
    try:
        order = list(nx.topological_sort(self.graph))
    except nx.exception.NetworkXUnfeasible:
        cycle = next(nx.algorithms.cycles.simple_cycles(self.graph))
        raise ValueError("Cycle detected: {}".format([c.name for c in cycle]))
    # go through all nodes
    for node in order:
        # process delay buffer (no additional delay buffer will appear because of the topological order)
        for inp in node.input_paths:
            # compute maximum delay size per input
            max_delay = max(node.input_paths[inp])
            max_delay[2] += 1  # add an extra delay cycle for the processing in the kernel node
            # loop over all inputs and set their size relative to the max size to have data ready
            # at the exact same time
            for entry in node.input_paths[inp]:
                name = entry[-1]
                max_size = stencilflow.convert_3d_to_1d(
                    dimensions=self.dimensions,
                    index=stencilflow.list_subtract_cwise(
                        max_delay[:-1], entry[:-1]))
                node.delay_buffer[name] = BoundedQueue(name=name,
                                                       maxsize=max_size)
                node.delay_buffer[name].import_data(
                    [None] * node.delay_buffer[name].maxsize)
        # set input node delay buffers to 1
        if isinstance(node, Input):
            node.delay_buffer = BoundedQueue(name=node.name,
                                             maxsize=1,
                                             collection=[None])
        # propagate the path lengths (from the input arrays over all paths) to the successors
        for succ in self.graph.successors(node):
            if isinstance(node, Input):
                # add input node to all successors as a direct input (= 0 delay buffer)
                # add empty list dictionary entry for enabling list append()
                if node.name not in succ.input_paths:
                    succ.input_paths[node.name] = []
                successor = [0] * len(self.dimensions)
                successor = successor + [node.name]
                succ.input_paths[node.name].append(successor)
            elif isinstance(node, Kernel):
                # add kernel node to all successors, but calculate the path length first
                # (predecessor + delay + internal buffer + latency)
                internal_buffer = [0] * 3
                for item in node.graph.accesses:
                    internal_buffer = max(
                        node.graph.accesses[item]) if KernelChainGraph.greater(
                            max(node.graph.accesses[item]),
                            internal_buffer) else internal_buffer
                # latency
                latency = self.kernel_nodes[node.name].graph.max_latency
                # compute delay buffer and create entry
                for entry in node.input_paths:
                    # the first entry has to initialize the structure
                    if entry not in succ.input_paths:
                        succ.input_paths[entry] = []
                    # compute the actual delay buffer
                    delay_buffer = max(node.input_paths[entry][:])
                    # merge them together
                    total = [
                        i + d if i is not None else d
                        for i, d in zip(internal_buffer, delay_buffer)
                    ]
                    # add the latency too
                    total[-1] += latency
                    total.append(node.name)
                    # add entry to paths
                    succ.input_paths[entry].append(total)
            else:  # NodeType.OUTPUT: do nothing
                continue
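# Illustrative example (a sketch; names and numbers are made up): if a node reaches its successor
# over two paths from the input array "in1", e.g.
# input_paths["in1"] = [[0, 1, 0, "k1"], [0, 0, 0, "in1"]], the per-input maximum [0, 1, 0] is
# incremented by one cycle in the last dimension, and each entry's delay buffer is sized to the
# flattened difference to that maximum; with self.dimensions = [4, 8, 8] this gives a delay buffer
# of size 1 for the "k1" entry and 9 for the "in1" entry, so both paths deliver their data in the
# same cycle.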