def unroll_loop_iterations(self, loop, unroll_count):
    """ Append `unroll_count` renamed copies of the loop body to the loop.

    After the call the loop holds the original operations followed by
    the copies, i.e. unroll_count + 1 = unroll_factor iterations.

    loop         -- object providing `label`, `jump` and `operations`;
                    `jump` and `operations` are modified in place
    unroll_count -- number of additional copies of the body to create

    Operations whose opnum is GUARD_FUTURE_CONDITION or
    GUARD_NOT_INVALIDATED are not copied. Returns nothing.
    """
    renamer = Renamer()
    operations = loop.operations
    label = loop.label
    jump = loop.jump
    unrolled = []
    prohibit_opnums = (rop.GUARD_FUTURE_CONDITION,
                       rop.GUARD_NOT_INVALIDATED)
    # it is assumed that #label_args == #jump_args
    label_arg_count = len(jump.getarglist())
    for _ in range(unroll_count):
        # fill the map with the renaming boxes. keys are boxes from the
        # label, values are the (already renamed) boxes passed by the jump
        for i in range(label_arg_count):
            la = label.getarg(i)
            ja = renamer.rename_box(jump.getarg(i))
            if la != ja:
                renamer.start_renaming(la, ja)
        #
        for op in operations:
            if op.getopnum() in prohibit_opnums:
                continue  # do not unroll this operation twice
            copied_op = copy_resop(op)
            if not copied_op.returns_void():
                # every result assigns a new box, thus creates an entry
                # to the rename map.
                renamer.start_renaming(op, copied_op)
            #
            args = copied_op.getarglist()
            for a, arg in enumerate(args):
                copied_op.setarg(a, renamer.rename_box(arg))
            # not only the arguments, but also the fail args need
            # to be adjusted. rd_snapshot stores the live variables
            # that are needed to resume.
            if copied_op.is_guard():
                self.copy_guard_descr(renamer, copied_op)
            #
            unrolled.append(copied_op)

    # the jump arguments have been changed
    # if label(iX) ... jump(i(X+1)) is called, at the next unrolled loop
    # must look like this: label(i(X+1)) ... jump(i(X+2))
    args = jump.getarglist()
    for i, arg in enumerate(args):
        jump.setarg(i, renamer.rename_box(arg))
    #
    loop.operations = operations + unrolled
class SchedulerState(object):
    """ Book-keeping used while scheduling the operations of `graph.loop`.

    Operations are appended to `oplist` in their final order; when
    scheduling is finished `post_schedule` stores that list as the
    operations of the loop.
    """

    def __init__(self, cpu, graph):
        self.cpu = cpu
        self.renamer = Renamer()
        self.graph = graph
        # operations in the order they have been emitted
        self.oplist = []
        # nodes that are ready to be emitted (see mark_emitted for the order)
        self.worklist = []
        # the following two lists are not used by the methods of this class
        self.invariant_oplist = []
        self.invariant_vector_vars = []
        # dict used as a set: operations that have already been emitted
        self.seen = {}
        # delayed nodes that nobody depends on (resolved before the jump)
        self.delayed = []

    def resolve_delayed(self, needs_resolving, delayed, op):
        """ Emit those nodes of `delayed` that `op` needs (recursively).

        needs_resolving -- dict used as a set of boxes that still have to
                           be produced; it is filled and emptied here
        delayed         -- list of delayed nodes; emitted entries are
                           removed from it in place
        op              -- the operation whose arguments (and fail
                           arguments, if it is a guard) must be available
        """
        # recursive solving of all delayed objects
        if not delayed:
            return
        args = op.getarglist()
        if op.is_guard():
            args = args[:] + op.getfailargs()
        for arg in args:
            if arg is None or arg.is_constant() or arg.is_inputarg():
                continue
            if arg not in self.seen:
                box = self.renamer.rename_box(arg)
                needs_resolving[box] = None
        indexvars = self.graph.index_vars
        # walk the delayed list from its end to its start
        i = len(delayed) - 1
        while i >= 0:
            node = delayed[i]
            op = node.getoperation()
            if op in needs_resolving:
                # either it is a normal operation, or we know that there
                # is a linear combination
                del needs_resolving[op]
                if op in indexvars:
                    opindexvar = indexvars[op]
                    # there might be a variable already, that
                    # calculated the index variable, thus just reuse it
                    for var, indexvar in indexvars.items():
                        if indexvar == opindexvar and var in self.seen:
                            self.renamer.start_renaming(op, var)
                            break
                    else:
                        if opindexvar.calculated_by(op):
                            # just append this operation
                            self.seen[op] = None
                            self.append_to_oplist(op)
                        else:
                            # here is an easier way to calculate just
                            # this operation
                            last = op
                            for operation in opindexvar.get_operations():
                                self.append_to_oplist(operation)
                                last = operation
                            indexvars[last] = opindexvar
                            self.renamer.start_renaming(op, last)
                            self.seen[op] = None
                            self.seen[last] = None
                else:
                    # emit everything this operation depends on first
                    self.resolve_delayed(needs_resolving, delayed, op)
                    self.append_to_oplist(op)
                    self.seen[op] = None
                if len(delayed) > i:
                    del delayed[i]
            i -= 1
            # some times the recursive call can remove several items from
            # delayed, thus we correct the index here
            if len(delayed) <= i:
                i = len(delayed) - 1

    def append_to_oplist(self, op):
        """ Rename `op` with the current renamer and append it to oplist. """
        self.renamer.rename(op)
        self.oplist.append(op)

    def schedule(self):
        """ Run the scheduling: prepare, walk and emit, post_schedule. """
        self.prepare()
        Scheduler().walk_and_emit(self)
        self.post_schedule()

    def post_schedule(self):
        """ Resolve the remaining delayed nodes against the jump, rename
            the jump and store oplist as the operations of the loop. """
        loop = self.graph.loop
        jump = loop.jump
        if self.delayed:
            # some operations can be delayed until the jump instruction,
            # handle them here
            self.resolve_delayed({}, self.delayed, jump)
        self.renamer.rename(jump)
        loop.operations = self.oplist

    def profitable(self):
        """ Always True here. """
        return True

    def prepare(self):
        """ Put every node without dependencies at the front of the
            worklist. """
        for node in self.graph.nodes:
            if node.depends_count() == 0:
                self.worklist.insert(0, node)

    def try_emit_or_delay(self, node):
        """ Delay `node` if it is pure (and not imaginary), otherwise emit
            it right now. """
        if not node.is_imaginary() and node.is_pure():
            # this operation might never be emitted. only if it is
            # really needed
            self.delay_emit(node)
            return
        # emit a now!
        self.pre_emit(node, True)
        self.mark_emitted(node)
        if not node.is_imaginary():
            op = node.getoperation()
            self.seen[op] = None
            self.append_to_oplist(op)

    def delay_emit(self, node):
        """ it has been decided that the operation might be scheduled later

        The node (together with the nodes that were delayed for it) is
        handed to every node it provides to; if there is none, it goes to
        the final delay list. The node is marked as emitted afterwards.
        """
        delayed = node.delayed or []
        if node not in delayed:
            delayed.append(node)
        node.delayed = None
        provides = node.provides()
        if len(provides) == 0:
            for n in delayed:
                self.delayed.append(n)
        else:
            for to in provides:
                tnode = to.target_node()
                # every target gets its own copy of the list
                self.delegate_delay(tnode, delayed[:])
        self.mark_emitted(node)

    def delegate_delay(self, node, delayed):
        """ Chain up delays, this can reduce many more of the operations

        If `node` has no delay list yet, `delayed` itself becomes that
        list; otherwise the entries not yet present are appended.
        """
        if node.delayed is None:
            node.delayed = delayed
        else:
            delayedlist = node.delayed
            for d in delayed:
                if d not in delayedlist:
                    delayedlist.append(d)

    def mark_emitted(state, node, unpack=True):
        """ An operation has been emitted, adds new operations to the worklist
            whenever their dependency count drops to zero.
            Keeps worklist sorted (see priority)

        `state` is the scheduler state itself (the usual `self`). If
        `unpack` is true, ensure_args_unpacked is called for the operation
        of a non imaginary node.
        """
        worklist = state.worklist
        # COPY: the edges are removed while iterating
        provides = node.provides()[:]
        for dep in provides:
            target = dep.to
            node.remove_edge_to(target)
            if not target.emitted and target.depends_count() == 0:
                # sorts them by priority: scan from the end of the list
                # and insert behind the first entry that must stay in
                # front of the target
                i = len(worklist) - 1
                while i >= 0:
                    cur = worklist[i]
                    c = (cur.priority - target.priority)
                    if c < 0:  # meaning cur.priority < target.priority
                        worklist.insert(i + 1, target)
                        break
                    elif c == 0:
                        # if they have the same priority, sort them
                        # using the original position in the trace
                        if target.getindex() < cur.getindex():
                            worklist.insert(i + 1, target)
                            break
                    i -= 1
                else:
                    # no such entry found, the target goes to the front
                    worklist.insert(0, target)
        node.clear_dependencies()
        node.emitted = True
        if not node.is_imaginary():
            op = node.getoperation()
            state.renamer.rename(op)
            if unpack:
                state.ensure_args_unpacked(op)
        state.post_emit(node)

    def delay(self, node):
        """ Always False here. """
        return False

    def has_more(self):
        """ True as long as the worklist is not empty. """
        return len(self.worklist) > 0

    def ensure_args_unpacked(self, op):
        """ Hook called by mark_emitted; does nothing here. """
        pass

    def post_emit(self, node):
        """ Hook called at the end of mark_emitted; does nothing here. """
        pass

    def pre_emit(self, orignode, pack_first=True):
        """ Handle the nodes that have been delayed for `orignode`.

        If `pack_first` is true, the delayed nodes needed by the operation
        of `orignode` are resolved (emitted) first. Every delayed node
        that has not been emitted is passed on to the nodes it provides
        to, or put on the final delay list if there is none. `None`
        entries are skipped. The delay list of `orignode` is cleared.
        """
        delayed = orignode.delayed
        if delayed:
            # there are some nodes that have been delayed just for
            # this operation
            if pack_first:
                op = orignode.getoperation()
                self.resolve_delayed({}, delayed, op)
            for node in delayed:
                # the check must come before the node is used
                if node is None:
                    continue
                op = node.getoperation()
                if op in self.seen:
                    continue
                provides = node.provides()
                if len(provides) == 0:
                    # add this node to the final delay list
                    # might be emitted before jump!
                    self.delayed.append(node)
                else:
                    for to in provides:
                        tnode = to.target_node()
                        self.delegate_delay(tnode, [node])
        orignode.delayed = None
def unroll_loop_iterations(self, loop, unroll_count, align_unroll_once=False):
    """ Unroll the loop `unroll_count` times. There can be an additional
        unroll step if alignment might benefit

    loop               -- object providing `label`, `jump` and
                          `operations`; it is modified in place
    unroll_count       -- number of copies of the loop body to create
    align_unroll_once  -- if true, one more copy is created. A new LABEL
                          operation is built from the arguments and the
                          descr of the old label right after the first
                          copy and renamed with the renaming known at that
                          point. The original operations are stored in
                          `loop.align_operations` and `loop.operations`
                          only holds the copies.

    Without `align_unroll_once` the loop keeps its label and its
    operations are the original ones followed by the copies.
    Operations whose opnum is GUARD_FUTURE_CONDITION,
    GUARD_NOT_INVALIDATED or DEBUG_MERGE_POINT are not copied.
    Returns nothing.
    """
    renamer = Renamer()
    operations = loop.operations
    prohibit_opnums = (rop.GUARD_FUTURE_CONDITION,
                       rop.GUARD_NOT_INVALIDATED,
                       rop.DEBUG_MERGE_POINT)
    unrolled = []

    if align_unroll_once:
        unroll_count += 1

    label = loop.label
    jump = loop.jump
    # it is assumed that #label_args == #jump_args
    label_arg_count = len(jump.getarglist())
    new_label = label
    for u in range(unroll_count):
        # fill the map with the renaming boxes. keys are boxes from the
        # label, values are the (already renamed) boxes passed by the jump
        for i in range(label_arg_count):
            la = label.getarg(i)
            ja = renamer.rename_box(jump.getarg(i))
            if la != ja:
                renamer.start_renaming(la, ja)
        #
        for op in operations:
            if op.getopnum() in prohibit_opnums:
                continue  # do not unroll this operation twice
            copied_op = copy_resop(op)
            if not copied_op.returns_void():
                # every result assigns a new box, thus creates an entry
                # to the rename map.
                renamer.start_renaming(op, copied_op)
            #
            args = copied_op.getarglist()
            for a, arg in enumerate(args):
                copied_op.setarg(a, renamer.rename_box(arg))
            # not only the arguments, but also the fail args need
            # to be adjusted. rd_snapshot stores the live variables
            # that are needed to resume.
            if copied_op.is_guard():
                self.copy_guard_descr(renamer, copied_op)
            #
            unrolled.append(copied_op)
        #
        if align_unroll_once and u == 0:
            # the label of the remaining loop is created after the first copy
            descr = label.getdescr()
            args = label.getarglist()[:]
            new_label = ResOperation(rop.LABEL, args, descr)
            renamer.rename(new_label)
        #
    # the jump arguments have been changed
    # if label(iX) ... jump(i(X+1)) is called, at the next unrolled loop
    # must look like this: label(i(X+1)) ... jump(i(X+2))
    args = jump.getarglist()
    for i, arg in enumerate(args):
        jump.setarg(i, renamer.rename_box(arg))
    #
    loop.label = new_label
    if align_unroll_once:
        loop.align_operations = operations
        loop.operations = unrolled
    else:
        loop.operations = operations + unrolled
class SchedulerState(object):
    """ Book-keeping used while scheduling the operations of `graph.loop`.

    Operations are appended to `oplist` in their final order; when
    scheduling is finished `post_schedule` stores that list as the
    operations of the loop.
    """

    def __init__(self, cpu, graph):
        self.cpu = cpu
        self.renamer = Renamer()
        self.graph = graph
        # operations in the order they have been emitted
        self.oplist = []
        # nodes that are ready to be emitted (see mark_emitted for the order)
        self.worklist = []
        # the following two lists are not used by the methods of this class
        self.invariant_oplist = []
        self.invariant_vector_vars = []
        # dict used as a set: operations that have already been emitted
        self.seen = {}
        # delayed nodes that nobody depends on (resolved before the jump)
        self.delayed = []

    def resolve_delayed(self, needs_resolving, delayed, op):
        """ Emit those nodes of `delayed` that `op` needs (recursively).

        needs_resolving -- dict used as a set of boxes that still have to
                           be produced; it is filled and emptied here
        delayed         -- list of delayed nodes; emitted entries are
                           removed from it in place
        op              -- the operation whose arguments (and fail
                           arguments, if it is a guard) must be available
        """
        # recursive solving of all delayed objects
        if not delayed:
            return
        args = op.getarglist()
        if op.is_guard():
            args = args[:] + op.getfailargs()
        for arg in args:
            if arg is None or arg.is_constant() or arg.is_inputarg():
                continue
            if arg not in self.seen:
                box = self.renamer.rename_box(arg)
                needs_resolving[box] = None
        indexvars = self.graph.index_vars
        # walk the delayed list from its end to its start
        i = len(delayed) - 1
        while i >= 0:
            node = delayed[i]
            op = node.getoperation()
            if op in needs_resolving:
                # either it is a normal operation, or we know that there
                # is a linear combination
                del needs_resolving[op]
                if op in indexvars:
                    opindexvar = indexvars[op]
                    # there might be a variable already, that
                    # calculated the index variable, thus just reuse it
                    for var, indexvar in indexvars.items():
                        if indexvar == opindexvar and var in self.seen:
                            self.renamer.start_renaming(op, var)
                            break
                    else:
                        if opindexvar.calculated_by(op):
                            # just append this operation
                            self.seen[op] = None
                            self.append_to_oplist(op)
                        else:
                            # here is an easier way to calculate just
                            # this operation
                            last = op
                            for operation in opindexvar.get_operations():
                                self.append_to_oplist(operation)
                                last = operation
                            indexvars[last] = opindexvar
                            self.renamer.start_renaming(op, last)
                            self.seen[op] = None
                            self.seen[last] = None
                else:
                    # emit everything this operation depends on first
                    self.resolve_delayed(needs_resolving, delayed, op)
                    self.append_to_oplist(op)
                    self.seen[op] = None
                if len(delayed) > i:
                    del delayed[i]
            i -= 1
            # some times the recursive call can remove several items from
            # delayed, thus we correct the index here
            if len(delayed) <= i:
                i = len(delayed) - 1

    def append_to_oplist(self, op):
        """ Rename `op` with the current renamer and append it to oplist. """
        self.renamer.rename(op)
        self.oplist.append(op)

    def schedule(self):
        """ Run the scheduling: prepare, walk and emit, post_schedule. """
        self.prepare()
        Scheduler().walk_and_emit(self)
        self.post_schedule()

    def post_schedule(self):
        """ Resolve the remaining delayed nodes against the jump, rename
            the jump and store oplist as the operations of the loop. """
        loop = self.graph.loop
        jump = loop.jump
        if self.delayed:
            # some operations can be delayed until the jump instruction,
            # handle them here
            self.resolve_delayed({}, self.delayed, jump)
        self.renamer.rename(jump)
        loop.operations = self.oplist

    def profitable(self):
        """ Always True here. """
        return True

    def prepare(self):
        """ Put every node without dependencies at the front of the
            worklist. """
        for node in self.graph.nodes:
            if node.depends_count() == 0:
                self.worklist.insert(0, node)

    def try_emit_or_delay(self, node):
        """ Delay `node` if it is pure (and not imaginary), otherwise emit
            it right now. """
        if not node.is_imaginary() and node.is_pure():
            # this operation might never be emitted. only if it is
            # really needed
            self.delay_emit(node)
            return
        # emit a now!
        self.pre_emit(node, True)
        self.mark_emitted(node)
        if not node.is_imaginary():
            op = node.getoperation()
            self.seen[op] = None
            self.append_to_oplist(op)

    def delay_emit(self, node):
        """ it has been decided that the operation might be scheduled later

        The node (together with the nodes that were delayed for it) is
        handed to every node it provides to; if there is none, it goes to
        the final delay list. The node is marked as emitted afterwards.
        """
        delayed = node.delayed or []
        if node not in delayed:
            delayed.append(node)
        node.delayed = None
        provides = node.provides()
        if len(provides) == 0:
            for n in delayed:
                self.delayed.append(n)
        else:
            for to in provides:
                tnode = to.target_node()
                # every target gets its own copy of the list
                self.delegate_delay(tnode, delayed[:])
        self.mark_emitted(node)

    def delegate_delay(self, node, delayed):
        """ Chain up delays, this can reduce many more of the operations

        If `node` has no delay list yet, `delayed` itself becomes that
        list; otherwise the entries not yet present are appended.
        """
        if node.delayed is None:
            node.delayed = delayed
        else:
            delayedlist = node.delayed
            for d in delayed:
                if d not in delayedlist:
                    delayedlist.append(d)

    def mark_emitted(state, node, unpack=True):
        """ An operation has been emitted, adds new operations to the worklist
            whenever their dependency count drops to zero.
            Keeps worklist sorted (see priority)

        `state` is the scheduler state itself (the usual `self`). If
        `unpack` is true, ensure_args_unpacked is called for the operation
        of a non imaginary node.
        """
        worklist = state.worklist
        # COPY: the edges are removed while iterating
        provides = node.provides()[:]
        for dep in provides:
            target = dep.to
            node.remove_edge_to(target)
            if not target.emitted and target.depends_count() == 0:
                # sorts them by priority: scan from the end of the list
                # and insert behind the first entry that must stay in
                # front of the target
                i = len(worklist) - 1
                while i >= 0:
                    cur = worklist[i]
                    c = (cur.priority - target.priority)
                    if c < 0:  # meaning cur.priority < target.priority
                        worklist.insert(i + 1, target)
                        break
                    elif c == 0:
                        # if they have the same priority, sort them
                        # using the original position in the trace
                        if target.getindex() < cur.getindex():
                            worklist.insert(i + 1, target)
                            break
                    i -= 1
                else:
                    # no such entry found, the target goes to the front
                    worklist.insert(0, target)
        node.clear_dependencies()
        node.emitted = True
        if not node.is_imaginary():
            op = node.getoperation()
            state.renamer.rename(op)
            if unpack:
                state.ensure_args_unpacked(op)
        state.post_emit(node)

    def delay(self, node):
        """ Always False here. """
        return False

    def has_more(self):
        """ True as long as the worklist is not empty. """
        return len(self.worklist) > 0

    def ensure_args_unpacked(self, op):
        """ Hook called by mark_emitted; does nothing here. """
        pass

    def post_emit(self, node):
        """ Hook called at the end of mark_emitted; does nothing here. """
        pass

    def pre_emit(self, orignode, pack_first=True):
        """ Handle the nodes that have been delayed for `orignode`.

        If `pack_first` is true, the delayed nodes needed by the operation
        of `orignode` are resolved (emitted) first. Every delayed node
        that has not been emitted is passed on to the nodes it provides
        to, or put on the final delay list if there is none. `None`
        entries are skipped. The delay list of `orignode` is cleared.
        """
        delayed = orignode.delayed
        if delayed:
            # there are some nodes that have been delayed just for
            # this operation
            if pack_first:
                op = orignode.getoperation()
                self.resolve_delayed({}, delayed, op)
            for node in delayed:
                # the check must come before the node is used
                if node is None:
                    continue
                op = node.getoperation()
                if op in self.seen:
                    continue
                provides = node.provides()
                if len(provides) == 0:
                    # add this node to the final delay list
                    # might be emitted before jump!
                    self.delayed.append(node)
                else:
                    for to in provides:
                        tnode = to.target_node()
                        self.delegate_delay(tnode, [node])
        orignode.delayed = None