def vectoroptimizer_unrolled(self, loop, unroll_factor=-1):
    """Run the early vectorization stages on *loop* and unroll it.

    Returns ``(opt, graph)``: the vector optimizer and the DependencyGraph
    built from the unrolled loop.  Raises NotAVectorizeableLoop when no
    vectorizable type was found and no explicit unroll_factor was given.
    """
    opt = self.vectoroptimizer(loop)
    opt.linear_find_smallest_type(loop)
    loop.setup_vectorization()
    if unroll_factor == -1 and opt.smallest_type_bytes == 0:
        raise NotAVectorizeableLoop()
    if unroll_factor == -1:
        # caller did not request a factor: derive it via
        # get_unroll_count(ARCH_VEC_REG_SIZE)
        unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE)
        print ""
        print "unroll factor: ", unroll_factor, opt.smallest_type_bytes
    self.show_dot_graph(DependencyGraph(loop), "original_" + self.test_name)
    # try to move index-guard computations to the loop header;
    # analyse_index_calculations returns None when nothing could be moved
    graph = opt.analyse_index_calculations(loop)
    if graph is not None:
        # the transformed graph must stay acyclic; dump it when it is not
        cycle = graph.cycles()
        if cycle is not None:
            print "CYCLE found %s" % cycle
            self.show_dot_graph(graph, "early_exit_" + self.test_name)
        assert cycle is None
        state = SchedulerState(graph)
        opt.schedule(state)
    opt.unroll_loop_iterations(loop, unroll_factor)
    self.debug_print_operations(loop)
    # rebuild the dependency graph from the now-unrolled trace
    graph = DependencyGraph(loop)
    self.last_graph = graph # legacy for test_dependency
    self.show_dot_graph(graph, self.test_name)
    def gmr(i):
        # test convenience accessor: memory ref attached to node i
        return graph.memory_refs[graph.nodes[i]]
    graph.getmemref = gmr
    return opt, graph
def vectoroptimizer_unrolled(self, loop, unroll_factor = -1): opt = self.vectoroptimizer(loop) opt.linear_find_smallest_type(loop) loop.setup_vectorization() if unroll_factor == -1 and opt.smallest_type_bytes == 0: raise NotAVectorizeableLoop() if unroll_factor == -1: unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE) print "" print "unroll factor: ", unroll_factor, opt.smallest_type_bytes self.show_dot_graph(DependencyGraph(loop), "original_" + self.test_name) graph = opt.analyse_index_calculations(loop) if graph is not None: cycle = graph.cycles() if cycle is not None: print "CYCLE found %s" % cycle self.show_dot_graph(graph, "early_exit_" + self.test_name) assert cycle is None state = SchedulerState(graph) opt.schedule(state) opt.unroll_loop_iterations(loop, unroll_factor) self.debug_print_operations(loop) graph = DependencyGraph(loop) self.last_graph = graph # legacy for test_dependency self.show_dot_graph(graph, self.test_name) def gmr(i): return graph.memory_refs[graph.nodes[i]] graph.getmemref = gmr return opt, graph
def build_dependency(self, ops):
    """Parse *ops*, build its DependencyGraph, sanity-check every node
    and return the graph (with the parse string attached)."""
    parsed_loop = self.parse_loop(ops)
    dep_graph = DependencyGraph(parsed_loop)
    self.show_dot_graph(dep_graph, self.test_name)
    # a node must always be independent of itself
    assert all(n.independent(n) for n in dep_graph.nodes)
    dep_graph.parsestr = ops
    return dep_graph
def run_optimization(self, metainterp_sd, info, loop, jitcell_token, user_code):
    """Vectorize *loop* in place and return its final operation list.

    Raises NotAVectorizeableLoop when the trace cannot be vectorized and
    NotAProfitableLoop when the cost model rejects the packed trace.
    """
    self.orig_label_args = loop.label.getarglist_copy()
    self.linear_find_smallest_type(loop)
    byte_count = self.smallest_type_bytes
    vsize = self.vector_ext.vec_size()
    # stop, there is no chance to vectorize this trace
    # we cannot optimize normal traces (if there is no label)
    if vsize == 0:
        debug_print("vector size is zero")
        raise NotAVectorizeableLoop
    if byte_count == 0:
        debug_print("could not find smallest type")
        raise NotAVectorizeableLoop
    if loop.label.getopnum() != rop.LABEL:
        debug_print("not a loop, can only vectorize loops")
        raise NotAVectorizeableLoop
    # find index guards and move to the earliest position
    graph = self.analyse_index_calculations(loop)
    if graph is not None:
        state = SchedulerState(metainterp_sd.cpu, graph)
        self.schedule(state) # reorder the trace
    # unroll
    self.unroll_count = self.get_unroll_count(vsize)
    # request aligned unrolling only when a single unroll is done and the
    # vector extension asks for it (should_align_unroll)
    align_unroll = self.unroll_count==1 and \
                   self.vector_ext.should_align_unroll
    self.unroll_loop_iterations(loop, self.unroll_count,
                                align_unroll_once=align_unroll)
    # vectorize: rebuild the dependency graph from the unrolled trace,
    # then discover, extend and merge packs of operations
    graph = DependencyGraph(loop)
    self.find_adjacent_memory_refs(graph)
    self.extend_packset()
    self.combine_packset()
    costmodel = GenericCostModel(self.cpu, self.cost_threshold)
    state = VecScheduleState(graph, self.packset, self.cpu, costmodel)
    self.schedule(state)
    if not state.profitable():
        raise NotAProfitableLoop
    gso = GuardStrengthenOpt(graph.index_vars)
    gso.propagate_all_forward(info, loop, user_code)
    # re-schedule the trace -> removes many pure operations
    graph = DependencyGraph(loop)
    state = SchedulerState(self.cpu, graph)
    state.schedule()
    # expose the alignment prefix to the caller and clear forwarding
    info.extra_before_label = loop.align_operations
    for op in loop.align_operations:
        op.set_forwarded(None)
    return loop.finaloplist(jitcell_token=jitcell_token, reset_label_token=False)
def run_optimization(self, info, loop):
    """Vectorize *loop* in place and return the graph's index variables.

    Raises NotAVectorizeableLoop when the trace cannot be vectorized and
    NotAProfitableLoop when the cost model rejects the result.
    """
    self.orig_label_args = loop.label.getarglist_copy()
    self.linear_find_smallest_type(loop)
    smallest_bytes = self.smallest_type_bytes
    register_size = self.cpu.vector_register_size
    # stop, there is no chance to vectorize this trace
    # we cannot optimize normal traces (if there is no label)
    hopeless = (register_size == 0 or smallest_bytes == 0
                or loop.label.getopnum() != rop.LABEL)
    if hopeless:
        raise NotAVectorizeableLoop()
    # find index guards and move to the earliest position
    moved = self.analyse_index_calculations(loop)
    if moved is not None:
        self.schedule(SchedulerState(moved)) # reorder the trace
    # unroll
    self.unroll_count = self.get_unroll_count(register_size)
    self.unroll_loop_iterations(loop, self.unroll_count)
    # vectorize
    dep_graph = DependencyGraph(loop)
    self.find_adjacent_memory_refs(dep_graph)
    self.extend_packset()
    self.combine_packset()
    # TODO move cost model to CPU
    cost = X86_CostModel(self.cpu, self.cost_threshold)
    vec_state = VecScheduleState(dep_graph, self.packset, self.cpu, cost)
    self.schedule(vec_state)
    if not vec_state.profitable():
        raise NotAProfitableLoop()
    return dep_graph.index_vars
def vectorize(self, loop, unroll_factor=-1):
    """Run the full vectorization pipeline on *loop* and return the
    optimizer.  Raises NotAProfitableLoop when the cost model says no."""
    loop_info = FakeLoopInfo(loop)
    loop_info.snapshot(loop)
    optimizer, dep_graph = self.vectoroptimizer_unrolled(loop, unroll_factor)
    # pack discovery: seed with adjacent memory refs, then grow and merge
    optimizer.find_adjacent_memory_refs(dep_graph)
    optimizer.extend_packset()
    optimizer.combine_packset()
    cost = GenericCostModel(self.cpu, 0)
    vec_state = VecScheduleState(dep_graph, optimizer.packset, self.cpu, cost)
    optimizer.schedule(vec_state)
    if not cost.profitable():
        raise NotAProfitableLoop()
    strengthen = GuardStrengthenOpt(dep_graph.index_vars)
    strengthen.propagate_all_forward(loop_info, loop)
    #
    # re-schedule
    dep_graph = DependencyGraph(loop)
    sched_state = SchedulerState(self.cpu, dep_graph)
    sched_state.prepare()
    Scheduler().walk_and_emit(sched_state)
    sched_state.post_schedule()
    #
    # stitch the prefix (plus an optional prefix label) in front of the
    # scheduled operations
    emitted = loop.operations
    rebuilt = loop.prefix[:]
    if loop.prefix_label:
        rebuilt.append(loop.prefix_label)
    rebuilt.extend(emitted)
    loop.operations = rebuilt
    return optimizer
def optguards(self, loop, user_code=False):
    """Attach a CompileLoopVersionDescr to every guard in *loop*, run
    guard strengthening and return the GuardStrengthenOpt instance."""
    loop_info = FakeLoopInfo(loop)
    loop_info.snapshot(loop)
    guards = [op for op in loop.operations if op.is_guard()]
    for guard in guards:
        guard.setdescr(compile.CompileLoopVersionDescr())
    dep_graph = DependencyGraph(loop)
    strengthen = GuardStrengthenOpt(dep_graph.index_vars)
    strengthen.propagate_all_forward(loop_info, loop, user_code)
    return strengthen
def test_delayed_schedule(self):
    """After scheduling, the jump stays out of the operation list and
    only a single operation remains."""
    trace = self.parse("""
    [i0]
    i1 = int_add(i0,1)
    i2 = int_add(i0,1)
    jump(i2)
    """)
    trace.label = ResOperation(rop.LABEL, trace.inputargs)
    trace.prefix_label = None
    # split the parsed body: keep the jump aside, schedule the rest
    body = trace.operations
    trace.jump = body[-1]
    trace.operations = body[:-1]
    scheduler_state = SchedulerState(self.cpu, DependencyGraph(trace))
    scheduler_state.schedule()
    assert len(trace.operations) == 1
def schedule(self, loop, unroll_factor=-1, with_guard_opt=False):
    """Unroll *loop*, build packs, schedule them (optionally running
    guard strengthening) and return the optimizer."""
    loop_info = FakeLoopInfo(loop)
    loop_info.snapshot(loop)
    optimizer, dep_graph = self.vectoroptimizer_unrolled(loop, unroll_factor)
    optimizer.find_adjacent_memory_refs(dep_graph)
    optimizer.extend_packset()
    optimizer.combine_packset()
    cost = FakeCostModel(self.cpu)
    vec_state = VecScheduleState(dep_graph, optimizer.packset, self.cpu, cost)
    optimizer.schedule(vec_state)
    if with_guard_opt:
        strengthen = GuardStrengthenOpt(dep_graph.index_vars)
        strengthen.propagate_all_forward(loop_info, loop)
    # re-schedule
    dep_graph = DependencyGraph(loop)
    sched_state = SchedulerState(self.cpu, dep_graph)
    sched_state.prepare()
    Scheduler().walk_and_emit(sched_state)
    sched_state.post_schedule()
    return optimizer
def savings(self, loop):
    """Run pack discovery and scheduling on *loop* with a fake cost model
    and return the savings the cost model accumulated."""
    jitdriver_sd = FakeJitDriverStaticData()
    opt = VectorizingOptimizer(self.metainterp_sd, jitdriver_sd, 0)
    opt.orig_label_args = loop.label.getarglist()[:]
    graph = opt.dependency_graph = DependencyGraph(loop)
    self.show_dot_graph(graph, 'costmodel')
    # replace the real memory refs with fakes that keep only the array
    # and the index variable
    for k, m in graph.memory_refs.items():
        graph.memory_refs[k] = FakeMemoryRef(m.array, m.index_var)
    opt.find_adjacent_memory_refs(graph)
    opt.extend_packset()
    opt.combine_packset()
    # debug dump of every pack that was built
    for pack in opt.packset.packs:
        print "pack: \n ",
        print '\n '.join([str(op.getoperation()) for op in pack.operations])
        print
    costmodel = FakeCostModel(GenericCostModel(self.cpu, 0))
    costmodel.reset_savings()
    state = VecScheduleState(graph, opt.packset, self.cpu, costmodel)
    opt.schedule(state)
    return costmodel.getsavings()
def analyse_index_calculations(self, loop):
    """ Tries to move guarding instructions and all the instructions that
        need to be computed for the guard to the loop header. This ensures
        that guards fail 'early' and relax dependencies. Without this step
        vectorization would not be possible!

        Returns the modified DependencyGraph when at least one guard could
        be moved, None otherwise.

        Fix: dropped the dead locals 'last_prev_node' (assigned, never
        read) and 'valid_guards' (appended to, never read).
    """
    graph = DependencyGraph(loop)
    # nodes that depend on nothing; whatever is left in here at the end
    # gets an edge from the imaginary early-exit node
    zero_deps = {}
    for node in graph.nodes:
        if node.depends_count() == 0:
            zero_deps[node] = 0
    earlyexit = graph.imaginary_node("early exit")
    guards = graph.guards
    one_valid = False
    for guard_node in guards:
        modify_later = []
        valid = True
        if guard_node in zero_deps:
            del zero_deps[guard_node]
        for prev_dep in guard_node.depends():
            prev_node = prev_dep.to
            if prev_dep.is_failarg():
                # remove this edge later.
                # 1) only because of failing, this dependency exists
                # 2) non pure operation points to this guard.
                #    but if this guard only depends on pure operations, it can be checked
                #    at an earlier position, the non pure op can execute later!
                modify_later.append(prev_node)
            else:
                # the guard may only be hoisted if everything it
                # (transitively) depends on is always pure
                for path in prev_node.iterate_paths(None, backwards=True, blacklist=True):
                    if not path.is_always_pure():
                        valid = False
                    else:
                        if path.last() in zero_deps:
                            del zero_deps[path.last()]
                    if not valid:
                        break
        if valid:
            # transformation is valid, modify the graph and execute
            # this guard earlier
            one_valid = True
            for node in modify_later:
                node.remove_edge_to(guard_node)
            # every edge that starts in the guard, the early exit
            # inherits the edge and guard then provides to early exit
            for dep in guard_node.provides()[:]:
                assert not dep.target_node().is_imaginary()
                earlyexit.edge_to(dep.target_node(), failarg=True)
                guard_node.remove_edge_to(dep.target_node())
            guard_node.edge_to(earlyexit)
            self.mark_guard(guard_node, loop)
    # remaining dependency-free nodes hang off the early exit
    for node in zero_deps.keys():
        assert not node.is_imaginary()
        earlyexit.edge_to(node)
    if one_valid:
        return graph
    return None