def test_add_tree_1(): root1 = stack_parser.exec_time_tree(raw_stack) root2 = stack_parser.exec_time_tree(raw_stack) sum_tree = add_tree(root1, root2) graph1 = draw_tree(root1) graph2 = draw_tree(root2) graph = draw_tree(sum_tree) assert graph1 == graph2 assert graph1 == """ digraph { 0 [ shape=record label = "time_in_roi|0.40"]; 1 [ shape=record label = "add|0.40"]; 2 [ shape=record label = "@CPU_LOG@|0.07"]; 3 [ shape=record label = "empty|0.01"]; 4 [ shape=record label = "add_stub|0.01"]; 5 [ shape=record label = "@HB_LOG@|0.23"]; 6 [ shape=record label = "empty|0.01"]; 7 [ shape=record label = "add_stub|0.19"]; 8 [ shape=record label = "@OFFLOAD_KERNEL@__tensorlib_add|0.14"]; 10 [ shape=record label = "llcopy|0.03"]; 11 [ shape=record label = "other|0.00"]; 0 -> 1; 0 -> 11; 1 -> 2; 1 -> 5; 1 -> 10; 2 -> 3; 2 -> 4; 5 -> 6; 5 -> 7; 7 -> 8; } """ assert graph == """
def compare_impl(full_raw_stack, full_actuals, chunk_raw_stack, chunk_actuals, fancy_func=False, external_trim=None):
    """Combine a full-input run and a chunk-input run into one ``ATen_OP``.

    Args:
        full_raw_stack: raw stack of the full-input run; supplies the op
            name and the CPU log.
        full_actuals: actuals of the full-input run, forwarded to
            ``actual_parser.parse``.
        chunk_raw_stack: raw stack of the chunk-input run; supplies the HB
            log.
        chunk_actuals: actuals of the chunk-input run, forwarded to
            ``actual_parser.parse``.
        fancy_func: forwarded unchanged to the stack parsers.
        external_trim: when not ``None``, used as the total time on HB
            instead of the sum of the ``@TRIM@`` nodes found in the HB log.

    Returns:
        ``ATen_OP(op_name, cpu_log, hb_log, total_time_on_HB, actuals)``.
    """
    # Both stacks must have the same shape before they can be combined.
    cross_check(full_raw_stack, chunk_raw_stack)

    # The full tree is built only to read the aten op name.  The root is
    # expected to have 2 children -- the aten op and "other".
    full_tree = stack_parser.exec_time_tree(full_raw_stack, fancy_func=fancy_func)
    op_name = full_tree.children[0].func

    # CPU_log comes from the full input data, HB_log from the chunk input data.
    cpu_log = process_CPU_stack.parse(full_raw_stack, fancy_func=fancy_func)
    hb_log = process_HB_stack.parse(chunk_raw_stack, fancy_func=fancy_func, trimming=True)

    total_time_on_HB = 0
    if external_trim is not None:
        # External trimming is enabled: re-apply a trim of 0.  There can be
        # more than one @TRIM@ node, so every one of them is reset instead
        # of adjusting a single node.
        def zero_trim(node):
            if node.func == "@TRIM@":
                node.time = 0.0

        hb_log = traversal(hb_log, zero_trim)
        stack_parser.exec_time_apply_trim(hb_log)
        total_time_on_HB = external_trim
    else:
        # Total time on device: accumulate the simulated time recorded on
        # every @TRIM@ node.
        def add_trim(node):
            nonlocal total_time_on_HB
            if node.func == "@TRIM@":
                total_time_on_HB += node.time

        traversal(hb_log, add_trim)

    # debug
    for log_tree in (cpu_log, hb_log):
        print(stack_parser.exec_time_print_tree(log_tree))
    hb_summary = "total time on HB = " + str(total_time_on_HB)
    print(hb_summary)

    # process input tensors
    actuals = actual_parser.parse(full_actuals, chunk_actuals)

    return ATen_OP(op_name, cpu_log, hb_log, total_time_on_HB, actuals)
def cross_check(full_raw_stack, chunk_raw_stack):
    """Assert that the full and chunk stacks have the same shape.

    Both raw stacks are turned into execution-time trees, the ``func`` of
    every node is collected in traversal order, and the two resulting
    sequences must match element by element.

    Args:
        full_raw_stack: raw stack of the full-input run.
        chunk_raw_stack: raw stack of the chunk-input run.

    Raises:
        AssertionError: if the trees differ in node count or in the
            sequence of function names.
    """
    def collect_funcs(tree):
        # Gather node.func for every node visited by traversal().
        names = []
        traversal(tree, lambda node: names.append(node.func))
        return names

    full_tree = stack_parser.exec_time_tree(full_raw_stack)
    chunk_tree = stack_parser.exec_time_tree(chunk_raw_stack)

    full_funcs = collect_funcs(full_tree)
    chunk_funcs = collect_funcs(chunk_tree)

    assert len(full_funcs) == len(chunk_funcs), (
        "full stack has %d nodes but chunk stack has %d"
        % (len(full_funcs), len(chunk_funcs))
    )
    # Compare the sequences directly: joining the names with a separator
    # first could let two different sequences compare equal whenever a name
    # happens to contain the separator.
    assert full_funcs == chunk_funcs, (
        "full and chunk stacks differ in the functions they contain"
    )
def parse(raw_stack, fancy_func=False):
    """Return the ``@CPU_LOG@`` subtree of a per-ATen-op execution-time tree.

    Args:
        raw_stack: raw stack handed to ``stack_parser.exec_time_tree``.
        fancy_func: forwarded unchanged to ``stack_parser.exec_time_tree``.

    Returns:
        The first child of the aten op node whose ``func`` is ``"@CPU_LOG@"``.

    Raises:
        AssertionError: if the root does not have exactly two children, if
            the second child is not ``"other"``, or if the aten op has no
            ``@CPU_LOG@`` child.
    """
    tree = stack_parser.exec_time_tree(raw_stack, fancy_func=fancy_func)

    # In the per ATen OP context, the tree looks like
    #
    #          root
    #         /    \
    #   aten::op   other
    #
    assert len(tree.children) == 2
    assert tree.children[1].func == "other"
    aten_op = tree.children[0]

    # There has to be a CPU_log to use this parser; the aten_op tree
    # should look like
    #
    #          aten::op
    #         /    |    \
    #   CPU_log  ******  HB_log
    #
    cpu_log = next(
        (node for node in aten_op.children if node.func == "@CPU_LOG@"),
        None,
    )
    assert cpu_log is not None

    return cpu_log
def test_stack_parsing_1():
    """Check the printed form of the tree parsed from ``raw_stack``."""
    # NOTE(review): the comparison is exact, so the whitespace following each
    # "\n" in the expected text matters -- confirm it against the real output
    # of exec_time_print_tree before touching this literal.
    expected = """|- Node(time_in_roi : 0.399)\n |- Node(at::Tensor at::CPUType::{anonymous}::add(const at::Tensor&, const at::Tensor&, c10::Scalar) : 0.399)\n |- Node(@CPU_LOG@ : 0.067)\n |- Node(at::Tensor at::CPUType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.015)\n |- Node(at::native::add_stub::add_stub() : 0.009)\n |- Node(@HB_LOG@ : 0.234)\n |- Node(at::Tensor at::HammerBladeType::{anonymous}::empty(c10::IntArrayRef, const c10::TensorOptions&, c10::optional<c10::MemoryFormat>) : 0.01)\n |- Node(at::native::add_stub::add_stub() : 0.187)\n |- Node(@OFFLOAD_KERNEL@__tensorlib_add : 0.145)\n |- Node(@TRIM@ : 0.0)\n |- Node(at::Tensor at::CPUType::{anonymous}::llcopy(const at::Tensor&) : 0.025)\n |- Node(other : 0.0)"""

    tree = stack_parser.exec_time_tree(raw_stack)
    rendered = stack_parser.exec_time_print_tree(tree)

    assert rendered == expected