def test_filter_squash_bunny_to_goat_with_merge():
    r"""Test squash on a "bunny" shaped graph.

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g
         / \ / \
        f   a   h    remove ac      e   g
           / \      ---------->    / \ / \
          b   c                   f   b   h
           \ /
            b

    """
    b = Node(Frame(name="b"))
    diamond = Node.from_lists(("a", ("b", b), ("c", b)))

    new_b = Node(Frame(name="b"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, "f"), ("g", new_b, "h")),
        [4, 2, 1, 4, 1],  # e, b, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        5,  # e, b, f, g, h
    )
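# NOTE: check_filter_squash and check_filter_no_squash are helpers defined
# elsewhere in the test module. The sketch below is a hypothetical
# reconstruction of what they assert, based only on how the squash tests in
# this file call them; it is not hatchet's actual helper code. It assumes
# filter() keeps the original graph, that squash() rewires edges, and that
# the squashed dataframe carries a recomputed "time (inc)" column in
# traversal order.
def check_filter_squash_sketch(gf, filter_func, expected_graph, expected_inc_time):
    """Filter and squash gf, then check the resulting graph and metrics."""
    filtered = gf.filter(filter_func)  # filtering alone keeps the old graph
    squashed = filtered.squash()       # squashing rewires transitive edges

    # the squashed graph should match the expected graph exactly
    assert squashed.graph == expected_graph

    # inclusive time should be recomputed over the squashed edges
    assert list(squashed.dataframe["time (inc)"]) == expected_inc_time


def check_filter_no_squash_sketch(gf, filter_func, expected_rows):
    """Filter gf without squashing and check only the row count."""
    filtered = gf.filter(filter_func)

    # without a squash, only the dataframe shrinks; the graph is unchanged
    assert len(filtered.dataframe.index) == expected_rows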
def test_traverse_pre():
    node = Node(Frame(name="a"))
    assert list(node.traverse(attrs="name")) == ["a"]

    node = Node.from_lists(["a", ["b", "d", "e"], ["c", "f", "g"]])
    assert list(node.traverse(attrs="name")) == ["a", "b", "d", "e", "c", "f", "g"]
def test_filter_squash_bunny():
    r"""Test squash on a complicated "bunny" shaped graph.

    This has multiple roots as well as multiple parents that themselves
    have parents.

          e   g
         / \ / \
        f   a   h    remove abc      e   g
           / \      ----------->    / \ / \
          b   c                    f   d   h
           \ /
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        Graph.from_lists(("e", new_d, "f"), ("g", new_d, "h")),
        [3, 1, 1, 3, 1],  # e, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        5,  # e, d, f, g, h
    )
def test_dag_equal():
    chain = Node.from_lists(("a", ("b", ("c", ("d",)))))

    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    tree = Node.from_lists(
        ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
    )

    assert chain.dag_equal(chain)
    assert chain.dag_equal(chain.copy())

    assert diamond.dag_equal(diamond)
    assert diamond.dag_equal(diamond.copy())

    assert tree.dag_equal(tree)
    assert tree.dag_equal(tree.copy())

    assert not chain.dag_equal(tree)
    assert not chain.dag_equal(diamond)

    assert not tree.dag_equal(chain)
    assert not tree.dag_equal(diamond)

    assert not diamond.dag_equal(chain)
    assert not diamond.dag_equal(tree)
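# NOTE: a hypothetical sketch of the DAG-equality check the test above
# exercises, assuming dag_equal compares frames and recurses over children
# in order. Illustrative only; not hatchet's actual implementation.
def dag_equal_sketch(a, b):
    """Return True if two node DAGs have matching frames and shape."""
    if a.frame != b.frame or len(a.children) != len(b.children):
        return False
    # recurse pairwise over children; a real implementation would also have
    # to handle shared subdags (the diamond case) without revisiting them
    return all(dag_equal_sketch(ca, cb) for ca, cb in zip(a.children, b.children))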
def test_filter_squash_bunny_to_goat():
    r"""Test squash on a "bunny" shaped graph.

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g                      e   g
         / \ / \                    /|\ /|\
        f   a   h    remove ac     f | b | h
           / \      ---------->      \ | /
          b   c                       \|/
           \ /                         d
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))
    new_b = Node.from_lists(("b", new_d))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, new_d, "f"), ("g", new_b, new_d, "h")),
        [4, 2, 1, 1, 4, 1],  # e, b, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        6,  # e, b, d, f, g, h
    )
def test_copy():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert g.copy() == g
def test_traverse_paths():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert list(g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
def test_from_lists():
    """Ensure we can traverse roots in correct order without repeating a
    shared subdag.
    """
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert list(g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
def test_dag_is_not_tree():
    g = Graph.from_lists(("b", "c"), ("d", "e"))
    assert not g.is_tree()

    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph([diamond_subdag])
    assert not g.is_tree()

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert not g.is_tree()
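# NOTE: a hypothetical sketch of the property test_dag_is_not_tree checks,
# assuming a Graph is a tree only when it has a single root and no node is
# reachable through more than one parent. Illustrative only; not hatchet's
# actual Graph.is_tree implementation.
def is_tree_sketch(graph):
    """Return True if the graph is a single-rooted tree."""
    if len(graph.roots) != 1:
        return False
    # every node in a tree has at most one parent (the root has none)
    return all(len(node.parents) <= 1 for node in graph.traverse())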
def __create_graph(self, rank, thread, list_roots, node_dicts):
    graph_data = self.dict["ranks"][str(rank)]["threads"][str(thread)]["regions"]

    for region, data in graph_data.items():
        frame = Frame({"type": "region", "name": data["name"]})
        node = Node(frame, None)
        contain_read_events = [0]
        metrics = self.__get_metrics(data, contain_read_events)
        node_dict = {
            "name": data["name"],
            "node": node,
            "rank": int(rank),
            "thread": int(thread),
            **metrics,
        }
        node_dicts.append(node_dict)

        # used to find node using parent_region_id
        self.node_graph_dict[int(region)] = [data["name"], node]

        if int(data["parent_region_id"]) == -1:
            list_roots.append(node)
        else:
            self.__add_child_node(int(data["parent_region_id"]), node)

        # check if we have to create child nodes for read events
        if contain_read_events[0] == 1:
            # check how many read calls are used
            read_num = len(data["cycles"])
            for i in range(1, read_num):
                node_name_read = "read_" + str(i)
                read_frame = Frame({"type": "region", "name": node_name_read})
                read_node = Node(read_frame, node)
                read_metrics = self.__get_read_metrics(data, node_name_read)
                node_dict = {
                    "name": node_name_read,
                    "node": read_node,
                    "rank": int(rank),
                    "thread": int(thread),
                    **read_metrics,
                }
                node_dicts.append(node_dict)
                node.add_child(read_node)
def test_path():
    d = Node(Frame(name="d"))
    node = Node.from_lists(["a", ["b", d]])

    assert d.path() == (Frame(name="a"), Frame(name="b"), Frame(name="d"))
    assert d.parents[0].path() == (Frame(name="a"), Frame(name="b"))
    assert node.path() == (Frame(name="a"),)

    assert d.path(attrs="name") == ("a", "b", "d")
    assert d.parents[0].path(attrs="name") == ("a", "b")
    assert node.path(attrs="name") == ("a",)
def test_path():
    d = Node(Frame(name="d", type="function"))
    node = Node.from_lists(["a", ["b", d]])

    assert d.path() == (
        Node(Frame(name="a")),
        Node(Frame(name="b")),
        Node(Frame(name="d", type="function")),
    )
    assert d.parents[0].path() == (Node(Frame(name="a")), Node(Frame(name="b")))
    assert node.path() == (Node(Frame(name="a")),)
def test_from_lists():
    node = Node.from_lists("a")
    assert node.frame == Frame(name="a")

    a = Frame(name="a")
    b = Frame(name="b")
    c = Frame(name="c")

    node = Node.from_lists(["a", ["b", "c"]])

    assert node.frame == a
    assert node.children[0].frame == b
    assert node.children[0].children[0].frame == c
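# NOTE: a hypothetical sketch of the recursive construction Node.from_lists
# performs, based on the nesting the tests in this file rely on: the first
# element of a list names the node, and every remaining element is a child
# (a bare string for a leaf, a nested list for a subtree, or an existing
# Node for a shared subdag). Illustrative only; not hatchet's actual
# implementation.
def from_lists_sketch(spec, parent=None):
    """Build a Node tree from a nested list specification."""
    if isinstance(spec, Node):
        # an existing Node is attached as-is, allowing shared subdags
        if parent is not None:
            spec.add_parent(parent)
        return spec
    if isinstance(spec, str):
        return Node(Frame(name=spec), parent)

    # a list/tuple: the first element is this node's name, the rest are children
    head, rest = spec[0], spec[1:]
    node = Node(Frame(name=head), parent)
    for child_spec in rest:
        node.add_child(from_lists_sketch(child_spec, node))
    return node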
def create_graph(self):
    list_roots = []
    global unknown_label_counter

    # find nodes in the nodes section that represent the path hierarchy
    for idx, node in enumerate(self.json_nodes):
        node_label = node["label"]
        if node_label == "":
            node_label = "UNKNOWN " + str(unknown_label_counter)
            unknown_label_counter += 1
        self.idx_to_label[idx] = node_label

        if node["column"] == self.path_col_name:
            if "parent" not in node:
                # since this node does not have a parent, this is a root
                graph_root = Node(
                    Frame({"type": self.node_type, "name": node_label}), None
                )
                list_roots.append(graph_root)

                node_dict = {
                    self.nid_col_name: idx,
                    "name": node_label,
                    "node": graph_root,
                }
                self.idx_to_node[idx] = node_dict
            else:
                parent_hnode = (self.idx_to_node[node["parent"]])["node"]
                hnode = Node(
                    Frame({"type": self.node_type, "name": node_label}),
                    parent_hnode,
                )
                parent_hnode.add_child(hnode)

                node_dict = {
                    self.nid_col_name: idx,
                    "name": node_label,
                    "node": hnode,
                }
                self.idx_to_node[idx] = node_dict

    return list_roots
def test_filter_squash_diamond():
    r"""Test that diamond edges are collapsed when squashing.

    Ensure we can handle the most basic DAG.

            a
           / \      remove bc     a
          b   c    ---------->    |
           \ /                    d
            d

    """
    d = Node(Frame(name="d"))
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        Graph.from_lists(("a", "d")),
        [2, 1],  # a, d
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        2,  # a, d
    )
def parse_node_literal(child_dict, hparent):
    """Create node_dict for one node and then call the function
    recursively on all children.
    """
    # NOTE: this function references self without taking it as a parameter;
    # it is presumably nested inside a reader method, so self is captured
    # from the enclosing scope.
    hnode = Node(
        Frame({"name": child_dict["function"], "type": "function"}), hparent
    )

    child_node_dict = {
        "node": hnode,
        "name": child_dict["function"],
        "file": child_dict["file_path_short"],
        "line": child_dict["line_no"],
        "time": child_dict["time"],
        "time (inc)": child_dict["time"],
        "is_application_code": child_dict["is_application_code"],
    }

    hparent.add_child(hnode)
    self.node_dicts.append(child_node_dict)

    if "children" in child_dict:
        for child in child_dict["children"]:
            # Pyinstrument's time metric actually stores inclusive time.
            # To calculate exclusive time, we subtract the children's time
            # from the parent's time.
            child_node_dict["time"] -= child["time"]
            parse_node_literal(child, hnode)
def test_union_dag():
    # make graphs g1, g2, and g3, where you know g3 is the union of g1 and g2
    c = Node.from_lists(("c", "d"))
    g1 = Graph.from_lists(("a", ("b", c), ("e", c, "f")))

    d = Node(Frame(name="d"))
    g2 = Graph.from_lists(("a", ("b", ("c", d)), ("e", d, "f")))

    d2 = Node(Frame(name="d"))
    c2 = Node.from_lists(("c", d2))
    g3 = Graph.from_lists(("a", ("b", c2), ("e", c2, d2, "f")))

    assert g1 != g2

    g4 = g1.union(g2)
    assert g4 == g3
def read(self):
    list_roots = []
    node_dicts = []
    frame_to_node_dict = {}
    frame = None

    seen_nids = []
    hnid = -1

    # start with creating a node_dict for each root
    for i in range(len(self.graph_dict)):
        if "_hatchet_nid" in self.graph_dict[i]["metrics"]:
            hnid = self.graph_dict[i]["metrics"]["_hatchet_nid"]
            seen_nids.append(hnid)
        frame = Frame(self.graph_dict[i]["frame"])
        graph_root = Node(frame, None, hnid=hnid)

        # depending on the node type, the name may not be in the frame
        node_name = self.graph_dict[i]["frame"].get("name")
        if not node_name:
            node_name = self.graph_dict[i]["name"]

        node_dict = dict(
            {"node": graph_root, "name": node_name}, **self.graph_dict[i]["metrics"]
        )
        node_dicts.append(node_dict)
        list_roots.append(graph_root)
        frame_to_node_dict[frame] = graph_root

        # call recursively on all children of root
        if "children" in self.graph_dict[i]:
            for child in self.graph_dict[i]["children"]:
                self.parse_node_literal(
                    frame_to_node_dict, node_dicts, child, graph_root, seen_nids
                )

    graph = Graph(list_roots)

    # test if nids are already loaded
    if -1 in [n._hatchet_nid for n in graph.traverse()]:
        graph.enumerate_traverse()
    else:
        graph.enumerate_depth()

    # metric column names are taken from the last root's metrics
    # (this reuses the loop variable i from the loop above)
    exc_metrics = []
    inc_metrics = []
    for key in self.graph_dict[i]["metrics"].keys():
        if "(inc)" in key:
            inc_metrics.append(key)
        else:
            exc_metrics.append(key)

    dataframe = pd.DataFrame(data=node_dicts)
    dataframe.set_index(["node"], inplace=True)
    dataframe.sort_index(inplace=True)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def parse_node_literal(self, frame_to_node_dict, node_dicts, child_dict, hparent):
    """Create node_dict for one node and then call the function
    recursively on all children.
    """
    frame = Frame(child_dict["frame"])

    if "duplicate" not in child_dict:
        hnode = Node(frame, hparent)

        # depending on the node type, the name may not be in the frame
        node_name = child_dict["frame"].get("name")
        if not node_name:
            node_name = child_dict["name"]

        node_dict = dict({"node": hnode, "name": node_name}, **child_dict["metrics"])
        node_dicts.append(node_dict)
        frame_to_node_dict[frame] = hnode
    else:
        hnode = frame_to_node_dict.get(frame)
        if not hnode:
            hnode = Node(frame, hparent)

            # depending on the node type, the name may not be in the frame
            node_name = child_dict["frame"].get("name")
            if not node_name:
                node_name = child_dict["name"]

            node_dict = dict(
                {"node": hnode, "name": node_name}, **child_dict["metrics"]
            )
            node_dicts.append(node_dict)
            frame_to_node_dict[frame] = hnode

    hparent.add_child(hnode)

    if "children" in child_dict:
        for child in child_dict["children"]:
            self.parse_node_literal(frame_to_node_dict, node_dicts, child, hnode)
def test_paths():
    d = Node(Frame(name="d"))
    Node.from_lists(["a", ["b", d], ["c", d]])

    with pytest.raises(MultiplePathError):
        d.path()

    assert d.paths() == [
        (Frame(name="a"), Frame(name="b"), Frame(name="d")),
        (Frame(name="a"), Frame(name="c"), Frame(name="d")),
    ]

    assert d.paths(attrs="name") == [("a", "b", "d"), ("a", "c", "d")]
def test_traverse_post():
    node = Node.from_lists(["a", ["b", "d", "e"], ["c", "f", "g"]])
    assert list(node.traverse(order="post", attrs="name")) == [
        "d",
        "e",
        "b",
        "f",
        "g",
        "c",
        "a",
    ]
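# NOTE: a hypothetical sketch of the pre/post-order traversal exercised by
# test_traverse_pre and test_traverse_post, assuming traverse() yields each
# node exactly once (a visited set guards shared subdags). Illustrative only;
# not hatchet's actual Node.traverse implementation.
def traverse_sketch(node, order="pre", visited=None):
    """Yield nodes of a DAG in pre- or post-order, each exactly once."""
    if visited is None:
        visited = set()
    if id(node) in visited:
        return
    visited.add(id(node))

    if order == "pre":
        yield node
    for child in node.children:
        for descendant in traverse_sketch(child, order, visited):
            yield descendant
    if order == "post":
        yield node


# e.g. [n.frame["name"] for n in traverse_sketch(root)] should reproduce the
# pre-order name list asserted in test_traverse_pre above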
def test_to_literal_node_ids():
    r"""Test to_literal and from_literal with ids on a graph with cycles,
    multiple parents and children.

        a --
       / \ /
      b   c
       \ /
        d
       / \
      e   f

    """
    a = Node(Frame(name="a"))
    d = Node(Frame(name="d"))

    gf = GraphFrame.from_lists([a, ["b", [d]], ["c", [d, ["e"], ["f"]], [a]]])

    lit_list = gf.to_literal()
    gf2 = gf.from_literal(lit_list)

    lit_list2 = gf2.to_literal()

    assert lit_list == lit_list2
def parse_node_literal(
    self, frame_to_node_dict, node_dicts, child_dict, hparent, seen_nids
):
    """Create node_dict for one node and then call the function
    recursively on all children.
    """
    # pull out _hatchet_nid if it exists so it will not be inserted into
    # the dataframe like a normal metric
    hnid = -1
    if "_hatchet_nid" in child_dict["metrics"]:
        hnid = child_dict["metrics"]["_hatchet_nid"]
    frame = Frame(child_dict["frame"])

    if hnid not in seen_nids:
        hnode = Node(frame, hparent, hnid=hnid)

        # depending on the node type, the name may not be in the frame
        node_name = child_dict["frame"].get("name")
        if not node_name:
            node_name = child_dict["name"]

        node_dict = dict({"node": hnode, "name": node_name}, **child_dict["metrics"])
        node_dicts.append(node_dict)
        frame_to_node_dict[frame] = hnode
        if hnid != -1:
            seen_nids.append(hnid)
    else:
        hnode = frame_to_node_dict.get(frame)

    hparent.add_child(hnode)

    if "children" in child_dict:
        for child in child_dict["children"]:
            self.parse_node_literal(
                frame_to_node_dict, node_dicts, child, hnode, seen_nids
            )
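# NOTE: an illustrative example of the literal format the parse_node_literal
# variants above consume, using only the keys the code actually references
# ("frame", "name", "metrics", "children", the optional "duplicate" flag,
# and the optional "_hatchet_nid" metric). The metric names and values here
# are made up.
example_graph_dict = [
    {
        "frame": {"name": "main", "type": "function"},
        "name": "main",
        "metrics": {"time": 1.0, "time (inc)": 3.0, "_hatchet_nid": 0},
        "children": [
            {
                "frame": {"name": "foo", "type": "function"},
                "name": "foo",
                "metrics": {"time": 2.0, "time (inc)": 2.0, "_hatchet_nid": 1},
                "children": [],
            },
            {
                # a repeated frame marked "duplicate" reuses the node created
                # for the first occurrence, producing a DAG rather than a tree
                "frame": {"name": "foo", "type": "function"},
                "name": "foo",
                "duplicate": True,
                "metrics": {"time": 0.0, "time (inc)": 0.0, "_hatchet_nid": 1},
            },
        ],
    },
]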
def _create_node_and_row(self, fn_data, fn_name, stats_dict):
    """Takes a profiled function as specified in a pstats file and creates
    a node for it. Also adds a new line of metadata to our dataframe if one
    does not exist yet.
    """
    u_fn_name = "{}:{}:{}".format(
        fn_name,
        fn_data[NameData.FILE].split("/")[-1],
        fn_data[NameData.LINE],
    )
    fn_hnode = self.name_to_hnode.get(u_fn_name)

    if not fn_hnode:
        # create a node if it doesn't exist yet
        fn_hnode = Node(Frame({"type": "function", "name": fn_name}), None)
        self.name_to_hnode[u_fn_name] = fn_hnode

        # lookup stat data for source here
        fn_stats = stats_dict[fn_data]
        self._add_node_metadata(u_fn_name, fn_data, fn_stats, fn_hnode)

    return fn_hnode
def test_output_with_cycle_graphs():
    r"""Test three output modes on a graph with cycles,
    multiple parents and children.

        a --
       / \ /
      b   c
       \ /
        d
       / \
      e   f

    """
    dot_edges = [
        # d has two parents and two children
        '"1" -> "2";',
        '"5" -> "2";',
        '"2" -> "3";',
        '"2" -> "4";',
        # a -> c -> a cycle
        '"0" -> "5";',
        '"5" -> "0";',
    ]

    a = Node(Frame(name="a"))
    d = Node(Frame(name="d"))

    gf = GraphFrame.from_lists([a, ["b", [d]], ["c", [d, ["e"], ["f"]], [a]]])

    lit_list = gf.to_literal()
    treeout = gf.tree()
    dotout = gf.to_dot()

    # scan through the dictionary produced by to_literal for edges
    a_children = [n["frame"]["name"] for n in lit_list[0]["children"]]
    a_c_children = [
        n["frame"]["name"] for n in lit_list[0]["children"][1]["children"]
    ]
    a_b_children = [
        n["frame"]["name"] for n in lit_list[0]["children"][0]["children"]
    ]

    assert len(lit_list) == 1
    assert len(a_children) == 2

    # a -> (b, c)
    assert "b" in a_children
    assert "c" in a_children

    # a -> c -> a cycle
    assert "a" in a_c_children

    # d has two parents
    assert "d" in a_c_children
    assert "d" in a_b_children

    # check that certain edges are in the dot output
    for edge in dot_edges:
        assert edge in dotout

    # check that the expected number of occurrences of the same node appear
    # in the tree output, indicating multiple edges
    assert treeout.count("a") == 2
    assert treeout.count("d") == 2
    assert treeout.count("e") == 1
    assert treeout.count("f") == 1
def create_graph(self):
    def _get_name_file_module(is_parent, node_info, symbol):
        """This function gets the name, file, and module information for a
        node using the corresponding line in the output file.

        Example line: [UNWIND] <file> [@] <name> [{<file_or_module>} {<line>}]

        There are several line formats in TAU, and this function gets the
        node information considering all these formats, for which examples
        are given below.
        """
        name, file, module = None, None, None

        # There are several different formats in TAU outputs.
        # There might be file, line, and module information.
        # The following if-else block covers all possible output
        # formats. Example formats are given in comments.
        if symbol == " [@] ":
            # Check if there is a [@] symbol.
            node_info = node_info.split(symbol)
            # We don't need file and module information if it's a parent node.
            if not is_parent:
                file = node_info[0].split()[1]
                if "[{" in node_info[1]:
                    # Sometimes we see file and module information inside of [{}]
                    # Example: [UNWIND] <file> [@] <name> [{<file_or_module>} {<line>}]
                    name_and_module = node_info[1].split(" [{")
                    module = name_and_module[1].split()[0].strip("}")
                else:
                    # Example: [UNWIND] <file> [@] <name> <module>
                    name_and_module = node_info[1].split()
                    module = name_and_module[1]

                # Check if module is in file.
                # Assign None to file if it's .so.
                # Assign None to module if it's .c.
                if module in file:
                    if ".so" in file:
                        file = None
                    if ".c" in module:
                        module = None
                name = "[UNWIND] " + name_and_module[0]
            else:
                # We just need to take the name if it is a parent
                name = "[UNWIND] " + node_info[1].split()[0]
        elif symbol == " C ":
            # Check if there is a C symbol.
            # "C" symbol means it's a C function.
            node_info = node_info.split(symbol)
            name = node_info[0]
            # We don't need file and module information if it's a parent node.
            if not is_parent:
                if "[{" in node_info[1]:
                    # Example: <name> C [{<file>} {<line>}]
                    node_info = node_info[1].split()
                    file = node_info[0].strip("}[{")
        else:
            if "[{" in node_info:
                # If there isn't C or [@]
                # Example: [<type>] <name> [{} {}]
                node_info = node_info.split(" [{")
                name = node_info[0]
                # We don't need file and module information if it's a parent node.
                if not is_parent:
                    file = node_info[1].split()[0].strip("}{")
            else:
                # Example 1: [<type>] <name> <module>
                # Example 2: [<type>] <name>
                # Example 3: <name>
                name = node_info
                node_info = node_info.split()
                # We need to take module information from the first example.
                # Another example is "[CONTEXT] .TAU application", which
                # conflicts with the first example. So we check if there is a
                # "/" symbol, which indicates module information in this case.
                if len(node_info) == 3 and "/" in name:
                    name = node_info[0] + " " + node_info[1]
                    # We don't need file and module information if it's a parent node.
                    if not is_parent:
                        module = node_info[2]

        return [name, file, module]

    def _get_line_numbers(node_info):
        start_line, end_line = 0, 0
        # There should be [{}] symbols if there is line number information.
        if "[{" in node_info:
            tmp_module_or_file_line = (
                re.search(r"\{.*\}\]", node_info).group(0).split()
            )
            line_numbers = tmp_module_or_file_line[1].strip("}]").replace("{", "")
            start_line = line_numbers
            if "-" in line_numbers:
                # Sometimes there is "-" between the start line and end line.
                # Example: {341,1}-{396,1}
                line_numbers = line_numbers.split("-")
                start_line = line_numbers[0].split(",")[0]
                end_line = line_numbers[1].split(",")[0]
            else:
                if "," in line_numbers:
                    # Sometimes we don't have "-".
                    # Example: {15,0}
                    start_line = line_numbers.split(",")[0]
                    end_line = line_numbers.split(",")[1]
        return [start_line, end_line]

    def _create_parent(child_node, parent_callpath):
        """In TAU output, sometimes we see a node as a parent in the callpath
        before we see it as a leaf node. In this case, we need to create a
        hatchet node for the parent.

        We can't create a node_dict for the parent because we don't know its
        metric values when we first see it in a callpath.

        Example: a => b => c "<c_metric_values>"
        Here, if we haven't seen 'b' before, we should create it when we
        create 'c'.

        This function recursively creates parent nodes in a callpath until
        it reaches the already existing parent in that callpath.
        """
        parent_node = self.callpath_to_node.get(parent_callpath)

        # Return if we arrive at an existing parent;
        # else create a parent and add parent/child.
        if parent_node is not None:
            parent_node.add_child(child_node)
            child_node.add_parent(parent_node)
            return
        else:
            grand_parent_callpath = parent_callpath[:-1]
            parent_info = parent_callpath[-1]
            parent_name = ""

            if " C " in parent_info:
                parent_name = _get_name_file_module(True, parent_info, " C ")[0]
            elif " [@] " in parent_info:
                parent_name = _get_name_file_module(True, parent_info, " [@] ")[0]
            else:
                parent_name = _get_name_file_module(True, parent_info, "")[0]

            parent_node = Node(
                Frame({"type": "function", "name": parent_name}), None
            )

            self.callpath_to_node[parent_callpath] = parent_node

            parent_node.add_child(child_node)
            child_node.add_parent(parent_node)
            _create_parent(parent_node, grand_parent_callpath)

    def _construct_column_list(first_rank_filenames):
        """This function constructs columns, exc_metrics, and inc_metrics
        using all metric files of a rank. It gets all the metric files of a
        rank as a tuple and only loads the second line (metadata) of these
        files.
        """
        columns = []
        for file_index in range(len(first_rank_filenames)):
            with open(first_rank_filenames[file_index], "r") as f:
                # Skip the first line: "192 templated_functions_MULTI_TIME"
                next(f)
                # No need to check if the metadata is the same for all metric files.
                metadata = next(f)

                # Get the first three columns from
                # "# Name Calls Subrs Excl Incl ProfileCalls #"
                # ProfileCalls is removed since it is typically set to 0 and
                # not used. We only do this once since these column names are
                # the same for all files.
                if file_index == 0:
                    columns.extend(
                        re.match(r"\#\s(.*)\s\#", metadata).group(1).split(" ")[:-3]
                    )

                # Example metric_name: "PAPI_L2_TCM"
                # TODO: Decide if Calls and Subrs should be inc or exc metrics
                metric_name = re.search(r"<value>(.*?)<\/value>", metadata).group(1)
                if metric_name == "CPU_TIME" or metric_name == "TIME":
                    metric_name = "time"
                elif metric_name == "Name":
                    metric_name = "name"
                columns.extend([metric_name, metric_name + " (inc)"])
                self.exc_metrics.append(metric_name)
                self.inc_metrics.append(metric_name + " (inc)")
        return columns

    # dirpath -> returns path of a directory, string
    # dirnames -> returns directory names, list
    # files -> returns filenames in a directory, list
    profile_filenames = []
    for dirpath, dirnames, files in os.walk(self.dirname):
        profiles_in_dir = glob.glob(dirpath + "/profile.*")
        if profiles_in_dir:
            # sort input files in each directory in the same order
            profile_filenames.append(sorted(profiles_in_dir))

    # Store all files in a list of tuples.
    # Each tuple stores all the metric files of a rank.
    # We process one rank at a time.
    # Example: [(metric1/profile.x.0.0, metric2/profile.x.0.0), ...]
    profile_filenames = list(zip(*profile_filenames))

    # Get column information from the metric files of a rank.
    self.columns = _construct_column_list(profile_filenames[0])

    list_roots = []
    prev_rank, prev_thread = 0, 0

    # Example filenames_per_rank: (metric1/profile.x.0.0, ...)
    for filenames_per_rank in profile_filenames:
        file_info = filenames_per_rank[0].split(".")
        rank, thread = int(file_info[-3]), int(file_info[-1])
        if not self.multiple_ranks:
            self.multiple_ranks = True if rank != prev_rank else False
        if not self.multiple_threads:
            self.multiple_threads = True if thread != prev_thread else False

        # Load all files that represent a different metric for a rank or thread.
        # If there are 2 metrics, load metric1/profile.x.0.0 and metric2/profile.x.0.0
        file_data = []
        for f_index in range(len(filenames_per_rank)):
            # Store the lines after metadata.
            file_data.append(open(filenames_per_rank[f_index], "r").readlines()[2:])

        # Get the root information from only the first file to compare it
        # with the others.
        # Example: ".TAU application" 1 1 272 15755429 0 GROUP="TAU_DEFAULT"
        root_line = re.match(r"\"(.*)\"\s(.*)\sG", file_data[0][0])
        root_name = root_line.group(1).strip(" ")
        # convert it to a tuple to use it as a key in the callpath_to_node dictionary
        root_callpath = tuple([root_name])
        root_values = list(map(int, root_line.group(2).split(" ")[:-1]))

        # After the first profile.0.0.0, only get Excl and Incl metric values
        # from the other files since the other columns will be the same.
        # We assume each metric file of a rank has the same root.
        first_file_root_name = re.search(r"\"(.*?)\"", file_data[0][0]).group(1)
        for f_index in range(1, len(file_data)):
            root_name = re.search(r"\"(.*?)\"", file_data[f_index][0]).group(1)
            # The assert statement below throws an error if the roots are not
            # the same across the metric files of a rank.
            # TODO: We need to find a solution if this throws an error.
            assert first_file_root_name == root_name, (
                "Metric files for a rank have different roots.\n"
                + "File: "
                + filenames_per_rank[f_index]
                + "\nLine: 2"
            )
            root_line = re.match(r"\"(.*)\"\s(.*)\sG", file_data[f_index][0])
            root_values.extend(list(map(int, root_line.group(2).split(" ")[2:4])))

        # Check if the root exists in other ranks.
        # Note that we assume the root is the same for all metric files of a rank.
        if root_callpath not in self.callpath_to_node:
            # Create the root node since it doesn't exist
            root_node = Node(Frame({"name": root_name, "type": "function"}), None)

            # Store callpaths to identify nodes
            self.callpath_to_node[root_callpath] = root_node
            list_roots.append(root_node)
        else:
            # Don't create a new node since it was created earlier
            root_node = self.callpath_to_node.get(root_callpath)

        node_dict = self.create_node_dict(
            root_node,
            self.columns,
            root_values,
            root_name,
            None,
            None,
            0,
            0,
            rank,
            thread,
        )
        self.node_dicts.append(node_dict)

        # Start from the line after the root.
        # Iterate over only the first metric file of a rank since the lines
        # should be exactly the same across all metric files of a rank.
        # Use the same "line_index" for the other metric files of a rank.
        for line_index in range(1, len(file_data[0])):
            line = file_data[0][line_index]
            metric_values = []
            # We only parse the lines that have the "=>" symbol, which shows
            # the callpath info. We just skip the other lines.
            if "=>" in line:
                # Example: ".TAU application => foo() => bar()" 31 0 155019 155019 0 GROUP="TAU_SAMPLE|TAU_CALLPATH"
                callpath_line_regex = re.match(r"\"(.*)\"\s(.*)\sG", line)
                # callpath: ".TAU application => foo() => bar()"
                callpath = [
                    name.strip(" ")
                    for name in callpath_line_regex.group(1).split("=>")
                ]
                # Example leaf_name: StrToInt [{lulesh-util.cc} {13,1}-{29,1}]
                leaf_name = callpath[-1]
                callpath = tuple(callpath)
                parent_callpath = callpath[:-1]
                # Don't include the value for ProfileCalls.
                # metric_values: 31 0 155019 155019
                metric_values = list(
                    map(float, callpath_line_regex.group(2).split(" ")[:-1])
                )

                # Get start and end line information
                leaf_line_numbers = _get_line_numbers(leaf_name)

                # Get name, file, and module information using the leaf name
                # and the symbol on it
                if " C " in leaf_name:
                    leaf_name_file_module = _get_name_file_module(
                        False, leaf_name, " C "
                    )
                elif " [@] " in leaf_name:
                    leaf_name_file_module = _get_name_file_module(
                        False, leaf_name, " [@] "
                    )
                else:
                    leaf_name_file_module = _get_name_file_module(
                        False, leaf_name, ""
                    )

                # Example: ".TAU application => foo() => bar()" 31 0 155019...
                first_file_callpath_line = re.search(
                    r"\"(.*?)\"", file_data[0][line_index]
                ).group(1)

                # After the first profile.x.0.0, only get Excl and Incl
                # metric values from the other files.
                for f_index in range(1, len(file_data)):
                    other_file_callpath_line = re.search(
                        r"\"(.*?)\"", file_data[f_index][line_index]
                    ).group(1)
                    # We assume the metric files of a rank have exactly the
                    # same lines. The only difference should be the Incl and
                    # Excl metric values.
                    # TODO: We should find a solution if this raises an error.
                    assert first_file_callpath_line == other_file_callpath_line, (
                        "Lines across metric files for a rank are not the same.\n"
                        + "File: "
                        + filenames_per_rank[f_index]
                        + "\nLine: "
                        + str(line_index + 3)
                    )
                    # Get the information from the same "line_index" in each file.
                    callpath_line_regex = re.match(
                        r"\"(.*)\"\s(.*)\sG", file_data[f_index][line_index]
                    )
                    metric_values.extend(
                        map(float, callpath_line_regex.group(2).split(" ")[2:4])
                    )

                leaf_node = self.callpath_to_node.get(callpath)
                # Check if that node was created earlier
                if leaf_node is None:
                    # Create the node since it doesn't exist
                    leaf_node = Node(
                        Frame(
                            {"type": "function", "name": leaf_name_file_module[0]}
                        ),
                        None,
                    )
                    self.callpath_to_node[callpath] = leaf_node

                    # Get its parent from its callpath.
                    parent_node = self.callpath_to_node.get(parent_callpath)

                    if parent_node is None:
                        # Create the parent if it doesn't exist.
                        _create_parent(leaf_node, parent_callpath)
                    else:
                        parent_node.add_child(leaf_node)
                        leaf_node.add_parent(parent_node)

                node_dict = self.create_node_dict(
                    leaf_node,
                    self.columns,
                    metric_values,
                    leaf_name_file_module[0],  # name
                    leaf_name_file_module[1],  # file
                    leaf_name_file_module[2],  # module
                    leaf_line_numbers[0],  # start line
                    leaf_line_numbers[1],  # end line
                    rank,
                    thread,
                )
                self.node_dicts.append(node_dict)

    return list_roots
def read(self):
    """Read the caliper JSON file to extract the calling context tree."""
    with self.timer.phase("read json"):
        self.read_json_sections()

    with self.timer.phase("graph construction"):
        list_roots = self.create_graph()

    # create a dataframe of metrics from the data section
    self.df_json_data = pd.DataFrame(self.json_data, columns=self.json_cols)

    # map non-numeric columns to their mappings in the nodes section
    for idx, item in enumerate(self.json_cols_mdata):
        if item["is_value"] is False and self.json_cols[idx] != self.nid_col_name:
            if self.json_cols[idx] == "sourceloc#cali.sampler.pc":
                # split source file and line number into two columns
                self.df_json_data["file"] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(
                    lambda x: re.match(
                        r"(.*):(\d+)", self.json_nodes[x]["label"]
                    ).group(1)
                )
                self.df_json_data["line"] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(
                    lambda x: re.match(
                        r"(.*):(\d+)", self.json_nodes[x]["label"]
                    ).group(2)
                )
                self.df_json_data.drop(self.json_cols[idx], axis=1, inplace=True)
                sourceloc_idx = idx
            else:
                self.df_json_data[self.json_cols[idx]] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(lambda x: self.json_nodes[x]["label"])

    # since we split sourceloc, we should update json_cols and json_cols_mdata
    if "sourceloc#cali.sampler.pc" in self.json_cols:
        self.json_cols.pop(sourceloc_idx)
        self.json_cols_mdata.pop(sourceloc_idx)
        self.json_cols.append("file")
        self.json_cols.append("line")
        self.json_cols_mdata.append({"is_value": False})
        self.json_cols_mdata.append({"is_value": False})

    max_nid = self.df_json_data[self.nid_col_name].max()

    if "line" in self.df_json_data.columns:
        # split nodes that have multiple file:line numbers to have a child
        # each with a unique file:line number
        unique_nodes = self.df_json_data.groupby(self.nid_col_name)
        df_concat = [self.df_json_data]

        for nid, super_node in unique_nodes:
            line_groups = super_node.groupby("line")
            # only need to do something if there is more than one
            # file:line number entry for the node
            if len(line_groups.size()) > 1:
                sn_hnode = self.idx_to_node[nid]["node"]

                for line, line_group in line_groups:
                    # create the node label
                    file_path = (line_group.head(1))["file"].item()
                    file_name = os.path.basename(file_path)
                    node_label = file_name + ":" + line

                    # create a new hatchet node
                    max_nid += 1
                    idx = max_nid
                    hnode = Node(
                        Frame(
                            {"type": "statement", "file": file_path, "line": line}
                        ),
                        sn_hnode,
                    )
                    sn_hnode.add_child(hnode)

                    node_dict = {
                        self.nid_col_name: idx,
                        "name": node_label,
                        "node": hnode,
                    }
                    self.idx_to_node[idx] = node_dict

                    # change nid of the original node to new node in place
                    for index, row in line_group.iterrows():
                        self.df_json_data.loc[index, "nid"] = max_nid

                # add a new row for the original node
                node_copy = super_node.head(1).copy()
                for cols in self.metric_columns:
                    node_copy[cols] = 0
                df_concat.append(node_copy)

        # concatenate all the newly created dataframes with self.df_json_data
        self.df_fixed_data = pd.concat(df_concat)
    else:
        self.df_fixed_data = self.df_json_data

    # create a dataframe with all nodes in the call graph
    self.df_nodes = pd.DataFrame.from_dict(data=list(self.idx_to_node.values()))

    # add missing intermediate nodes to the df_fixed_data dataframe
    if "rank" in self.json_cols:
        self.num_ranks = self.df_fixed_data["rank"].max() + 1
        rank_list = range(0, self.num_ranks)

    # create a standard dict to be used for filling all missing rows
    default_metric_dict = {}
    for idx, item in enumerate(self.json_cols_mdata):
        if self.json_cols[idx] != self.nid_col_name:
            if item["is_value"] is True:
                default_metric_dict[self.json_cols[idx]] = 0
            else:
                default_metric_dict[self.json_cols[idx]] = None

    # create a list of dicts, one dict for each missing row
    missing_nodes = []
    for iteridx, row in self.df_nodes.iterrows():
        # check if df_nodes row exists in df_fixed_data
        metric_rows = self.df_fixed_data.loc[
            self.df_fixed_data[self.nid_col_name] == row[self.nid_col_name]
        ]
        if "rank" not in self.json_cols:
            if metric_rows.empty:
                # add a single row
                node_dict = dict(default_metric_dict)
                node_dict[self.nid_col_name] = row[self.nid_col_name]
                missing_nodes.append(node_dict)
        else:
            if metric_rows.empty:
                # add a row per MPI rank
                for rank in rank_list:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)
            elif len(metric_rows) < self.num_ranks:
                # add a row for each missing MPI rank
                present_ranks = metric_rows["rank"].values
                missing_ranks = [x for x in rank_list if x not in present_ranks]
                for rank in missing_ranks:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)

    self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
    self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

    # create a graph object once all the nodes have been added
    graph = Graph(list_roots)
    graph.enumerate_traverse()

    # merge the metrics and node dataframes on the idx column
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on=self.nid_col_name)
        # set the index to be a MultiIndex
        indices = ["node"]
        if "rank" in self.json_cols:
            indices.append("rank")
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def parse_xml_node(self, xml_node, parent_nid, parent_line, hparent):
    """Parses an XML node and its children recursively."""
    nid = int(xml_node.get("i"))

    global src_file
    xml_tag = xml_node.tag

    if xml_tag == "PF" or xml_tag == "Pr":
        # procedure
        name = self.procedure_names[xml_node.get("n")]
        if parent_line != 0:
            name = str(parent_line) + ":" + name
        src_file = xml_node.get("f")
        line = int(xml_node.get("l"))

        hnode = Node(Frame({"type": "function", "name": name}), hparent)
        node_dict = self.create_node_dict(
            nid,
            hnode,
            name,
            xml_tag,
            self.src_files[src_file],
            line,
            self.load_modules[xml_node.get("lm")],
        )
    elif xml_tag == "L":
        # loop
        src_file = xml_node.get("f")
        line = int(xml_node.get("l"))
        name = "Loop@" + os.path.basename(self.src_files[src_file]) + ":" + str(line)

        hnode = Node(
            Frame({"type": "loop", "file": self.src_files[src_file], "line": line}),
            hparent,
        )
        node_dict = self.create_node_dict(
            nid, hnode, name, xml_tag, self.src_files[src_file], line, None
        )
    elif xml_tag == "S":
        # statement
        line = int(xml_node.get("l"))
        # this might not be required for resolving conflicts
        name = os.path.basename(self.src_files[src_file]) + ":" + str(line)

        hnode = Node(
            Frame(
                {
                    "type": "statement",
                    "file": self.src_files[src_file],
                    "line": line,
                }
            ),
            hparent,
        )
        node_dict = self.create_node_dict(
            nid, hnode, name, xml_tag, self.src_files[src_file], line, None
        )

        # when we reach statement nodes, we subtract their exclusive
        # metric values from the parent's values
        for i, column in enumerate(self.metric_columns):
            if "(inc)" not in column and "(I)" not in column:
                _crm.subtract_exclusive_metric_vals(
                    nid,
                    parent_nid,
                    self.np_metrics.T[i],
                    self.total_execution_threads,
                    self.num_nodes,
                )

    if xml_tag == "C" or (
        xml_tag == "Pr" and self.procedure_names[xml_node.get("n")] == ""
    ):
        # do not add a node to the graph if the xml_tag is a callsite
        # or if it is a procedure with no name
        # for Prs, the preceding Pr has the calling line number and for
        # PFs, the preceding C has the line number
        line = int(xml_node.get("l"))
        self.parse_xml_children(xml_node, hparent)
    else:
        self.node_dicts.append(node_dict)
        hparent.add_child(hnode)
        self.parse_xml_children(xml_node, hnode)
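# NOTE: a minimal, made-up sketch of the experiment.xml fragment the parser
# above walks, using only the tags (PF, Pr, L, S, C) and attributes
# (i, n, f, l, lm) that parse_xml_node actually reads; the attribute values
# here are placeholder ids into the name/file/module tables, and real
# HPCToolkit databases contain much more.
#
# <PF i="2" n="42" f="3" l="10" lm="1">      <!-- procedure frame -->
#   <L i="3" f="3" l="12">                   <!-- loop -->
#     <S i="4" l="13"/>                      <!-- statement -->
#   </L>
#   <C i="5" l="15">                         <!-- callsite: not a graph node -->
#     <PF i="6" n="43" f="3" l="1" lm="1"/>
#   </C>
# </PF>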
def read(self):
    """Read the experiment.xml file to extract the calling context tree and
    create a dataframe out of it. Then merge the two dataframes to create
    the final dataframe.

    Return:
        (GraphFrame): new GraphFrame with HPCToolkit data.
    """
    with self.timer.phase("fill tables"):
        self.fill_tables()

    with self.timer.phase("read metric db"):
        self.read_all_metricdb_files()

    list_roots = []

    # parse the ElementTree to generate a calling context tree
    for root in self.callpath_profile.findall("PF"):
        global src_file

        nid = int(root.get("i"))
        src_file = root.get("f")

        # start with the root and create the callpath and node for the root,
        # and also a corresponding node_dict to be inserted into the dataframe
        graph_root = Node(
            Frame(
                {"type": "function", "name": self.procedure_names[root.get("n")]}
            ),
            None,
        )
        node_dict = self.create_node_dict(
            nid,
            graph_root,
            self.procedure_names[root.get("n")],
            "PF",
            self.src_files[src_file],
            int(root.get("l")),
            self.load_modules[root.get("lm")],
        )

        self.node_dicts.append(node_dict)
        list_roots.append(graph_root)

        # start graph construction at the root
        with self.timer.phase("graph construction"):
            self.parse_xml_children(root, graph_root)

        # put updated metrics back in the dataframe
        for i, column in enumerate(self.metric_columns):
            if "(inc)" not in column and "(I)" not in column:
                self.df_metrics[column] = self.np_metrics.T[i]

    with self.timer.phase("graph construction"):
        graph = Graph(list_roots)
        graph.enumerate_traverse()

    # create a dataframe for all the nodes in the graph
    self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

    # merge the metrics and node dataframes
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

        # set the index to be a MultiIndex
        if self.num_threads_per_rank > 1:
            indices = ["node", "rank", "thread"]
        # if the number of threads per rank is 1, do not make thread an index
        elif self.num_threads_per_rank == 1:
            indices = ["node", "rank"]
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column or "(I)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)