Example #1
def test_filter_squash_bunny_to_goat_with_merge():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and the two b nodes, which merge
    when squashed.

          e   g
         / \ / \
        f   a   h    remove ac      e   g
           / \      ---------->    / \ / \
          b   c                   f   b   h
           \ /
            b

    """
    b = Node(Frame(name="b"))
    diamond = Node.from_lists(("a", ("b", b), ("c", b)))

    new_b = Node(Frame(name="b"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, "f"), ("g", new_b, "h")),
        [4, 2, 1, 4, 1],  # e, b, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        5,  # e, b, f, g, h
    )
Example #2
def test_traverse_pre():
    node = Node(Frame(name="a"))
    assert list(node.traverse(attrs="name")) == ["a"]

    node = Node.from_lists(["a", ["b", "d", "e"], ["c", "f", "g"]])
    assert list(
        node.traverse(attrs="name")) == ["a", "b", "d", "e", "c", "f", "g"]
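
The traversal above is pre-order: a node is yielded before its children, which is why "b" and its whole subtree come before "c". A minimal standalone sketch of the same idea, using a hypothetical SimpleNode class rather than hatchet's Node:

# Minimal pre-order sketch; SimpleNode is a stand-in for hatchet's
# Node, not its actual implementation.
class SimpleNode:
    def __init__(self, name, children=()):
        self.name = name
        self.children = list(children)

def traverse_pre(node):
    """Yield names parent-first, then each subtree in order."""
    yield node.name
    for child in node.children:
        yield from traverse_pre(child)

root = SimpleNode("a", [
    SimpleNode("b", [SimpleNode("d"), SimpleNode("e")]),
    SimpleNode("c", [SimpleNode("f"), SimpleNode("g")]),
])
assert list(traverse_pre(root)) == ["a", "b", "d", "e", "c", "f", "g"]
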
Example #3
def test_filter_squash_bunny():
    r"""Test squash on a complicated "bunny" shaped graph.

    This has multiple roots as well as multiple parents that themselves
    have parents.

          e   g
         / \ / \
        f   a   h    remove abc     e   g
           / \      ----------->   / \ / \
          b   c                   f   d   h
           \ /
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        Graph.from_lists(("e", new_d, "f"), ("g", new_d, "h")),
        [3, 1, 1, 3, 1],  # e, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        5,  # e, d, f, g, h
    )
Example #4
def check_dag_equal():
    chain = Node.from_lists(("a", ("b", ("c", ("d", )))))

    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    tree = Node.from_lists(("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"),
                            ("d", "e", "f", "g")))

    assert chain.dag_equal(chain)
    assert chain.dag_equal(chain.copy())

    assert diamond.dag_equal(diamond)
    assert diamond.dag_equal(diamond.copy())

    assert tree.dag_equal(tree)
    assert tree.dag_equal(tree.copy())

    assert not chain.dag_equal(tree)
    assert not chain.dag_equal(diamond)

    assert not tree.dag_equal(chain)
    assert not tree.dag_equal(diamond)

    assert not diamond.dag_equal(chain)
    assert not diamond.dag_equal(tree)
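
The assertions above pin down dag_equal's contract: equality is structural, so a chain, a diamond, and a tree are distinct even when they share names. A hedged sketch of such a structural comparison (not hatchet's actual implementation, which compares frames and handles shared subdags):

# Illustrative structural equality; N is a tiny stand-in node class.
class N:
    def __init__(self, name, *children):
        self.name, self.children = name, children

def dag_equal(a, b, _seen=None):
    # track already-matched node pairs so shared subdags are not
    # re-descended (and cycles cannot recurse forever)
    _seen = _seen if _seen is not None else set()
    if (id(a), id(b)) in _seen:
        return True
    _seen.add((id(a), id(b)))
    if a.name != b.name or len(a.children) != len(b.children):
        return False
    return all(dag_equal(ca, cb, _seen)
               for ca, cb in zip(a.children, b.children))

chain = N("a", N("b", N("c")))
tree = N("a", N("b"), N("c"))
assert dag_equal(chain, N("a", N("b", N("c"))))
assert not dag_equal(chain, tree)
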
Example #5
def test_filter_squash_bunny_to_goat():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and d.

          e   g                     e   g
         / \ / \                   /|\ /|\
        f   a   h    remove ac    f | b | h
           / \      ---------->     | | |
          b   c                      \|/
           \ /                        d
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))
    new_b = Node.from_lists(("b", new_d))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, new_d, "f"), ("g", new_b, new_d, "h")),
        [4, 2, 1, 1, 4, 1],  # e, b, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        6,  # e, b, d, f, g, h
    )
Example #6
def test_copy():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))
    g = Graph.from_lists(("e", "f", diamond_subdag),
                         ("g", diamond_subdag, "h"))

    assert g.copy() == g
Example #7
def test_traverse_paths():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag),
                         ("g", diamond_subdag, "h"))
    assert list(
        g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
Example #8
File: graph.py Project: shishirccr/hatchet
def test_from_lists():
    """Ensure we can traverse roots in correct order without repeating a
       shared subdag.
    """
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert list(g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
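
A sketch of how a traversal can avoid repeating the shared subdag: keep a visited set keyed on node identity and skip anything already yielded. Illustrative only; hatchet's Graph.traverse may order children differently, so the child order here is chosen to reproduce the order asserted above:

class N:  # tiny stand-in node, not hatchet's Node
    def __init__(self, name, *children):
        self.name, self.children = name, children

def traverse_graph(roots):
    visited = set()
    def visit(node):
        if id(node) in visited:
            return
        visited.add(id(node))
        yield node.name
        for child in node.children:
            yield from visit(child)
    for root in roots:
        yield from visit(root)

d = N("d")
diamond = N("a", N("b", d), N("c", d))
roots = [N("e", diamond, N("f")), N("g", diamond, N("h"))]
assert list(traverse_graph(roots)) == ["e", "a", "b", "d", "c", "f", "g", "h"]
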
Example #9
File: graph.py Project: shishirccr/hatchet
def test_dag_is_not_tree():
    g = Graph.from_lists(("b", "c"), ("d", "e"))
    assert not g.is_tree()

    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))
    g = Graph([diamond_subdag])
    assert not g.is_tree()

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert not g.is_tree()
Example #10
    def __create_graph(self, rank, thread, list_roots, node_dicts):
        graph_data = self.dict["ranks"][str(rank)]["threads"][str(
            thread)]["regions"]
        for region, data in iter(graph_data.items()):
            # print(region, data)

            frame = Frame({"type": "region", "name": data["name"]})
            node = Node(frame, None)

            contain_read_events = [0]
            metrics = self.__get_metrics(data, contain_read_events)

            node_dict = dict({
                "name": data["name"],
                "node": node,
                "rank": int(rank),
                "thread": int(thread),
                **metrics,
            })
            node_dicts.append(node_dict)

            # used to find node using parent_region_id
            self.node_graph_dict[int(region)] = [data["name"], node]

            if int(data["parent_region_id"]) == -1:
                list_roots.append(node)
            else:
                self.__add_child_node(int(data["parent_region_id"]), node)

            # check if we have to create child nodes for read events
            if contain_read_events[0] == 1:

                # check how many read calls are used
                read_num = len(data["cycles"])

                for i in range(1, read_num):
                    node_name_read = "read_" + str(i)

                    read_frame = Frame({
                        "type": "region",
                        "name": node_name_read
                    })
                    read_node = Node(read_frame, node)
                    read_metrics = self.__get_read_metrics(
                        data, node_name_read)
                    node_dict = dict({
                        "name": node_name_read,
                        "node": read_node,
                        "rank": int(rank),
                        "thread": int(thread),
                        **read_metrics,
                    })
                    node_dicts.append(node_dict)
                    node.add_child(read_node)
Example #11
def test_path():
    d = Node(Frame(name="d"))
    node = Node.from_lists(["a", ["b", d]])

    assert d.path() == (Frame(name="a"), Frame(name="b"), Frame(name="d"))
    assert d.parents[0].path() == (Frame(name="a"), Frame(name="b"))
    assert node.path() == (Frame(name="a"), )

    assert d.path(attrs="name") == ("a", "b", "d")
    assert d.parents[0].path(attrs="name") == ("a", "b")
    assert node.path(attrs="name") == ("a", )
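
path() can be pictured as walking parent links up to the root and reversing the result; a minimal sketch under that assumption (hatchet's real Node.path raises MultiplePathError when a node has several parents, as a later example shows):

class N:  # stand-in node with a single explicit parent link
    def __init__(self, name, parent=None):
        self.name, self.parent = name, parent

def path(node):
    names = []
    while node is not None:
        names.append(node.name)
        node = node.parent
    return tuple(reversed(names))

a = N("a"); b = N("b", a); d = N("d", b)
assert path(d) == ("a", "b", "d")
assert path(a) == ("a",)
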
Example #12
File: node.py Project: jrmadsen/hatchet
def test_path():
    d = Node(Frame(name="d", type="function"))
    node = Node.from_lists(["a", ["b", d]])

    assert d.path() == (
        Node(Frame(name="a")),
        Node(Frame(name="b")),
        Node(Frame(name="d", type="function")),
    )
    assert d.parents[0].path() == (Node(Frame(name="a")),
                                   Node(Frame(name="b")))
    assert node.path() == (Node(Frame(name="a")), )
Example #13
def test_from_lists():
    node = Node.from_lists("a")
    assert node.frame == Frame(name="a")

    a = Frame(name="a")
    b = Frame(name="b")
    c = Frame(name="c")

    node = Node.from_lists(["a", ["b", "c"]])

    assert node.frame == a
    assert node.children[0].frame == b
    assert node.children[0].children[0].frame == c
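
from_lists treats the head of each list as a node and the tail as its children. A hedged sketch of that recursion (hatchet's real from_lists also accepts Node objects, which is how the shared subdags in earlier examples are built):

class SimpleNode:  # stand-in, not hatchet's Node
    def __init__(self, name):
        self.name, self.children = name, []

def from_lists(spec):
    if isinstance(spec, str):
        return SimpleNode(spec)
    node = SimpleNode(spec[0])
    node.children = [from_lists(child) for child in spec[1:]]
    return node

root = from_lists(["a", ["b", "c"]])
assert root.name == "a"
assert root.children[0].name == "b"
assert root.children[0].children[0].name == "c"
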
Example #14
        def _create_parent(child_node, parent_callpath):
            """In TAU output, sometimes we see a node as a parent
            in the callpath before we see it as a leaf node. In
            this case, we need to create a hatchet node for the parent.

            We can't create a node_dict for the parent because we don't
            know its metric values when we first see it in a callpath.

            Example: a => b => c "<c_metric_values>"
            Here, if we haven't seen 'b' before, we should create it when we
            create 'c'.

            This function recursively creates parent nodes in a callpath
            until it reaches the already existing parent in that callpath.
            """
            parent_node = self.callpath_to_node.get(parent_callpath)

            # Return if we have arrived at an existing parent.
            # Otherwise create the parent and link parent/child.
            if parent_node is not None:
                parent_node.add_child(child_node)
                child_node.add_parent(parent_node)
                return
            else:
                grand_parent_callpath = parent_callpath[:-1]
                parent_info = parent_callpath[-1]
                parent_name = ""

                if " C " in parent_info:
                    parent_name = _get_name_file_module(
                        True, parent_info, " C ")[0]
                elif " [@] " in parent_info:
                    parent_name = _get_name_file_module(
                        True, parent_info, " [@] ")[0]
                else:
                    parent_name = _get_name_file_module(True, parent_info,
                                                        "")[0]

                parent_node = Node(
                    Frame({
                        "type": "function",
                        "name": parent_name
                    }), None)
                self.callpath_to_node[parent_callpath] = parent_node

                parent_node.add_child(child_node)
                child_node.add_parent(parent_node)
                _create_parent(parent_node, grand_parent_callpath)
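
The docstring above describes the recursion; here is a standalone sketch of the same bookkeeping over plain dicts, with callpaths as tuples. The names are hypothetical, and the root is assumed to exist already, as it does in the reader:

# Standalone sketch of recursive parent creation; nodes are plain
# dicts instead of hatchet Nodes, and name parsing is elided.
def create_parent(child, parent_callpath, callpath_to_node):
    parent = callpath_to_node.get(parent_callpath)
    if parent is not None:
        parent["children"].append(child)
        return
    parent = {"name": parent_callpath[-1], "children": [child]}
    callpath_to_node[parent_callpath] = parent
    create_parent(parent, parent_callpath[:-1], callpath_to_node)

nodes = {("a",): {"name": "a", "children": []}}
leaf = {"name": "c", "children": []}
create_parent(leaf, ("a", "b"), nodes)  # creates the missing "b" under "a"
assert nodes[("a", "b")]["children"][0] is leaf
assert nodes[("a",)]["children"][0] is nodes[("a", "b")]
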
Example #15
    def create_graph(self):
        list_roots = []

        global unknown_label_counter

        # find nodes in the nodes section that represent the path hierarchy
        for idx, node in enumerate(self.json_nodes):
            node_label = node["label"]
            if node_label == "":
                node_label = "UNKNOWN " + str(unknown_label_counter)
                unknown_label_counter += 1
            self.idx_to_label[idx] = node_label

            if node["column"] == self.path_col_name:
                if "parent" not in node:
                    # since this node does not have a parent, this is a root
                    graph_root = Node(
                        Frame({
                            "type": self.node_type,
                            "name": node_label
                        }), None)
                    list_roots.append(graph_root)

                    node_dict = {
                        self.nid_col_name: idx,
                        "name": node_label,
                        "node": graph_root,
                    }
                    self.idx_to_node[idx] = node_dict
                else:
                    parent_hnode = (self.idx_to_node[node["parent"]])["node"]
                    hnode = Node(
                        Frame({
                            "type": self.node_type,
                            "name": node_label
                        }),
                        parent_hnode,
                    )
                    parent_hnode.add_child(hnode)

                    node_dict = {
                        self.nid_col_name: idx,
                        "name": node_label,
                        "node": hnode,
                    }
                    self.idx_to_node[idx] = node_dict

        return list_roots
Example #16
def test_filter_squash_diamond():
    r"""Test that diamond edges are collapsed when squashing.

    Ensure we can handle the most basic DAG.

            a
           / \      remove bc     a
          b   c    ---------->    |
           \ /                    d
            d

    """
    d = Node(Frame(name="d"))
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        Graph.from_lists(("a", "d")),
        [2, 1],  # a, d
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        2,  # a, d
    )
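
The squash step can be thought of as reconnecting every kept node to its nearest kept ancestors, walking up through removed nodes; that is what collapses b and c here into a single a -> d edge. A hedged sketch of that rewiring (hatchet's actual squash also rebuilds the dataframe and merges duplicate nodes):

class N:  # tiny stand-in node with parent links
    def __init__(self, name, parents=()):
        self.name, self.parents = name, list(parents)

def kept_parents(node, kept):
    """Nearest ancestors of node that survive the filter, found by
    walking up through removed nodes; duplicates are collapsed."""
    found = []
    for parent in node.parents:
        hits = [parent] if parent in kept else kept_parents(parent, kept)
        for h in hits:
            if h not in found:
                found.append(h)
    return found

a = N("a"); b = N("b", [a]); c = N("c", [a]); d = N("d", [b, c])
assert [p.name for p in kept_parents(d, {a, d})] == ["a"]
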
Example #17
        def parse_node_literal(child_dict, hparent):
            """Create node_dict for one node and then call the function
            recursively on all children."""

            hnode = Node(
                Frame({
                    "name": child_dict["function"],
                    "type": "function"
                }), hparent)

            child_node_dict = {
                "node": hnode,
                "name": child_dict["function"],
                "file": child_dict["file_path_short"],
                "line": child_dict["line_no"],
                "time": child_dict["time"],
                "time (inc)": child_dict["time"],
                "is_application_code": child_dict["is_application_code"],
            }

            hparent.add_child(hnode)
            self.node_dicts.append(child_node_dict)

            if "children" in child_dict:
                for child in child_dict["children"]:
                    # Pyinstrument's time metric actually stores inclusive time.
                    # To calculate exclusive time, we subtract the children's time
                    # from the parent's time.
                    child_node_dict["time"] -= child["time"]
                    parse_node_literal(child, hnode)
Example #18
def test_union_dag():
    # make graphs g1, g2, and g3, where you know g3 is the union of g1 and g2
    c = Node.from_lists(("c", "d"))
    g1 = Graph.from_lists(("a", ("b", c), ("e", c, "f")))

    d = Node(Frame(name="d"))
    g2 = Graph.from_lists(("a", ("b", ("c", d)), ("e", d, "f")))

    d2 = Node(Frame(name="d"))
    c2 = Node.from_lists(("c", d2))
    g3 = Graph.from_lists(("a", ("b", c2), ("e", c2, d2, "f")))

    assert g1 != g2

    g4 = g1.union(g2)

    assert g4 == g3
Example #19
    def read(self):
        list_roots = []
        node_dicts = []
        frame_to_node_dict = {}
        frame = None
        seen_nids = []
        hnid = -1

        # start with creating a node_dict for each root
        for i in range(len(self.graph_dict)):
            if "_hatchet_nid" in self.graph_dict[i]["metrics"]:
                hnid = self.graph_dict[i]["metrics"]["_hatchet_nid"]
                seen_nids.append(hnid)
            frame = Frame(self.graph_dict[i]["frame"])
            graph_root = Node(frame, None, hnid=hnid)

            # depending on the node type, the name may not be in the frame
            node_name = self.graph_dict[i]["frame"].get("name")
            if not node_name:
                node_name = self.graph_dict[i]["name"]

            node_dict = dict({
                "node": graph_root,
                "name": node_name
            }, **self.graph_dict[i]["metrics"])
            node_dicts.append(node_dict)

            list_roots.append(graph_root)
            frame_to_node_dict[frame] = graph_root

            # call recursively on all children of root
            if "children" in self.graph_dict[i]:
                for child in self.graph_dict[i]["children"]:
                    self.parse_node_literal(frame_to_node_dict, node_dicts,
                                            child, graph_root, seen_nids)

        graph = Graph(list_roots)

        # test if nids are already loaded
        if -1 in [n._hatchet_nid for n in graph.traverse()]:
            graph.enumerate_traverse()
        else:
            graph.enumerate_depth()

        exc_metrics = []
        inc_metrics = []
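        # note: `i` here is left over from the roots loop above, so the
        # metric keys come from the last root; all roots are assumed to
        # share the same metric names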
        for key in self.graph_dict[i]["metrics"].keys():
            if "(inc)" in key:
                inc_metrics.append(key)
            else:
                exc_metrics.append(key)

        dataframe = pd.DataFrame(data=node_dicts)
        dataframe.set_index(["node"], inplace=True)
        dataframe.sort_index(inplace=True)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
Example #20
    def parse_node_literal(self, frame_to_node_dict, node_dicts, child_dict,
                           hparent):
        """Create node_dict for one node and then call the function
        recursively on all children.
        """
        frame = Frame(child_dict["frame"])
        if "duplicate" not in child_dict:
            hnode = Node(frame, hparent)

            # depending on the node type, the name may not be in the frame
            node_name = child_dict["frame"].get("name")
            if not node_name:
                node_name = child_dict["name"]

            node_dict = dict({
                "node": hnode,
                "name": node_name
            }, **child_dict["metrics"])

            node_dicts.append(node_dict)
            frame_to_node_dict[frame] = hnode
        elif "duplicate" in child_dict:
            hnode = frame_to_node_dict.get(frame)
            if not hnode:
                hnode = Node(frame, hparent)

                # depending on the node type, the name may not be in the frame
                node_name = child_dict["frame"].get("name")
                if not node_name:
                    node_name = child_dict["name"]

                node_dict = dict({
                    "node": hnode,
                    "name": node_name
                }, **child_dict["metrics"])
                node_dicts.append(node_dict)
                frame_to_node_dict[frame] = hnode

        hparent.add_child(hnode)

        if "children" in child_dict:
            for child in child_dict["children"]:
                self.parse_node_literal(frame_to_node_dict, node_dicts, child,
                                        hnode)
Example #21
def test_paths():
    d = Node(Frame(name="d"))
    Node.from_lists(["a", ["b", d], ["c", d]])
    with pytest.raises(MultiplePathError):
        d.path()

    assert d.paths() == [
        (Frame(name="a"), Frame(name="b"), Frame(name="d")),
        (Frame(name="a"), Frame(name="c"), Frame(name="d")),
    ]

    assert d.paths(attrs="name") == [("a", "b", "d"), ("a", "c", "d")]
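
paths() yields one tuple per route from a root, which is why d above has two. A minimal sketch of that enumeration over parent links (illustrative, not hatchet's Node.paths):

class N:  # stand-in node that allows multiple parents
    def __init__(self, name, parents=()):
        self.name, self.parents = name, list(parents)

def paths(node):
    if not node.parents:
        return [(node.name,)]
    result = []
    for parent in node.parents:
        for p in paths(parent):
            result.append(p + (node.name,))
    return result

a = N("a"); b = N("b", [a]); c = N("c", [a]); d = N("d", [b, c])
assert paths(d) == [("a", "b", "d"), ("a", "c", "d")]
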
Example #22
def test_traverse_post():
    node = Node.from_lists(["a", ["b", "d", "e"], ["c", "f", "g"]])
    assert list(node.traverse(order="post", attrs="name")) == [
        "d",
        "e",
        "b",
        "f",
        "g",
        "c",
        "a",
    ]
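
Post-order is the mirror image of the pre-order example earlier: children are yielded first, then the node, producing the "leaves before parents" order asserted above. A minimal stand-in sketch:

class SimpleNode:  # stand-in, not hatchet's Node
    def __init__(self, name, children=()):
        self.name, self.children = name, list(children)

def traverse_post(node):
    for child in node.children:
        yield from traverse_post(child)
    yield node.name

root = SimpleNode("a", [
    SimpleNode("b", [SimpleNode("d"), SimpleNode("e")]),
    SimpleNode("c", [SimpleNode("f"), SimpleNode("g")]),
])
assert list(traverse_post(root)) == ["d", "e", "b", "f", "g", "c", "a"]
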
Example #23
def test_to_literal_node_ids():
    r"""Test to_literal and from_literal with ids on a graph with cycles,
        multiple parents and children.

        a --
       / \ /
      b   c
       \ /
        d
       / \
      e   f
    """

    a = Node(Frame(name="a"))
    d = Node(Frame(name="d"))
    gf = GraphFrame.from_lists([a, ["b", [d]], ["c", [d, ["e"], ["f"]], [a]]])
    lit_list = gf.to_literal()

    gf2 = gf.from_literal(lit_list)
    lit_list2 = gf2.to_literal()

    assert lit_list == lit_list2
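
For reference, from_literal-style input is a list of nested dicts, one per root. A hedged sketch of the rough shape (field names follow hatchet's documented literal format; metrics simplified):

literal = [
    {
        "frame": {"name": "a"},
        "metrics": {"time": 1.0},
        "children": [
            {"frame": {"name": "b"}, "metrics": {"time": 1.0}, "children": []},
        ],
    }
]
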
Example #24
    def parse_node_literal(
        self, frame_to_node_dict, node_dicts, child_dict, hparent, seen_nids
    ):
        """Create node_dict for one node and then call the function
        recursively on all children.
        """

        # pull out _hatchet_nid if it exists so it will not be
        # inserted into the dataframe like a normal metric
        hnid = -1
        if "_hatchet_nid" in child_dict["metrics"]:
            hnid = child_dict["metrics"]["_hatchet_nid"]

        frame = Frame(child_dict["frame"])
        if hnid not in seen_nids:
            hnode = Node(frame, hparent, hnid=hnid)

            # depending on the node type, the name may not be in the frame
            node_name = child_dict["frame"].get("name")
            if not node_name:
                node_name = child_dict["name"]

            node_dict = dict(
                {"node": hnode, "name": node_name}, **child_dict["metrics"]
            )

            node_dicts.append(node_dict)
            frame_to_node_dict[frame] = hnode

            if hnid != -1:
                seen_nids.append(hnid)

        else:
            hnode = frame_to_node_dict.get(frame)

        hparent.add_child(hnode)

        if "children" in child_dict:
            for child in child_dict["children"]:
                self.parse_node_literal(
                    frame_to_node_dict, node_dicts, child, hnode, seen_nids
                )
Example #25
    def _create_node_and_row(self, fn_data, fn_name, stats_dict):
        """
        Description: Takes a profiled function as specified in a pstats file
        and creates a node for it and adds a new line of metadata to our
        dataframe if it does not exist.
        """
        u_fn_name = "{}:{}:{}".format(
            fn_name,
            fn_data[NameData.FILE].split("/")[-1],
            fn_data[NameData.LINE],
        )
        fn_hnode = self.name_to_hnode.get(u_fn_name)

        if not fn_hnode:
            # create a node if it doesn't exist yet
            fn_hnode = Node(Frame({"type": "function", "name": fn_name}), None)
            self.name_to_hnode[u_fn_name] = fn_hnode

            # lookup stat data for source here
            fn_stats = stats_dict[fn_data]
            self._add_node_metadata(u_fn_name, fn_data, fn_stats, fn_hnode)

        return fn_hnode
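
The "{}:{}:{}" key above disambiguates same-named functions by file and line. A tiny standalone check of that format (the helper name and inputs are hypothetical):

def unique_name(fn_name, file_path, line):
    # mirrors the u_fn_name format: name, basename of the file, line
    return "{}:{}:{}".format(fn_name, file_path.split("/")[-1], line)

assert unique_name("main", "/src/app/main.py", 10) == "main:main.py:10"
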
Example #26
def test_output_with_cycle_graphs():
    r"""Test three output modes on a graph with cycles,
        multiple parents and children.

        a --
       / \ /
      b   c
       \ /
        d
       / \
      e   f
    """

    dot_edges = [
        # d has two parents and two children
        '"1" -> "2";',
        '"5" -> "2";',
        '"2" -> "3";',
        '"2" -> "4";',
        # a -> c -> a cycle
        '"0" -> "5";',
        '"5" -> "0";',
    ]

    a = Node(Frame(name="a"))
    d = Node(Frame(name="d"))
    gf = GraphFrame.from_lists([a, ["b", [d]], ["c", [d, ["e"], ["f"]], [a]]])

    lit_list = gf.to_literal()
    treeout = gf.tree()
    dotout = gf.to_dot()

    # scan through the to_literal output (lit_list) for edges
    a_children = [n["frame"]["name"] for n in lit_list[0]["children"]]
    a_c_children = [n["frame"]["name"] for n in lit_list[0]["children"][1]["children"]]
    a_b_children = [n["frame"]["name"] for n in lit_list[0]["children"][0]["children"]]

    assert len(lit_list) == 1
    assert len(a_children) == 2

    # a -> (b,c)
    assert "b" in a_children
    assert "c" in a_children

    # a -> c -> a cycle
    assert "a" in a_c_children

    # d has two parents
    assert "d" in a_c_children
    assert "d" in a_b_children

    # check certain edges are in dot
    for edge in dot_edges:
        assert edge in dotout

    # check that the expected number of occurrences of the same
    # node appear in the tree, indicating multiple edges
    assert treeout.count("a") == 2
    assert treeout.count("d") == 2
    assert treeout.count("e") == 1
    assert treeout.count("f") == 1
Example #27
    def create_graph(self):
        def _get_name_file_module(is_parent, node_info, symbol):
            """This function gets the name, file and module information
            for a node using the corresponding line in the output file.
            Example line: [UNWIND] <file> [@] <name> [{<file_or_module>} {<line>}]
            There are several line formats in TAU output, and this function
            extracts the node information for all of them; examples are
            given in the comments below.
            """
            name, file, module = None, None, None
            # There are several different formats in TAU outputs.
            # There might be file, line, and module information.
            # The following if-else block covers all possible output
            # formats. Example formats are given in comments.
            if symbol == " [@] ":
                # Check if there is a [@] symbol.
                node_info = node_info.split(symbol)
                # We don't need file and module information if it's a parent node.
                if not is_parent:
                    file = node_info[0].split()[1]
                    if "[{" in node_info[1]:
                        # Sometimes we see file and module information inside of [{}]
                        # Example: [UNWIND] <file> [@] <name> [{<file_or_module>} {<line>}]
                        name_and_module = node_info[1].split(" [{")
                        module = name_and_module[1].split()[0].strip("}")
                    else:
                        # Example: [UNWIND] <file> [@] <name> <module>
                        name_and_module = node_info[1].split()
                        module = name_and_module[1]

                    # Check if module is in file.
                    # Assign None to file if it's .so.
                    # Assign None to module if it's .c.
                    if module in file:
                        if ".so" in file:
                            file = None
                        if ".c" in module:
                            module = None
                    name = "[UNWIND] " + name_and_module[0]
                else:
                    # We just need to take name if it is a parent
                    name = "[UNWIND] " + node_info[1].split()[0]
            elif symbol == " C ":
                # Check if there is a C symbol.
                # "C" symbol means it's a C function.
                node_info = node_info.split(symbol)
                name = node_info[0]
                # We don't need file and module information if it's a parent node.
                if not is_parent:
                    if "[{" in node_info[1]:
                        # Example: <name> C [{<file>} {<line>}]
                        node_info = node_info[1].split()
                        file = node_info[0].strip("}[{")
            else:
                if "[{" in node_info:
                    # If there isn't C or [@]
                    # Example: [<type>] <name> [{} {}]
                    node_info = node_info.split(" [{")
                    name = node_info[0]
                    # We don't need file and module information if it's a parent node.
                    if not is_parent:
                        file = node_info[1].split()[0].strip("}{")
                else:
                    # Example 1: [<type>] <name> <module>
                    # Example 2: [<type>] <name>
                    # Example 3: <name>
                    name = node_info
                    node_info = node_info.split()
                    # We need to take module information from the first example.
                    # Another example is "[CONTEXT] .TAU application", which
                    # conflicts with the first example, so we check for a "/"
                    # symbol, which indicates module information in this case.
                    if len(node_info) == 3 and "/" in name:
                        name = node_info[0] + " " + node_info[1]
                        # We don't need file and module information if it's a parent node.
                        if not is_parent:
                            module = node_info[2]
            return [name, file, module]

        def _get_line_numbers(node_info):
            start_line, end_line = 0, 0
            # There should be [{}] symbols if there is line number information.
            if "[{" in node_info:
                tmp_module_or_file_line = (re.search(
                    r"\{.*\}\]", node_info).group(0).split())
                line_numbers = tmp_module_or_file_line[1].strip("}]").replace(
                    "{", "")
                start_line = line_numbers
                if "-" in line_numbers:
                    # Sometimes there is "-" between start line and end line
                    # Example: {341,1}-{396,1}
                    line_numbers = line_numbers.split("-")
                    start_line = line_numbers[0].split(",")[0]
                    end_line = line_numbers[1].split(",")[0]
                else:
                    if "," in line_numbers:
                        # Sometimes we don't have "-".
                        # Example: {15,0}
                        start_line = line_numbers.split(",")[0]
                        end_line = line_numbers.split(",")[1]
            return [start_line, end_line]

        def _create_parent(child_node, parent_callpath):
            """In TAU output, sometimes we see a node as a parent
            in the callpath before we see it as a leaf node. In
            this case, we need to create a hatchet node for the parent.

            We can't create a node_dict for the parent because we don't
            know its metric values when we first see it in a callpath.

            Example: a => b => c "<c_metric_values>"
            Here, if we haven't seen 'b' before, we should create it when we
            create 'c'.

            This function recursively creates parent nodes in a callpath
            until it reaches the already existing parent in that callpath.
            """
            parent_node = self.callpath_to_node.get(parent_callpath)

            # Return if we have arrived at an existing parent.
            # Otherwise create the parent and link parent/child.
            if parent_node is not None:
                parent_node.add_child(child_node)
                child_node.add_parent(parent_node)
                return
            else:
                grand_parent_callpath = parent_callpath[:-1]
                parent_info = parent_callpath[-1]
                parent_name = ""

                if " C " in parent_info:
                    parent_name = _get_name_file_module(
                        True, parent_info, " C ")[0]
                elif " [@] " in parent_info:
                    parent_name = _get_name_file_module(
                        True, parent_info, " [@] ")[0]
                else:
                    parent_name = _get_name_file_module(True, parent_info,
                                                        "")[0]

                parent_node = Node(
                    Frame({
                        "type": "function",
                        "name": parent_name
                    }), None)
                self.callpath_to_node[parent_callpath] = parent_node

                parent_node.add_child(child_node)
                child_node.add_parent(parent_node)
                _create_parent(parent_node, grand_parent_callpath)

        def _construct_column_list(first_rank_filenames):
            """This function constructs columns, exc_metrics, and
            inc_metrics using all metric files of a rank. It takes all
            the metric files of a rank as a tuple and only loads the
            second line (metadata) of each file.
            """
            columns = []
            for file_index in range(len(first_rank_filenames)):
                with open(first_rank_filenames[file_index], "r") as f:
                    # Skip the first line: "192 templated_functions_MULTI_TIME"
                    next(f)
                    # No need to check if the metadata is the same for all metric files.
                    metadata = next(f)

                    # Get first three columns from # Name Calls Subrs Excl Incl ProfileCalls #
                    # ProfileCalls is removed since it is typically set to 0 and not used.
                    # We only do this once since these column names are the same for all files.
                    if file_index == 0:
                        columns.extend(
                            re.match(r"\#\s(.*)\s\#",
                                     metadata).group(1).split(" ")[:-3])

                    # Example metric_name: "PAPI_L2_TCM"
                    # TODO: Decide if Calls and Subrs should be inc or exc metrics
                    metric_name = re.search(r"<value>(.*?)<\/value>",
                                            metadata).group(1)
                    if metric_name == "CPU_TIME" or metric_name == "TIME":
                        metric_name = "time"
                    elif metric_name == "Name":
                        metric_name = "name"
                    columns.extend([metric_name, metric_name + " (inc)"])
                    self.exc_metrics.append(metric_name)
                    self.inc_metrics.append(metric_name + " (inc)")
            return columns

        # dirpath -> returns path of a directory, string
        # dirnames -> returns directory names, list
        # files -> returns filenames in a directory, list
        profile_filenames = []
        for dirpath, dirnames, files in os.walk(self.dirname):
            profiles_in_dir = glob.glob(dirpath + "/profile.*")
            if profiles_in_dir:
                # sort input files in each directory in the same order
                profile_filenames.append(sorted(profiles_in_dir))

        # Store all files in a list of tuples.
        # Each tuple stores all the metric files of a rank.
        # We process one rank at a time.
        # Example: [(metric1/profile.x.0.0, metric2/profile.x.0.0), ...]
        profile_filenames = list(zip(*profile_filenames))

        # Get column information from the metric files of a rank.
        self.columns = _construct_column_list(profile_filenames[0])

        list_roots = []
        prev_rank, prev_thread = 0, 0
        # Example filenames_per_rank: (metric1/profile.x.0.0 ...)
        for filenames_per_rank in profile_filenames:
            file_info = filenames_per_rank[0].split(".")
            rank, thread = int(file_info[-3]), int(file_info[-1])
            if not self.multiple_ranks:
                self.multiple_ranks = True if rank != prev_rank else False
            if not self.multiple_threads:
                self.multiple_threads = True if thread != prev_thread else False

            # Load all the files; each one represents a different metric for a rank/thread.
            # If there are 2 metrics, load metric1/profile.x.0.0 and metric2/profile.x.0.0
            file_data = []
            for f_index in range(len(filenames_per_rank)):
                # Store the lines after metadata.
                file_data.append(
                    open(filenames_per_rank[f_index], "r").readlines()[2:])

            # Get the root information from only the first file to compare them
            # with others.
            # Example: ".TAU application" 1 1 272 15755429 0 GROUP="TAU_DEFAULT"
            root_line = re.match(r"\"(.*)\"\s(.*)\sG", file_data[0][0])
            root_name = root_line.group(1).strip(" ")
            # convert it to a tuple to use it as a key in callpath_to_node dictionary
            root_callpath = tuple([root_name])
            root_values = list(map(int, root_line.group(2).split(" ")[:-1]))

            # After first profile.0.0.0, only get Excl and Incl metric values
            # from other files since other columns will be the same.
            # We assume each metric file of a rank has the same root.
            first_file_root_name = re.search(r"\"(.*?)\"",
                                             file_data[0][0]).group(1)
            for f_index in range(1, len(file_data)):
                root_name = re.search(r"\"(.*?)\"",
                                      file_data[f_index][0]).group(1)
                # The assert statement below raises an error if the roots are
                # not the same across different metric files.
                # TODO: We need to find a solution if this throws an error.
                assert first_file_root_name == root_name, (
                    "Metric files for a rank has different roots.\n" +
                    "File: " + filenames_per_rank[f_index] + "\nLine: 2")
                root_line = re.match(r"\"(.*)\"\s(.*)\sG",
                                     file_data[f_index][0])
                root_values.extend(
                    list(map(int,
                             root_line.group(2).split(" ")[2:4])))

            # Check if the root exists in other ranks.
            # Note that we assume the root is the same for all metric files of a rank.
            if root_callpath not in self.callpath_to_node:
                # Create the root node since it doesn't exist
                root_node = Node(
                    Frame({
                        "name": root_name,
                        "type": "function"
                    }), None)

                # Store callpaths to identify nodes
                self.callpath_to_node[root_callpath] = root_node
                list_roots.append(root_node)
            else:
                # Don't create a new node since it is created earlier
                root_node = self.callpath_to_node.get(root_callpath)

            node_dict = self.create_node_dict(
                root_node,
                self.columns,
                root_values,
                root_name,
                None,
                None,
                0,
                0,
                rank,
                thread,
            )
            self.node_dicts.append(node_dict)

            # Start from the line after root.
            # Iterate over only the first metric file of a rank
            # since the lines should be exactly the same across
            # all metric files of a rank.
            # Uses the same "line_index" for other metric files of a rank.
            for line_index in range(1, len(file_data[0])):
                line = file_data[0][line_index]
                metric_values = []
                # We only parse the lines that have the "=>" symbol, which shows the callpath info.
                # We just skip the other lines.
                if "=>" in line:
                    # Example: ".TAU application  => foo()  => bar()" 31 0 155019 155019 0 GROUP="TAU_SAMPLE|TAU_CALLPATH"
                    callpath_line_regex = re.match(r"\"(.*)\"\s(.*)\sG", line)
                    # callpath: ".TAU application  => foo()  => bar()"
                    callpath = [
                        name.strip(" ")
                        for name in callpath_line_regex.group(1).split("=>")
                    ]

                    # Example leaf_name: StrToInt [{lulesh-util.cc} {13,1}-{29,1}]
                    leaf_name = callpath[-1]
                    callpath = tuple(callpath)
                    parent_callpath = callpath[:-1]
                    # Don't include the value for ProfileCalls.
                    # metric_values: 31 0 155019 155019
                    metric_values = list(
                        map(float,
                            callpath_line_regex.group(2).split(" ")[:-1]))

                    # Get start and end line information
                    leaf_line_numbers = _get_line_numbers(leaf_name)
                    # Get name, file, and module information using the leaf name
                    # and the symbol on it
                    if " C " in leaf_name:
                        leaf_name_file_module = _get_name_file_module(
                            False, leaf_name, " C ")
                    elif " [@] " in leaf_name:
                        leaf_name_file_module = _get_name_file_module(
                            False, leaf_name, " [@] ")
                    else:
                        leaf_name_file_module = _get_name_file_module(
                            False, leaf_name, "")

                    # Example: ".TAU application  => foo()  => bar()" 31 0 155019..."
                    first_file_callpath_line = re.search(
                        r"\"(.*?)\"", file_data[0][line_index]).group(1)
                    # After first profile.x.0.0, only get Excl and Incl metric values
                    # from other files.
                    for f_index in range(1, len(file_data)):
                        other_file_callpath_line = re.search(
                            r"\"(.*?)\"",
                            file_data[f_index][line_index]).group(1)
                        # We assume the metric files of a rank have exactly the same lines.
                        # The only difference should be the Incl and Excl metric values.
                        # TODO: We should find a solution if this raises an error.
                        assert first_file_callpath_line == other_file_callpath_line, (
                            "Lines across metric files for a rank are not the same.\n"
                            + "File: " + filenames_per_rank[f_index] +
                            "\nLine: " + str(line_index + 3))
                        # Get the information from the same line in each file. "line_index".
                        callpath_line_regex = re.match(
                            r"\"(.*)\"\s(.*)\sG",
                            file_data[f_index][line_index])
                        metric_values.extend(
                            map(float,
                                callpath_line_regex.group(2).split(" ")[2:4]))

                    leaf_node = self.callpath_to_node.get(callpath)
                    # Check if that node is created earlier
                    if leaf_node is None:
                        # Create the node since it doesn't exist
                        leaf_node = Node(
                            Frame({
                                "type": "function",
                                "name": leaf_name_file_module[0]
                            }),
                            None,
                        )
                        self.callpath_to_node[callpath] = leaf_node

                        # Get its parent from its callpath.
                        parent_node = self.callpath_to_node.get(
                            parent_callpath)
                        if parent_node is None:
                            # Create parent if it doesn't exist.
                            _create_parent(leaf_node, parent_callpath)
                        else:
                            parent_node.add_child(leaf_node)
                            leaf_node.add_parent(parent_node)

                    node_dict = self.create_node_dict(
                        leaf_node,
                        self.columns,
                        metric_values,
                        # name
                        leaf_name_file_module[0],
                        # file
                        leaf_name_file_module[1],
                        # module
                        leaf_name_file_module[2],
                        # start line
                        leaf_line_numbers[0],
                        # end line
                        leaf_line_numbers[1],
                        rank,
                        thread,
                    )

                    self.node_dicts.append(node_dict)

        return list_roots
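
The _get_line_numbers helper above is compact but subtle. A standalone copy of its parsing logic (trimmed, with the nested else flattened) shows what it produces for the {13,1}-{29,1} format mentioned in its comments:

import re

def get_line_numbers(node_info):
    # mirrors _get_line_numbers: grab the "{...}]" tail, then split the
    # "{13,1}-{29,1}" part into start and end lines
    start_line, end_line = 0, 0
    if "[{" in node_info:
        tail = re.search(r"\{.*\}\]", node_info).group(0).split()
        line_numbers = tail[1].strip("}]").replace("{", "")
        start_line = line_numbers
        if "-" in line_numbers:
            start, end = line_numbers.split("-")
            start_line = start.split(",")[0]
            end_line = end.split(",")[0]
        elif "," in line_numbers:
            start_line, end_line = line_numbers.split(",")
    return [start_line, end_line]

assert get_line_numbers("StrToInt [{lulesh-util.cc} {13,1}-{29,1}]") == ["13", "29"]
assert get_line_numbers("foo [{bar.c} {15,0}]") == ["15", "0"]
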
Example #28
    def read(self):
        """Read the caliper JSON file to extract the calling context tree."""
        with self.timer.phase("read json"):
            self.read_json_sections()

        with self.timer.phase("graph construction"):
            list_roots = self.create_graph()

        # create a dataframe of metrics from the data section
        self.df_json_data = pd.DataFrame(self.json_data,
                                         columns=self.json_cols)

        # map non-numeric columns to their mappings in the nodes section
        for idx, item in enumerate(self.json_cols_mdata):
            if item["is_value"] is False and self.json_cols[
                    idx] != self.nid_col_name:
                if self.json_cols[idx] == "sourceloc#cali.sampler.pc":
                    # split source file and line number into two columns
                    self.df_json_data["file"] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: re.match(r"(.*):(\d+)", self.json_nodes[
                                x]["label"]).group(1))
                    self.df_json_data["line"] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: re.match(r"(.*):(\d+)", self.json_nodes[
                                x]["label"]).group(2))
                    self.df_json_data.drop(self.json_cols[idx],
                                           axis=1,
                                           inplace=True)
                    sourceloc_idx = idx
                else:
                    self.df_json_data[self.json_cols[idx]] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: self.json_nodes[x]["label"])

        # since we split sourceloc, we should update json_cols and
        # json_cols_mdata
        if "sourceloc#cali.sampler.pc" in self.json_cols:
            self.json_cols.pop(sourceloc_idx)
            self.json_cols_mdata.pop(sourceloc_idx)
            self.json_cols.append("file")
            self.json_cols.append("line")
            self.json_cols_mdata.append({"is_value": False})
            self.json_cols_mdata.append({"is_value": False})

        max_nid = self.df_json_data[self.nid_col_name].max()

        if "line" in self.df_json_data.columns:
            # split nodes that have multiple file:line numbers into children,
            # each with a unique file:line number
            unique_nodes = self.df_json_data.groupby(self.nid_col_name)
            df_concat = [self.df_json_data]

            for nid, super_node in unique_nodes:
                line_groups = super_node.groupby("line")
                # only need to do something if there is more than one
                # file:line entry for the node
                if len(line_groups.size()) > 1:
                    sn_hnode = self.idx_to_node[nid]["node"]

                    for line, line_group in line_groups:
                        # create the node label
                        file_path = (line_group.head(1))["file"].item()
                        file_name = os.path.basename(file_path)
                        node_label = file_name + ":" + line

                        # create a new hatchet node
                        max_nid += 1
                        idx = max_nid
                        hnode = Node(
                            Frame({
                                "type": "statement",
                                "file": file_path,
                                "line": line
                            }),
                            sn_hnode,
                        )
                        sn_hnode.add_child(hnode)

                        node_dict = {
                            self.nid_col_name: idx,
                            "name": node_label,
                            "node": hnode,
                        }
                        self.idx_to_node[idx] = node_dict

                        # change nid of the original node to new node in place
                        for index, row in line_group.iterrows():
                            self.df_json_data.loc[index, "nid"] = max_nid

                    # add new row for original node
                    node_copy = super_node.head(1).copy()
                    for cols in self.metric_columns:
                        node_copy[cols] = 0
                    df_concat.append(node_copy)

            # concatenate all the newly created dataframes with
            # self.df_json_data
            self.df_fixed_data = pd.concat(df_concat)
        else:
            self.df_fixed_data = self.df_json_data

        # create a dataframe with all nodes in the call graph
        self.df_nodes = pd.DataFrame.from_dict(
            data=list(self.idx_to_node.values()))

        # add missing intermediate nodes to the df_fixed_data dataframe
        if "rank" in self.json_cols:
            self.num_ranks = self.df_fixed_data["rank"].max() + 1
            rank_list = range(0, self.num_ranks)

        # create a standard dict to be used for filling all missing rows
        default_metric_dict = {}
        for idx, item in enumerate(self.json_cols_mdata):
            if self.json_cols[idx] != self.nid_col_name:
                if item["is_value"] is True:
                    default_metric_dict[self.json_cols[idx]] = 0
                else:
                    default_metric_dict[self.json_cols[idx]] = None

        # create a list of dicts, one dict for each missing row
        missing_nodes = []
        for iteridx, row in self.df_nodes.iterrows():
            # check if df_nodes row exists in df_fixed_data
            metric_rows = self.df_fixed_data.loc[self.df_fixed_data[
                self.nid_col_name] == row[self.nid_col_name]]
            if "rank" not in self.json_cols:
                if metric_rows.empty:
                    # add a single row
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    missing_nodes.append(node_dict)
            else:
                if metric_rows.empty:
                    # add a row per MPI rank
                    for rank in rank_list:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)
                elif len(metric_rows) < self.num_ranks:
                    # add a row for each missing MPI rank
                    present_ranks = metric_rows["rank"].values
                    missing_ranks = [
                        x for x in rank_list if x not in present_ranks
                    ]
                    for rank in missing_ranks:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)

        self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
        self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

        # create a graph object once all the nodes have been added
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        # merge the metrics and node dataframes on the idx column
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics,
                                 self.df_nodes,
                                 on=self.nid_col_name)
            # set the index to be a MultiIndex
            indices = ["node"]
            if "rank" in self.json_cols:
                indices.append("rank")
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
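
The sourceloc handling above splits a "file:line" label with re.match(r"(.*):(\d+)", ...); a quick standalone check of that pattern on a hypothetical label:

import re

label = "/src/kernel.c:42"  # hypothetical sourceloc label
m = re.match(r"(.*):(\d+)", label)
assert m.group(1) == "/src/kernel.c"
assert m.group(2) == "42"
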
Example #29
    def parse_xml_node(self, xml_node, parent_nid, parent_line, hparent):
        """Parses an XML node and its children recursively."""
        nid = int(xml_node.get("i"))

        global src_file
        xml_tag = xml_node.tag

        if xml_tag == "PF" or xml_tag == "Pr":
            # procedure
            name = self.procedure_names[xml_node.get("n")]
            if parent_line != 0:
                name = str(parent_line) + ":" + name
            src_file = xml_node.get("f")
            line = int(xml_node.get("l"))

            hnode = Node(Frame({"type": "function", "name": name}), hparent)
            node_dict = self.create_node_dict(
                nid,
                hnode,
                name,
                xml_tag,
                self.src_files[src_file],
                line,
                self.load_modules[xml_node.get("lm")],
            )

        elif xml_tag == "L":
            # loop
            src_file = xml_node.get("f")
            line = int(xml_node.get("l"))
            name = ("Loop@" + os.path.basename(self.src_files[src_file]) +
                    ":" + str(line))

            hnode = Node(
                Frame({
                    "type": "loop",
                    "file": self.src_files[src_file],
                    "line": line
                }),
                hparent,
            )
            node_dict = self.create_node_dict(nid, hnode, name, xml_tag,
                                              self.src_files[src_file], line,
                                              None)

        elif xml_tag == "S":
            # statement
            line = int(xml_node.get("l"))
            # this might not be required for resolving conflicts
            name = os.path.basename(self.src_files[src_file]) + ":" + str(line)

            hnode = Node(
                Frame({
                    "type": "statement",
                    "file": self.src_files[src_file],
                    "line": line,
                }),
                hparent,
            )
            node_dict = self.create_node_dict(nid, hnode, name, xml_tag,
                                              self.src_files[src_file], line,
                                              None)

            # when we reach statement nodes, we subtract their exclusive
            # metric values from the parent's values
            for i, column in enumerate(self.metric_columns):
                if "(inc)" not in column and "(I)" not in column:
                    _crm.subtract_exclusive_metric_vals(
                        nid,
                        parent_nid,
                        self.np_metrics.T[i],
                        self.total_execution_threads,
                        self.num_nodes,
                    )

        if xml_tag == "C" or (xml_tag == "Pr" and
                              self.procedure_names[xml_node.get("n")] == ""):
            # do not add a node to the graph if the xml_tag is a callsite
            # or if it's a procedure with no name
            # for Prs, the preceding Pr has the calling line number and for
            # PFs, the preceding C has the line number
            line = int(xml_node.get("l"))
            self.parse_xml_children(xml_node, hparent)
        else:
            self.node_dicts.append(node_dict)
            hparent.add_child(hnode)
            self.parse_xml_children(xml_node, hnode)
Example #30
    def read(self):
        """Read the experiment.xml file to extract the calling context tree and create
        a dataframe out of it. Then merge the two dataframes to create the final
        dataframe.

        Return:
            (GraphFrame): new GraphFrame with HPCToolkit data.
        """
        with self.timer.phase("fill tables"):
            self.fill_tables()

        with self.timer.phase("read metric db"):
            self.read_all_metricdb_files()

        list_roots = []

        # parse the ElementTree to generate a calling context tree
        for root in self.callpath_profile.findall("PF"):
            global src_file

            nid = int(root.get("i"))
            src_file = root.get("f")

            # start with the root and create the callpath and node for the root
            # also a corresponding node_dict to be inserted into the dataframe
            graph_root = Node(
                Frame({
                    "type": "function",
                    "name": self.procedure_names[root.get("n")]
                }),
                None,
            )
            node_dict = self.create_node_dict(
                nid,
                graph_root,
                self.procedure_names[root.get("n")],
                "PF",
                self.src_files[src_file],
                int(root.get("l")),
                self.load_modules[root.get("lm")],
            )

            self.node_dicts.append(node_dict)
            list_roots.append(graph_root)

            # start graph construction at the root
            with self.timer.phase("graph construction"):
                self.parse_xml_children(root, graph_root)

            # put updated metrics back in dataframe
            for i, column in enumerate(self.metric_columns):
                if "(inc)" not in column and "(I)" not in column:
                    self.df_metrics[column] = self.np_metrics.T[i]

        with self.timer.phase("graph construction"):
            graph = Graph(list_roots)
            graph.enumerate_traverse()

        # create a dataframe for all the nodes in the graph
        self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

        # merge the metrics and node dataframes
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

            # set the index to be a MultiIndex
            if self.num_threads_per_rank > 1:
                indices = ["node", "rank", "thread"]
            # if number of threads per rank is 1, do not make thread an index
            elif self.num_threads_per_rank == 1:
                indices = ["node", "rank"]
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column or "(I)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)