示例#1
0
    def test_index_name_nodes(self):
        """Pivoted output should yield one index-name and one column-name node.

        A long-format frame is pivoted (``index='foo'``, ``columns='bar'``)
        and a relation graph is built from the input/output pair with
        ``INDEX_NAME_NODES`` switched on. The graph must then contain exactly
        one node of type ``INDEX_NAME`` and exactly one node of type
        ``COL_INDEX_NAME``.
        """
        long_df = pd.DataFrame({
            'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
            'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
            'baz': [1, 2, 3, 4, 5, 6]
        })
        pivoted = long_df.pivot(index='foo', columns='bar', values='baz')

        # Graph configuration for this test, applied in the listed order.
        options = GraphOptions()
        for flag, enabled in (
                ('COLUMN_NODES', True),
                ('INDEX_NODES', True),
                ('INDEX_NAME_NODES', True),
                ('ADJACENCY_EDGES', True),
                ('EQUALITY_EDGES', True),
                ('NODE_TYPES', True),
                ('INDEX_EDGES', False),
        ):
            setattr(options, flag, enabled)

        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output([long_df], pivoted)

        def nodes_of_type(wanted):
            # Every graph node whose ``ntype`` equals ``wanted``.
            return [n for n in graph.nodes if n.ntype == wanted]

        index_name_nodes = nodes_of_type(GraphNodeType.INDEX_NAME)
        column_name_nodes = nodes_of_type(GraphNodeType.COL_INDEX_NAME)

        self.assertEqual(len(index_name_nodes), 1)
        self.assertEqual(len(column_name_nodes), 1)
示例#2
0
    def test_groupby_has_artifacts(self):
        """Check INDEX/COLUMN node counts when the output is a groupby object.

        The graph is built from a 3x2 frame as the only input and the result
        of ``df.groupby(by="A")`` as the output. The test asserts that the
        graph contains six ``INDEX`` nodes and six ``COLUMN`` nodes.
        """
        frame = pd.DataFrame([[5, 2], [2, 3], [2, 0]], columns=["A", "B"])
        grouped = frame.groupby(by="A")

        # Every option used by this test is enabled.
        options = GraphOptions()
        for flag in ('COLUMN_NODES', 'INDEX_NODES', 'ADJACENCY_EDGES',
                     'EQUALITY_EDGES', 'NODE_TYPES', 'INDEX_EDGES'):
            setattr(options, flag, True)

        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output([frame], grouped)

        def nodes_of_type(wanted):
            # Every graph node whose ``ntype`` equals ``wanted``.
            return [n for n in graph.nodes if n.ntype == wanted]

        index_type_nodes = nodes_of_type(GraphNodeType.INDEX)
        column_type_nodes = nodes_of_type(GraphNodeType.COLUMN)

        self.assertEqual(len(index_type_nodes), 6)
        self.assertEqual(len(column_type_nodes), 6)
示例#3
0
    def test_no_spurious_for_list_arg(self):
        """Check INDEX/COLUMN node counts when a plain list is also an input.

        The inputs are a 3x2 frame plus the list ``[1, 3, 4]``; the output is
        the same frame. The test asserts six ``INDEX`` nodes and four
        ``COLUMN`` nodes.
        """
        # NOTE(review): 6 and 4 equal (3 rows, 2 columns) x 2 frames, so the
        # list argument presumably contributes no INDEX/COLUMN nodes --
        # confirm against RelationGraph.
        frame = pd.DataFrame([[5, 2], [2, 3], [2, 0]], columns=["A", "B"])

        # Every option used by this test is enabled.
        options = GraphOptions()
        for flag in ('COLUMN_NODES', 'INDEX_NODES', 'ADJACENCY_EDGES',
                     'EQUALITY_EDGES', 'NODE_TYPES', 'INDEX_EDGES'):
            setattr(options, flag, True)

        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output([frame, [1, 3, 4]], frame)

        def nodes_of_type(wanted):
            # Every graph node whose ``ntype`` equals ``wanted``.
            return [n for n in graph.nodes if n.ntype == wanted]

        index_type_nodes = nodes_of_type(GraphNodeType.INDEX)
        column_type_nodes = nodes_of_type(GraphNodeType.COLUMN)

        self.assertEqual(len(index_type_nodes), 6)
        self.assertEqual(len(column_type_nodes), 4)
示例#4
0
    def test_index_name_equality_edges(self):
        """Input column labels should be linked to the output's name nodes.

        After pivoting with ``index='foo'`` and ``columns='bar'``, the test
        expects some edge (either direction, any edge type) between:

        * the input ``COLUMN`` node at ``[-1,0]`` and the output
          ``INDEX_NAME`` node at ``[-1,-1]``;
        * the input ``COLUMN`` node at ``[-1,1]`` and the output
          ``COL_INDEX_NAME`` node at ``[-1,-1]``.
        """
        long_df = pd.DataFrame({
            'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
            'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
            'baz': [1, 2, 3, 4, 5, 6]
        })
        pivoted = long_df.pivot(index='foo', columns='bar', values='baz')

        # Graph configuration for this test, applied in the listed order.
        options = GraphOptions()
        for flag, enabled in (
                ('COLUMN_NODES', True),
                ('INDEX_NODES', True),
                ('INDEX_NAME_NODES', True),
                ('ADJACENCY_EDGES', False),
                ('EQUALITY_EDGES', True),
                ('NODE_TYPES', True),
                ('INDEX_EDGES', False),
        ):
            setattr(options, flag, enabled)

        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output([long_df], pivoted)

        def nodes_from(ntype, source_prefix):
            # Nodes of the given type whose source starts with the prefix
            # ("I" / "O" as used by the assertions below).
            return [
                n for n in graph.nodes
                if n.ntype == ntype and n.source.startswith(source_prefix)
            ]

        inp_col_nodes = nodes_from(GraphNodeType.COLUMN, "I")
        out_idx_name_nodes = nodes_from(GraphNodeType.INDEX_NAME, "O")
        out_col_idx_name_nodes = nodes_from(GraphNodeType.COL_INDEX_NAME, "O")

        def has_edge_between(first: GraphNode, second: GraphNode,
                             within: RelationGraph):
            # Direction-agnostic: an edge counts whichever endpoint is node1.
            return any(
                (edge.node1 == first and edge.node2 == second)
                or (edge.node1 == second and edge.node2 == first)
                for edge in within.edges)

        def node_at(candidates, identifier):
            # First candidate with this identifier; IndexError if there is
            # none, which fails the test.
            return [c for c in candidates if c.identifier == identifier][0]

        inp_foo_node = node_at(inp_col_nodes, '[-1,0]')
        inp_bar_node = node_at(inp_col_nodes, '[-1,1]')
        out_foo_node = node_at(out_idx_name_nodes, '[-1,-1]')
        out_bar_node = node_at(out_col_idx_name_nodes, '[-1,-1]')

        self.assertTrue(has_edge_between(inp_foo_node, out_foo_node, graph))
        self.assertTrue(has_edge_between(inp_bar_node, out_bar_node, graph))
示例#5
0
 def __init__(self, obj: pd.DataFrame, type_str: str, source: str,
              options: GraphOptions, **kwargs):
     """Initialise via the parent class with index/column features disabled.

     A shallow copy of ``options`` is made so the caller's object is left
     untouched; on that copy ``INDEX_NODES``, ``COLUMN_NODES``,
     ``INDEX_NAME_NODES``, ``INDEX_EDGES`` and ``INDEX_NAME_EDGES`` are all
     set to ``False``. ``obj``, ``type_str``, ``source`` and any extra
     keyword arguments are forwarded to the parent initialiser unchanged.
     """
     restricted = copy.copy(options)
     for flag in ('INDEX_NODES', 'COLUMN_NODES', 'INDEX_NAME_NODES',
                  'INDEX_EDGES', 'INDEX_NAME_EDGES'):
         setattr(restricted, flag, False)
     super().__init__(obj, type_str, source, restricted, **kwargs)
示例#6
0
    def test_index_name_nodes_multiindex(self):
        """Name-node counts for a frame with two-level row and column indexes.

        The same frame is used as the only input and as the output. Its row
        index has names ``['class', 'name']`` and its column index has names
        ``['name1', 'name2']``. The test asserts four ``INDEX_NAME`` nodes and
        four ``COL_INDEX_NAME`` nodes.
        """
        row_index = pd.MultiIndex.from_tuples(
            [('bird', 'falcon'), ('bird', 'parrot'),
             ('mammal', 'lion'), ('mammal', 'monkey')],
            names=['class', 'name'])
        col_index = pd.MultiIndex.from_tuples([('speed', 'max'),
                                               ('species', 'type')])
        frame = pd.DataFrame(
            [(389.0, 'fly'), (24.0, 'fly'), (80.5, 'run'), (np.nan, 'jump')],
            index=row_index,
            columns=col_index)
        frame.columns.names = ['name1', 'name2']

        # Graph configuration for this test, applied in the listed order.
        options = GraphOptions()
        for flag, enabled in (
                ('COLUMN_NODES', True),
                ('INDEX_NODES', True),
                ('INDEX_NAME_NODES', True),
                ('ADJACENCY_EDGES', True),
                ('EQUALITY_EDGES', True),
                ('NODE_TYPES', True),
                ('INDEX_EDGES', False),
        ):
            setattr(options, flag, enabled)

        graph: RelationGraph = RelationGraph(options)
        graph.from_input_output([frame], frame)

        def nodes_of_type(wanted):
            # Every graph node whose ``ntype`` equals ``wanted``.
            return [n for n in graph.nodes if n.ntype == wanted]

        index_name_nodes = nodes_of_type(GraphNodeType.INDEX_NAME)
        column_name_nodes = nodes_of_type(GraphNodeType.COL_INDEX_NAME)

        # Two names per axis, present in both the input and the output: 2 x 2.
        self.assertEqual(len(index_name_nodes), 4)
        self.assertEqual(len(column_name_nodes), 4)
示例#7
0
    def test_column_multi(self):
        """Verify the edges produced for a frame with two-level column labels.

        The input has a two-level column MultiIndex; the output is the input
        after ``stack().reset_index()``. The test checks that the relation
        graph contains:

        * ADJACENCY edges between neighbouring input column-label nodes;
        * INDEX edges from both label nodes of each input column to that
          column's cell nodes;
        * EQUALITY edges from input column labels to the matching output
          column labels and output cells.
        """
        column_labels = [['bar', 'bar', 'baz', 'baz'],
                         ['one', 'two', 'one', 'two']]
        col_index = pd.MultiIndex.from_tuples(list(zip(*column_labels)))
        input_df = pd.DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]],
                                columns=col_index)
        #   bar     baz
        #   one two one two
        # 0   0   1   2   3
        # 1   4   5   6   7
        output_df = input_df.stack().reset_index()
        #    level_0 level_1  bar  baz
        # 0        0     one    0    2
        # 1        0     two    1    3
        # 2        1     one    4    6
        # 3        1     two    5    7

        # Every option used by this test is enabled.
        options = GraphOptions()
        for flag in ('COLUMN_NODES', 'ADJACENCY_EDGES', 'EQUALITY_EDGES',
                     'NODE_TYPES', 'INDEX_EDGES'):
            setattr(options, flag, True)

        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([input_df], output_df)
        rel_graph_edges = rel_graph.edges

        def assert_edge_present(edge):
            # Fail with the full edge set in the message when ``edge`` is
            # missing from the graph.
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))

        # col_nodes[level][col]: level 0 is the '[-2,col]' label row, level 1
        # the '[-1,col]' label row of the input columns.
        col_nodes = [
            [
                GraphNode("I0", f'[{level},{col}]', GraphNodeType.COLUMN)
                for col in range(4)
            ]
            for level in (-2, -1)
        ]

        # (level, col) coordinate pairs of label nodes expected to be
        # adjacent, in the order they are asserted.
        adjacent_pairs = [
            ((0, 0), (1, 0)),
            ((0, 0), (0, 1)),
            ((1, 0), (1, 1)),
            ((1, 1), (1, 2)),
            ((0, 1), (1, 1)),
            ((0, 1), (0, 2)),
            ((0, 2), (1, 2)),
            ((0, 2), (0, 3)),
            ((1, 2), (1, 3)),
            ((0, 3), (1, 3)),
        ]
        adjacency_edges = [
            GraphEdge(col_nodes[lvl_a][col_a], col_nodes[lvl_b][col_b],
                      GraphEdgeType.ADJACENCY)
            for (lvl_a, col_a), (lvl_b, col_b) in adjacent_pairs
        ]
        for expected in adjacency_edges:
            assert_edge_present(expected)

        # indexing edges
        # input_coli_elems[col] holds the two INT cell nodes of input column
        # ``col`` (rows 0 and 1).
        input_coli_elems = [
            [
                GraphNode("I0", f'[{row},{col}]', GraphNodeType.INT)
                for row in range(2)
            ]
            for col in range(4)
        ]

        def check_edges(in_nodes, out_nodes, edge_type):
            # Require an edge of ``edge_type`` for every (in, out) pairing.
            pairings = ((src, dst) for src in in_nodes for dst in out_nodes)
            for src, dst in pairings:
                assert_edge_present(GraphEdge(src, dst, edge_type))

        # Pair each column's two label nodes with that column's cells.
        for label_pair, cells in zip(zip(*col_nodes), input_coli_elems):
            check_edges(list(label_pair), cells, GraphEdgeType.INDEX)

        # equality_edges
        top_labels, bottom_labels = col_nodes
        bars = top_labels[:2]
        bazs = top_labels[2:]
        ones = bottom_labels[0::2]
        twos = bottom_labels[1::2]

        # STR cells of output column 1 (rows 0..3), the 'level_1' column in
        # the layout sketched above.
        out_level1_cells = [
            GraphNode("O0", f'[{row},1]', GraphNodeType.STR)
            for row in range(4)
        ]

        out_col_2 = GraphNode("O0", '[-1,2]', GraphNodeType.COLUMN)
        out_col_3 = GraphNode("O0", '[-1,3]', GraphNodeType.COLUMN)

        check_edges(bars, [out_col_2], GraphEdgeType.EQUALITY)
        check_edges(bazs, [out_col_3], GraphEdgeType.EQUALITY)

        check_edges(ones, out_level1_cells[0::2], GraphEdgeType.EQUALITY)
        check_edges(twos, out_level1_cells[1::2], GraphEdgeType.EQUALITY)
示例#8
0
    def test_idx_multi(self):
        """Verify the edges produced for a frame with a two-level row index.

        The input has a two-level row MultiIndex and one column; the output
        is the input after ``unstack()``. The test checks that the relation
        graph contains the expected ADJACENCY edges among the input index
        nodes, INDEX edges from index nodes to cells (input and output), and
        EQUALITY edges from input index nodes to the output index/column
        nodes.
        """
        row_index = pd.MultiIndex.from_tuples([("bar", "one"),
                                               ("bar", "two")])
        input_df = pd.DataFrame([[0], [1]], index=row_index)
        #          0
        # bar one  0
        #     two  1
        output_df = input_df.unstack()
        #       0
        #     one two
        # bar   0   1

        # Every option used by this test is enabled.
        options = GraphOptions()
        for flag in ('COLUMN_NODES', 'INDEX_NODES', 'ADJACENCY_EDGES',
                     'EQUALITY_EDGES', 'NODE_TYPES', 'INDEX_EDGES'):
            setattr(options, flag, True)

        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([input_df], output_df)
        rel_graph_edges = rel_graph.edges

        def expect_edges(expected_edges):
            # Each listed edge must be present; on failure the message
            # includes the full edge set.
            for expected in expected_edges:
                self.assertTrue(
                    expected in rel_graph_edges,
                    "Could not find edge %s in set of edges:\n%s" %
                    (expected, rel_graph_edges))

        # Outer-level ("bar") index nodes: two on the input, one on the output.
        in_bar_row0 = GraphNode("I0", '[0,-2]', GraphNodeType.INDEX)
        in_bar_row1 = GraphNode("I0", '[1,-2]', GraphNodeType.INDEX)
        out_bar = GraphNode("O0", '[0,-1]', GraphNodeType.INDEX)

        # Inner-level ("one"/"two") index nodes of the input.
        in_one = GraphNode("I0", '[0,-1]', GraphNodeType.INDEX)
        in_two = GraphNode("I0", '[1,-1]', GraphNodeType.INDEX)

        # Output column-label nodes matched against "one"/"two" below.
        out_one = GraphNode("O0", '[-1,0]', GraphNodeType.COLUMN)
        out_two = GraphNode("O0", '[-1,1]', GraphNodeType.COLUMN)

        # Cell nodes of the input (one column, two rows) ...
        in_cell_row0 = GraphNode("I0", '[0,0]', GraphNodeType.INT)
        in_cell_row1 = GraphNode("I0", '[1,0]', GraphNodeType.INT)

        # ... and of the output (one row, two columns).
        out_cell_col0 = GraphNode("O0", '[0,0]', GraphNodeType.INT)
        out_cell_col1 = GraphNode("O0", '[0,1]', GraphNodeType.INT)

        expect_edges([
            GraphEdge(in_bar_row0, in_bar_row1, GraphEdgeType.ADJACENCY),
            GraphEdge(in_bar_row0, in_one, GraphEdgeType.ADJACENCY),
            GraphEdge(in_bar_row1, in_two, GraphEdgeType.ADJACENCY),
            GraphEdge(in_one, in_two, GraphEdgeType.ADJACENCY),
        ])

        expect_edges([
            GraphEdge(in_bar_row0, in_cell_row0, GraphEdgeType.INDEX),
            GraphEdge(in_one, in_cell_row0, GraphEdgeType.INDEX),
            GraphEdge(in_bar_row1, in_cell_row1, GraphEdgeType.INDEX),
            GraphEdge(in_two, in_cell_row1, GraphEdgeType.INDEX),
            GraphEdge(out_bar, out_cell_col0, GraphEdgeType.INDEX),
            GraphEdge(out_bar, out_cell_col1, GraphEdgeType.INDEX),
        ])

        expect_edges([
            GraphEdge(in_bar_row0, out_bar, GraphEdgeType.EQUALITY),
            GraphEdge(in_bar_row1, out_bar, GraphEdgeType.EQUALITY),
            GraphEdge(in_one, out_one, GraphEdgeType.EQUALITY),
            GraphEdge(in_two, out_two, GraphEdgeType.EQUALITY),
        ])