示例#1
0
    def get_node(self, val: Any, identifier='', options=None):
        if options is None:
            options = self.options

        return GraphNode.from_obj(val,
                                  source=self.source,
                                  identifier=identifier,
                                  options=options)
示例#2
0
    def setup_representor(self, collector: EdgeCollection):
        if self.representor is not None:
            return self.representor

        self.representor = GraphNode(self.source, '',
                                     GraphNodeType.REPRESENTOR)
        for n in self.nodes:
            collector.add_edge(self.representor, n, GraphEdgeType.REPRESENTOR)
            collector.add_edge(n, self.representor, GraphEdgeType.REPRESENTED)

        self.add_node(self.representor)
        return self.representor
示例#3
0
    def test_groupby_input(self):
        df = pd.DataFrame({
            "Name": ["Alice", "Bob", "Mallory", "Mallory", "Bob", "Mallory"],
            "City": [
                "Seattle", "Seattle", "Portland", "Seattle", "Seattle",
                "Portland"
            ]
        })
        input_ = df.groupby("Name")
        output = input_.count().reset_index()
        options = GraphOptions()
        options.NODE_TYPES = True
        options.ADJACENCY_EDGES = False
        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([input_], output)
        rel_graph_edges = rel_graph.edges
        alice_nodes_in = [GraphNode("I0_0", '[0,0]', GraphNodeType.STR)]

        alice_nodes_out = [GraphNode("O0", '[0,0]', GraphNodeType.STR)]

        bob_nodes_in = [
            GraphNode("I0_1", '[0,0]', GraphNodeType.STR),
            GraphNode("I0_1", '[1,0]', GraphNodeType.STR)
        ]
        bob_nodes_out = [GraphNode("O0", '[1,0]', GraphNodeType.STR)]

        mallory_nodes_in = [
            GraphNode("I0_2", '[0,0]', GraphNodeType.STR),
            GraphNode("I0_2", '[1,0]', GraphNodeType.STR),
            GraphNode("I0_2", '[2,0]', GraphNodeType.STR)
        ]
        mallory_nodes_out = [GraphNode("O0", '[2,0]', GraphNodeType.STR)]

        def check_edges(in_nodes, out_nodes):
            for in_node in in_nodes:
                for out_node in out_nodes:
                    edge = GraphEdge(in_node, out_node, GraphEdgeType.EQUALITY)
                    self.assertTrue(
                        edge in rel_graph_edges,
                        "Could not find edge %s in set of edges:\n%s" %
                        (edge, rel_graph_edges))

        check_edges(alice_nodes_in, alice_nodes_out)
        check_edges(bob_nodes_in, bob_nodes_out)
        check_edges(mallory_nodes_in, mallory_nodes_out)
示例#4
0
 def test_basic_max(self):
     input_df = pd.DataFrame([[1, 2], [2, 3], [2, 0]])
     input_00 = GraphNode("I0", '[0,0]', get_node_type(input_df.iat[0, 0]))
     input_01 = GraphNode("I0", '[0,1]', get_node_type(input_df.iat[0, 1]))
     input_10 = GraphNode("I0", '[1,0]', get_node_type(input_df.iat[1, 0]))
     input_11 = GraphNode("I0", '[1,1]', get_node_type(input_df.iat[1, 1]))
     input_20 = GraphNode("I0", '[2,0]', get_node_type(input_df.iat[2, 0]))
     input_21 = GraphNode("I0", '[2,1]', get_node_type(input_df.iat[2, 1]))
     output_df = pd.DataFrame([[2, 3]])
     output_00 = GraphNode("O0", '[0,0]', get_node_type(output_df.iat[0,
                                                                      0]))
     output_01 = GraphNode("O0", '[0,1]', get_node_type(output_df.iat[0,
                                                                      1]))
     options = GraphOptions()
     options.NODE_TYPES = True
     rel_graph: RelationGraph = RelationGraph(options)
     rel_graph.from_input_output([input_df], output_df)
     rel_graph_edges = rel_graph.edges
     #  positional edges
     positional_edges = [
         GraphEdge(input_00, input_01, GraphEdgeType.ADJACENCY),
         GraphEdge(input_00, input_10, GraphEdgeType.ADJACENCY),
         GraphEdge(input_10, input_11, GraphEdgeType.ADJACENCY),
         GraphEdge(input_10, input_20, GraphEdgeType.ADJACENCY),
         GraphEdge(input_20, input_21, GraphEdgeType.ADJACENCY),
         GraphEdge(input_01, input_11, GraphEdgeType.ADJACENCY),
         GraphEdge(input_11, input_21, GraphEdgeType.ADJACENCY),
         GraphEdge(output_00, output_01, GraphEdgeType.ADJACENCY)
     ]
     for edge in positional_edges:
         self.assertTrue(
             edge in rel_graph_edges,
             "Could not find edge %s in set of edges:\n%s" %
             (edge, rel_graph_edges))
     #  equality edges
     equality_edges = [
         GraphEdge(input_10, output_00, GraphEdgeType.EQUALITY),
         GraphEdge(input_20, output_00, GraphEdgeType.EQUALITY),
         GraphEdge(input_01, output_00,
                   GraphEdgeType.EQUALITY),  # redundant
         GraphEdge(input_11, output_01, GraphEdgeType.EQUALITY)
     ]
     for edge in equality_edges:
         self.assertTrue(
             edge in rel_graph_edges,
             "Could not find edge %s in set of edges:\n%s" %
             (edge, rel_graph_edges))
    def get_encoding(self, get_mapping=False, get_reverse_mapping=False):
        encoding, node_to_int = super().get_encoding(get_mapping=True)
        encoding['operator'] = 'OrderedSubsets'
        encoding['elements'] = [node_to_int[x] for x in self.elements]
        reverse_mapping = {
            node_to_int[x]: idx
            for idx, x in enumerate(self.elements)
        }
        if self.selected_nodes is not None:
            encoding['selected'] = [
                node_to_int[x] for x in self.selected_nodes
            ]

        #  Create a terminal token and add it
        terminal_node = GraphNode(source='T',
                                  identifier='',
                                  ntype=GraphNodeType.TERMINAL)
        node_to_int[terminal_node] = max(node_to_int.values()) + 1
        encoding['elements'].append(node_to_int[terminal_node])
        encoding['terminal'] = node_to_int[terminal_node]
        encoding['node_features'].append(terminal_node.get_encoding())
        reverse_mapping[node_to_int[terminal_node]] = len(self.elements)

        if self.selected_nodes is not None:
            encoding['selected'].append(node_to_int[terminal_node])

        if get_mapping:
            if get_reverse_mapping:
                return encoding, reverse_mapping, node_to_int

            return encoding, node_to_int

        if get_reverse_mapping:
            return encoding, reverse_mapping

        return encoding
示例#6
0
 def test_dict(self):
     input_df = pd.DataFrame([[1, 2], [3, 4]])
     input_00 = GraphNode("I0", '[0,0]', get_node_type(input_df.iat[0, 0]))
     input_01 = GraphNode("I0", '[0,1]', get_node_type(input_df.iat[0, 1]))
     input_10 = GraphNode("I0", '[1,0]', get_node_type(input_df.iat[1, 0]))
     input_11 = GraphNode("I0", '[1,1]', get_node_type(input_df.iat[1, 1]))
     output = {"A": [1, 3], "B": [2, 4]}
     output_00 = GraphNode("O0", '[0,0]', get_node_type(output['A'][0]))
     output_01 = GraphNode("O0", '[0,1]', get_node_type(output['B'][0]))
     output_10 = GraphNode("O0", '[1,0]', get_node_type(output['A'][1]))
     output_11 = GraphNode("O0", '[1,1]', get_node_type(output['B'][1]))
     options = GraphOptions()
     options.NODE_TYPES = True
     rel_graph: RelationGraph = RelationGraph(options)
     rel_graph.from_input_output([input_df], output)
     rel_graph_edges = rel_graph.edges
     positional_edges = [
         GraphEdge(input_00, input_01, GraphEdgeType.ADJACENCY),
         GraphEdge(input_00, input_10, GraphEdgeType.ADJACENCY),
         GraphEdge(input_10, input_11, GraphEdgeType.ADJACENCY),
         GraphEdge(input_01, input_11, GraphEdgeType.ADJACENCY),
         GraphEdge(output_00, output_01, GraphEdgeType.ADJACENCY),
         GraphEdge(output_00, output_10, GraphEdgeType.ADJACENCY),
         GraphEdge(output_10, output_11, GraphEdgeType.ADJACENCY),
         GraphEdge(output_01, output_11, GraphEdgeType.ADJACENCY)
     ]
     for edge in positional_edges:
         self.assertTrue(
             edge in rel_graph_edges,
             "Could not find edge %s in set of edges:\n%s" %
             (edge, rel_graph_edges))
     equality_edges = [
         GraphEdge(input_00, output_00, GraphEdgeType.EQUALITY),
         GraphEdge(input_10, output_10, GraphEdgeType.EQUALITY),
         GraphEdge(input_01, output_01, GraphEdgeType.EQUALITY),
         GraphEdge(input_11, output_11, GraphEdgeType.EQUALITY)
     ]
     for edge in equality_edges:
         self.assertTrue(
             edge in rel_graph_edges,
             "Could not find edge %s in set of edges:\n%s" %
             (edge, rel_graph_edges))
示例#7
0
    def test_substr_edges(self):
        df = pd.DataFrame({
            'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
            'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
            'baz': [1, 2, 3, 4, 5, 6]
        })
        out = pd.DataFrame({
            "mrr": ["wo", "no"],
            'asdasd': ["A_1", "B_4"],
            'nostr': [33, 12]
        })

        options = GraphOptions()
        options.SUBSTR_EDGES = True

        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([df], out)

        def check_edges(in_nodes, out_nodes):
            for in_node in in_nodes:
                for out_node in out_nodes:
                    edge = GraphEdge(in_node, out_node, GraphEdgeType.SUBSTR)
                    self.assertTrue(
                        edge in rel_graph.edges,
                        "Could not find edge %s in set of edges:\n%s" %
                        (edge, rel_graph.edges))

        # test substrings from out to in
        two_nodes = [
            GraphNode("I0", '[3,0]', GraphNodeType.STR),
            GraphNode("I0", '[4,0]', GraphNodeType.STR),
            GraphNode("I0", '[5,0]', GraphNodeType.STR)
        ]
        wo_node = GraphNode("O0", '[0,0]', GraphNodeType.STR)

        check_edges(two_nodes, [wo_node])

        # test substrings from in to out
        A_in = [
            GraphNode("I0", '[0,1]', GraphNodeType.STR),
            GraphNode("I0", '[3,1]', GraphNodeType.STR)
        ]
        A_out = [GraphNode("O0", '[0,1]', GraphNodeType.STR)]
        B_in = [
            GraphNode("I0", '[1,1]', GraphNodeType.STR),
            GraphNode("I0", '[4,1]', GraphNodeType.STR)
        ]
        B_out = [GraphNode("O0", '[1,1]', GraphNodeType.STR)]
        check_edges(A_in, A_out)
        check_edges(B_in, B_out)

        # test substrings involving non-strings
        one_in = [
            GraphNode("I0", '[0,2]', GraphNodeType.INT),
            GraphNode("I0", '[1,-1]', GraphNodeType.INDEX)
        ]
        one_out = [GraphNode("O0", '[0,1]', GraphNodeType.STR)]
        four_in = [
            GraphNode("I0", '[3,2]', GraphNodeType.INT),
            GraphNode("I0", '[4,-1]', GraphNodeType.INDEX)
        ]
        four_out = [GraphNode("O0", '[1,1]', GraphNodeType.STR)]
        check_edges(one_in, one_out)
        check_edges(four_in, four_out)

        # test nothing else
        self.assertEqual(
            11,
            len([
                e for e in rel_graph.edges if e.etype == GraphEdgeType.SUBSTR
            ]))
示例#8
0
    def test_column_multi(self):
        column_labels = [['bar', 'bar', 'baz', 'baz'],
                         ['one', 'two', 'one', 'two']]
        tuples = list(zip(*column_labels))
        col_index = pd.MultiIndex.from_tuples(tuples)
        data = [[0, 1, 2, 3], [4, 5, 6, 7]]
        input_df = pd.DataFrame(data, columns=col_index)
        #   bar     baz
        #   one two one two
        # 0   0   1   2   3
        # 1   4   5   6   7
        output_df = input_df.stack().reset_index()
        #    level_0 level_1  bar  baz
        # 0        0     one    0    2
        # 1        0     two    1    3
        # 2        1     one    4    6
        # 3        1     two    5    7

        options = GraphOptions()
        options.COLUMN_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True
        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([input_df], output_df)
        rel_graph_edges = rel_graph.edges

        col_nodes = [
            [
                GraphNode("I0", '[-2,0]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-2,1]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-2,2]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-2,3]', GraphNodeType.COLUMN)
            ],
            [
                GraphNode("I0", '[-1,0]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-1,1]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-1,2]', GraphNodeType.COLUMN),
                GraphNode("I0", '[-1,3]', GraphNodeType.COLUMN)
            ],
        ]

        adjacency_edges = [
            GraphEdge(col_nodes[0][0], col_nodes[1][0],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][0], col_nodes[0][1],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[1][0], col_nodes[1][1],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[1][1], col_nodes[1][2],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][1], col_nodes[1][1],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][1], col_nodes[0][2],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][2], col_nodes[1][2],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][2], col_nodes[0][3],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[1][2], col_nodes[1][3],
                      GraphEdgeType.ADJACENCY),
            GraphEdge(col_nodes[0][3], col_nodes[1][3],
                      GraphEdgeType.ADJACENCY)
        ]

        for edge in adjacency_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))

        # indexing edges
        input_coli_elems = [[
            GraphNode("I0", '[0,0]', GraphNodeType.INT),
            GraphNode("I0", '[1,0]', GraphNodeType.INT)
        ],
                            [
                                GraphNode("I0", '[0,1]', GraphNodeType.INT),
                                GraphNode("I0", '[1,1]', GraphNodeType.INT)
                            ],
                            [
                                GraphNode("I0", '[0,2]', GraphNodeType.INT),
                                GraphNode("I0", '[1,2]', GraphNodeType.INT)
                            ],
                            [
                                GraphNode("I0", '[0,3]', GraphNodeType.INT),
                                GraphNode("I0", '[1,3]', GraphNodeType.INT)
                            ]]

        def check_edges(in_nodes, out_nodes, edge_type):
            for in_node in in_nodes:
                for out_node in out_nodes:
                    edge = GraphEdge(in_node, out_node, edge_type)
                    self.assertTrue(
                        edge in rel_graph_edges,
                        "Could not find edge %s in set of edges:\n%s" %
                        (edge, rel_graph_edges))

        for i in range(4):
            in_nodes = [col_nodes[0][i], col_nodes[1][i]]
            out_nodes = input_coli_elems[i]
            check_edges(in_nodes, out_nodes, GraphEdgeType.INDEX)

        # equality_edges
        bars = [col_nodes[0][0], col_nodes[0][1]]
        bazs = [col_nodes[0][2], col_nodes[0][3]]
        ones = [col_nodes[1][0], col_nodes[1][2]]
        twos = [col_nodes[1][1], col_nodes[1][3]]

        out_01 = GraphNode("O0", '[0,1]', GraphNodeType.STR)
        out_11 = GraphNode("O0", '[1,1]', GraphNodeType.STR)
        out_21 = GraphNode("O0", '[2,1]', GraphNodeType.STR)
        out_31 = GraphNode("O0", '[3,1]', GraphNodeType.STR)

        out_col_2 = GraphNode("O0", '[-1,2]', GraphNodeType.COLUMN)
        out_col_3 = GraphNode("O0", '[-1,3]', GraphNodeType.COLUMN)

        check_edges(bars, [out_col_2], GraphEdgeType.EQUALITY)
        check_edges(bazs, [out_col_3], GraphEdgeType.EQUALITY)

        check_edges(ones, [out_01, out_21], GraphEdgeType.EQUALITY)
        check_edges(twos, [out_11, out_31], GraphEdgeType.EQUALITY)
示例#9
0
    def test_idx_multi(self):
        tuples = [("bar", "one"), ("bar", "two")]
        index = pd.MultiIndex.from_tuples(tuples)
        data = [[0], [1]]
        input_df = pd.DataFrame(data, index=index)
        #          0
        # bar one  0
        #     two  1
        output_df = input_df.unstack()
        #       0
        #     one two
        # bar   0   1
        options = GraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True
        rel_graph: RelationGraph = RelationGraph(options)
        rel_graph.from_input_output([input_df], output_df)
        rel_graph_edges = rel_graph.edges

        bar_in_0 = GraphNode("I0", '[0,-2]', GraphNodeType.INDEX)
        bar_in_1 = GraphNode("I0", '[1,-2]', GraphNodeType.INDEX)
        bar_out = GraphNode("O0", '[0,-1]', GraphNodeType.INDEX)

        one_in = GraphNode("I0", '[0,-1]', GraphNodeType.INDEX)
        two_in = GraphNode("I0", '[1,-1]', GraphNodeType.INDEX)

        one_out = GraphNode("O0", '[-1,0]', GraphNodeType.COLUMN)
        two_out = GraphNode("O0", '[-1,1]', GraphNodeType.COLUMN)

        in_0 = GraphNode("I0", '[0,0]', GraphNodeType.INT)
        in_1 = GraphNode("I0", '[1,0]', GraphNodeType.INT)

        out_0 = GraphNode("O0", '[0,0]', GraphNodeType.INT)
        out_1 = GraphNode("O0", '[0,1]', GraphNodeType.INT)

        adjacency_edges = [
            GraphEdge(bar_in_0, bar_in_1, GraphEdgeType.ADJACENCY),
            GraphEdge(bar_in_0, one_in, GraphEdgeType.ADJACENCY),
            GraphEdge(bar_in_1, two_in, GraphEdgeType.ADJACENCY),
            GraphEdge(one_in, two_in, GraphEdgeType.ADJACENCY)
        ]

        for edge in adjacency_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))
        indexing_edges = [
            GraphEdge(bar_in_0, in_0, GraphEdgeType.INDEX),
            GraphEdge(one_in, in_0, GraphEdgeType.INDEX),
            GraphEdge(bar_in_1, in_1, GraphEdgeType.INDEX),
            GraphEdge(two_in, in_1, GraphEdgeType.INDEX),
            GraphEdge(bar_out, out_0, GraphEdgeType.INDEX),
            GraphEdge(bar_out, out_1, GraphEdgeType.INDEX)
        ]

        for edge in indexing_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))

        equality_edges = [
            GraphEdge(bar_in_0, bar_out, GraphEdgeType.EQUALITY),
            GraphEdge(bar_in_1, bar_out, GraphEdgeType.EQUALITY),
            GraphEdge(one_in, one_out, GraphEdgeType.EQUALITY),
            GraphEdge(two_in, two_out, GraphEdgeType.EQUALITY)
        ]

        for edge in equality_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))