def check_edges(in_nodes, out_nodes, edge_type):
    """Assert that an edge of ``edge_type`` exists for every node pairing.

    One ``RelationGraphEdge`` is built for each combination of a node from
    ``in_nodes`` with a node from ``out_nodes`` and must be a member of
    ``rel_graph_edges``.

    NOTE(review): ``self`` and ``rel_graph_edges`` are not parameters; they
    are expected to be supplied by the surrounding scope.
    """
    node_pairs = ((source, target)
                  for source in in_nodes
                  for target in out_nodes)
    for source, target in node_pairs:
        candidate = RelationGraphEdge(source, target, edge_type)
        self.assertTrue(
            candidate in rel_graph_edges,
            "Could not find edge %s in set of edges:\n%s" %
            (candidate, rel_graph_edges))
def test_max_series(self):
    # Builds the relation graph for a 3x2 frame and the result of
    # pd.DataFrame.max on it, then checks that the expected adjacency and
    # equality edges are members of the graph's edge collection.
    input_df = pd.DataFrame([[1, 2], [2, 3], [2, 0]])

    # Input cell nodes keyed by (row, col), created in row-major order.
    cell = {
        (row, col): RelationGraphNode("I0", (row, col),
                                      get_node_type(input_df.iat[row, col]))
        for row in range(3)
        for col in range(2)
    }

    output = pd.DataFrame.max(input_df)
    # The i-th output value is given the coordinate (i, 0).
    result = [
        RelationGraphNode("O0", (pos, 0), get_node_type(output.iat[pos]))
        for pos in range(2)
    ]

    opts = RelationGraphOptions()
    opts.NODE_TYPES = True
    graph: RelationGraph = RelationGraph.build_relation_graph(
        [input_df], output, opts)
    graph_edges = graph.edges

    def require(expected_edges):
        """Fail the test on the first expected edge missing from the graph."""
        for expected in expected_edges:
            self.assertTrue(
                expected in graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (expected, graph_edges))

    # positional edges
    adjacent_pairs = [
        (cell[0, 0], cell[0, 1]),
        (cell[0, 0], cell[1, 0]),
        (cell[1, 0], cell[1, 1]),
        (cell[1, 0], cell[2, 0]),
        (cell[2, 0], cell[2, 1]),
        (cell[0, 1], cell[1, 1]),
        (cell[1, 1], cell[2, 1]),
        (result[0], result[1]),
    ]
    require([
        RelationGraphEdge(first, second, RelationGraphEdgeType.ADJACENCY)
        for first, second in adjacent_pairs
    ])

    # equality edges
    equal_pairs = [
        (cell[1, 0], result[0]),
        (cell[2, 0], result[0]),
        (cell[0, 1], result[0]),  # redundant
        (cell[1, 1], result[1]),
    ]
    require([
        RelationGraphEdge(first, second, RelationGraphEdgeType.EQUALITY)
        for first, second in equal_pairs
    ])
def test_idx_multi(self):
    # Unstacks a frame that has a two-level row index and checks that the
    # relation graph built from (input, output) contains the expected
    # adjacency, index and equality edges.
    row_index = pd.MultiIndex.from_tuples([("bar", "one"), ("bar", "two")])
    input_df = pd.DataFrame([[0], [1]], index=row_index)
    #          0
    # bar one  0
    #     two  1
    output_df = input_df.unstack()
    #       0
    #     one two
    # bar   0   1

    opts = RelationGraphOptions()
    opts.COLUMN_NODES = True
    opts.INDEX_NODES = True
    opts.ADJACENCY_EDGES = True
    opts.EQUALITY_EDGES = True
    opts.NODE_TYPES = True
    opts.INDEX_EDGES = True
    graph: RelationGraph = RelationGraph.build_relation_graph(
        [input_df], output_df, opts)
    graph_edges = graph.edges

    node_kind = RelationGraphNodeType
    edge_kind = RelationGraphEdgeType

    # Index-label nodes. Going by the variable names, column -2 holds the
    # "bar" level and column -1 the "one"/"two" level of the input index.
    bar_in_0 = RelationGraphNode("I0", (0, -2), node_kind.INDEX)
    bar_in_1 = RelationGraphNode("I0", (1, -2), node_kind.INDEX)
    bar_out = RelationGraphNode("O0", (0, -1), node_kind.INDEX)
    one_in = RelationGraphNode("I0", (0, -1), node_kind.INDEX)
    two_in = RelationGraphNode("I0", (1, -1), node_kind.INDEX)
    # Column-label nodes of the output (row -1).
    one_out = RelationGraphNode("O0", (-1, 0), node_kind.COLUMN)
    two_out = RelationGraphNode("O0", (-1, 1), node_kind.COLUMN)
    # Data cells.
    in_0 = RelationGraphNode("I0", (0, 0), node_kind.INT)
    in_1 = RelationGraphNode("I0", (1, 0), node_kind.INT)
    out_0 = RelationGraphNode("O0", (0, 0), node_kind.INT)
    out_1 = RelationGraphNode("O0", (0, 1), node_kind.INT)

    def expect_edges(node_pairs, kind):
        """Build every edge of ``kind`` first, then assert each is present."""
        wanted = [
            RelationGraphEdge(source, target, kind)
            for source, target in node_pairs
        ]
        for edge in wanted:
            self.assertTrue(
                edge in graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, graph_edges))

    expect_edges(
        [
            (bar_in_0, bar_in_1),
            (bar_in_0, one_in),
            (bar_in_1, two_in),
            (one_in, two_in),
        ],
        edge_kind.ADJACENCY,
    )

    expect_edges(
        [
            (bar_in_0, in_0),
            (one_in, in_0),
            (bar_in_1, in_1),
            (two_in, in_1),
            (bar_out, out_0),
            (bar_out, out_1),
        ],
        edge_kind.INDEX,
    )

    expect_edges(
        [
            (bar_in_0, bar_out),
            (bar_in_1, bar_out),
            (one_in, one_out),
            (two_in, two_out),
        ],
        edge_kind.EQUALITY,
    )
def test_column_multi(self):
    # Stacks a frame that has two-level column labels, resets the index,
    # and checks that the relation graph built from (input, output)
    # contains the expected adjacency, index and equality edges.
    label_levels = [['bar', 'bar', 'baz', 'baz'],
                    ['one', 'two', 'one', 'two']]
    header_index = pd.MultiIndex.from_tuples(list(zip(*label_levels)))
    input_df = pd.DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]],
                            columns=header_index)
    #   bar     baz
    #   one two one two
    # 0   0   1   2   3
    # 1   4   5   6   7
    output_df = input_df.stack().reset_index()
    #    level_0 level_1  bar  baz
    # 0        0     one    0    2
    # 1        0     two    1    3
    # 2        1     one    4    6
    # 3        1     two    5    7

    opts = RelationGraphOptions()
    opts.COLUMN_NODES = True
    opts.ADJACENCY_EDGES = True
    opts.EQUALITY_EDGES = True
    opts.NODE_TYPES = True
    opts.INDEX_EDGES = True
    graph: RelationGraph = RelationGraph.build_relation_graph(
        [input_df], output_df, opts)
    graph_edges = graph.edges

    node_kind = RelationGraphNodeType
    edge_kind = RelationGraphEdgeType

    # header[0] holds the row -2 label nodes, header[1] the row -1 ones;
    # each has one node per input column.
    header = [[
        RelationGraphNode("I0", (level, col), node_kind.COLUMN)
        for col in range(4)
    ] for level in (-2, -1)]

    # Pairs of (level, column) positions within `header` expected to be
    # joined by an adjacency edge.
    adjacent_positions = [
        ((0, 0), (1, 0)),
        ((0, 0), (0, 1)),
        ((1, 0), (1, 1)),
        ((1, 1), (1, 2)),
        ((0, 1), (1, 1)),
        ((0, 1), (0, 2)),
        ((0, 2), (1, 2)),
        ((0, 2), (0, 3)),
        ((1, 2), (1, 3)),
        ((0, 3), (1, 3)),
    ]
    header_adjacency = [
        RelationGraphEdge(header[a_lvl][a_col], header[b_lvl][b_col],
                          edge_kind.ADJACENCY)
        for (a_lvl, a_col), (b_lvl, b_col) in adjacent_positions
    ]
    for expected in header_adjacency:
        self.assertTrue(
            expected in graph_edges,
            "Could not find edge %s in set of edges:\n%s" %
            (expected, graph_edges))

    # indexing edges
    # cells_by_column[c] lists the input data cells of column c, top down.
    cells_by_column = [[
        RelationGraphNode("I0", (row, col), node_kind.INT)
        for row in range(2)
    ] for col in range(4)]

    def expect_all_pairs(sources, targets, kind):
        """Assert an edge of ``kind`` exists for each source/target pairing."""
        for source in sources:
            for target in targets:
                candidate = RelationGraphEdge(source, target, kind)
                self.assertTrue(
                    candidate in graph_edges,
                    "Could not find edge %s in set of edges:\n%s" %
                    (candidate, graph_edges))

    # Both label nodes above a column must index every cell in that column.
    for upper, lower, cells in zip(header[0], header[1], cells_by_column):
        expect_all_pairs([upper, lower], cells, edge_kind.INDEX)

    # equality_edges
    bars = header[0][:2]
    bazs = header[0][2:]
    ones = header[1][0::2]
    twos = header[1][1::2]

    # Output column 1 ("level_1") cells, rows 0..3.
    level_1 = [
        RelationGraphNode("O0", (row, 1), node_kind.STR) for row in range(4)
    ]
    out_col_2 = RelationGraphNode("O0", (-1, 2), node_kind.COLUMN)
    out_col_3 = RelationGraphNode("O0", (-1, 3), node_kind.COLUMN)

    expect_all_pairs(bars, [out_col_2], edge_kind.EQUALITY)
    expect_all_pairs(bazs, [out_col_3], edge_kind.EQUALITY)
    expect_all_pairs(ones, [level_1[0], level_1[2]], edge_kind.EQUALITY)
    expect_all_pairs(twos, [level_1[1], level_1[3]], edge_kind.EQUALITY)
def test_dict(self):
    # Builds the relation graph for a 2x2 frame paired with a plain dict
    # output and checks that the expected adjacency and equality edges are
    # members of the graph's edge collection.
    input_df = pd.DataFrame([[1, 2], [3, 4]])

    # Input cell nodes keyed by (row, col), created in row-major order.
    source = {
        (row, col): RelationGraphNode("I0", (row, col),
                                      get_node_type(input_df.iat[row, col]))
        for row in range(2)
        for col in range(2)
    }

    output = {"A": [1, 3], "B": [2, 4]}
    # Output nodes: key "A" is given column 0 and key "B" column 1; the
    # position inside each list is the row.
    target = {
        (row, col): RelationGraphNode("O0", (row, col),
                                      get_node_type(output[key][row]))
        for row in range(2)
        for col, key in enumerate(("A", "B"))
    }

    opts = RelationGraphOptions()
    opts.NODE_TYPES = True
    graph: RelationGraph = RelationGraph.build_relation_graph(
        [input_df], output, opts)
    graph_edges = graph.edges

    def require(expected_edges):
        """Fail the test on the first expected edge missing from the graph."""
        for expected in expected_edges:
            self.assertTrue(
                expected in graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (expected, graph_edges))

    adjacent_pairs = [
        (source[0, 0], source[0, 1]),
        (source[0, 0], source[1, 0]),
        (source[1, 0], source[1, 1]),
        (source[0, 1], source[1, 1]),
        (target[0, 0], target[0, 1]),
        (target[0, 0], target[1, 0]),
        (target[1, 0], target[1, 1]),
        (target[0, 1], target[1, 1]),
    ]
    require([
        RelationGraphEdge(first, second, RelationGraphEdgeType.ADJACENCY)
        for first, second in adjacent_pairs
    ])

    equal_pairs = [
        (source[0, 0], target[0, 0]),
        (source[1, 0], target[1, 0]),
        (source[0, 1], target[0, 1]),
        (source[1, 1], target[1, 1]),
    ]
    require([
        RelationGraphEdge(first, second, RelationGraphEdgeType.EQUALITY)
        for first, second in equal_pairs
    ])