def datagen_inner_join():
    """Generate a random pair of dataframes whose inner join is non-empty.

    Two random frames are produced, a random subset of df1's columns is chosen
    as the join key, and the same number of df2's columns are renamed to those
    key names. The key cells of both frames are then overwritten with tuples
    drawn from the combined pool of key tuples of both frames, which makes it
    likely that the frames share key values. Candidates whose join fails or
    yields an empty result are discarded and generation is retried.

    Returns:
        A pair ``([df1, df2], {})``: the positional inputs for the component
        and an empty keyword-argument dict.
    """
    while True:
        df1 = generate_random_dataframe(
            DfConfig(min_width=MIN_COLS, max_width=3, max_index_levels=1, max_column_levels=1))
        on_columns = random.sample(list(df1.columns), random.randint(1, df1.shape[1]))

        # df2 needs at least as many columns as there are join keys, because
        # that many of its columns get renamed to the key names below.
        df2_width = random.randint(len(on_columns), MAX_COLS)
        df2 = generate_random_dataframe(
            DfConfig(num_cols=df2_width, max_index_levels=1, max_column_levels=1, col_prefix="DF2"))
        replaced_cols = random.sample(list(df2.columns), len(on_columns))
        df2 = df2.rename(columns=dict(zip(replaced_cols, on_columns)))

        # Pool the key tuples of both frames and re-deal them to each frame.
        df1_items = [tuple(i) for i in df1.loc[:, on_columns].values]
        df2_items = [tuple(i) for i in df2.loc[:, on_columns].values]
        item_pool = df1_items + df2_items
        new_df1_items = random.sample(item_pool, df1.shape[0])
        new_df2_items = random.sample(item_pool, df2.shape[0])

        # NOTE(review): enumerate() positions are used as .loc labels, which
        # assumes the generated frames carry a 0..n-1 index -- confirm.
        for idx, items in enumerate(new_df1_items):
            df1.loc[idx, on_columns] = items
        for idx, items in enumerate(new_df2_items):
            df2.loc[idx, on_columns] = items

        try:
            res = RInterpreter.inner_join(df1, df2)
            if res.shape[0] == 0 or res.shape[1] == 0:
                continue
        except Exception:
            # Best-effort: any failure of the join just means "try another
            # candidate". Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit able to break out of this loop.
            continue

        return [df1, df2], {}
def gen_inner_join(df1: pd.DataFrame, df2: pd.DataFrame, g_df1: DataFrameGraph, g_df2: DataFrameGraph,
                   datagen: bool = False):
    """
    INNER_JOIN
    ------
    Joins df1 and df2 on the columns they have in common and builds the graph
    relating the inputs to the output.

    Example:

    inner_join(df1, df2)

            c1  c2  c3        c4  c2  c5          c1  c2  c3  c4  c5
        0   a   b   c     0   x   g   z       0   a   b   c   w   u
        1   d   g   c     1   w   b   u  ->   1   f   b   h   w   u
        2   f   b   h  ,  2   y   g   j       2   d   g   c   x   z
                                              3   d   g   c   y   j

    ---------------
    Graph Abstraction:
    - EQUAL edges from every column node of df1 and df2 to the same-named column node of the output.
    - EQUAL edges from the cells of every df1/df2 row that made it into the output to the cells of
      the output row it produced, plus EQUAL edges (both directions) between the paired
      df1/df2 cells in the shared columns.
    - DELETE edges from the cells of every row that was dropped to the output's deletion node.
    - EQUAL edges from each input's deletion node to the output's deletion node.

    The ``datagen`` flag is accepted but not read anywhere in this function.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    result = RInterpreter.inner_join(df1, df2)
    call_str = f"inner_join({{inp1}}, {{inp2}})"

    # -------------------------------------------------------------------------------------------------------------- #
    #  Graph Construction
    # -------------------------------------------------------------------------------------------------------------- #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df1, g_df2, g_res])
    added_edges: List[Edge] = []

    # Column label -> column node, for each of the three frames.
    left_cols = {node.value: node for node in g_df1.columns}
    right_cols = {node.value: node for node in g_df2.columns}
    out_cols = {node.value: node for node in g_res.columns}

    # Column-level EQUAL edges: df1's columns first, then df2's.
    for source_cols in (left_cols, right_cols):
        for label, node in source_cols.items():
            added_edges.append(Edge(node, out_cols[label], ELabel.EQUAL))

    # The join key is whatever column labels the two inputs share.
    merge_cols = list(set(left_cols.keys()) & set(right_cols.keys()))

    def _rows_by_key(frame):
        """Group the index labels of ``frame`` by their tuple of join-key values."""
        grouped = collections.defaultdict(list)
        for label, key_values in zip(frame.index, frame.loc[:, merge_cols].values):
            grouped[tuple(key_values)].append(label)
        return grouped

    left_rows = _rows_by_key(df1)
    right_rows = _rows_by_key(df2)
    out_rows = _rows_by_key(result)

    # Start by assuming every input row is dropped; rows are struck off as their key shows up in the result.
    unmatched_left = set(df1.index)
    unmatched_right = set(df2.index)

    for key, out_labels in out_rows.items():
        left_labels = left_rows[key]
        right_labels = right_rows[key]
        unmatched_left.difference_update(left_labels)
        unmatched_right.difference_update(right_labels)

        # Output rows for one key are paired, in order, with the cross product of the
        # df1 rows and df2 rows carrying that key.
        row_pairs = itertools.product(left_labels, right_labels)
        for out_label, (left_label, right_label) in zip(out_labels, row_pairs):
            added_edges.extend(
                Edge(g_df1.loc[left_label, col], g_res.loc[out_label, col], ELabel.EQUAL)
                for col in df1.columns)
            added_edges.extend(
                Edge(g_df2.loc[right_label, col], g_res.loc[out_label, col], ELabel.EQUAL)
                for col in df2.columns)

            # The key cells of the two joined input rows are equal to each other as well.
            for col in merge_cols:
                added_edges.append(
                    Edge(g_df1.loc[left_label, col], g_df2.loc[right_label, col], ELabel.EQUAL))
                added_edges.append(
                    Edge(g_df2.loc[right_label, col], g_df1.loc[left_label, col], ELabel.EQUAL))

    # Cells of rows that never matched point at the output's deletion node.
    for left_label in unmatched_left:
        added_edges.extend(
            Edge(g_df1.loc[left_label, col], g_res.deletion_node, ELabel.DELETE)
            for col in df1.columns)

    for right_label in unmatched_right:
        added_edges.extend(
            Edge(g_df2.loc[right_label, col], g_res.deletion_node, ELabel.DELETE)
            for col in df2.columns)

    # Tie the input deletion nodes to the output deletion node.
    for g_input in (g_df1, g_df2):
        added_edges.append(Edge(g_input.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Register every collected edge with the graph in a single call.
    graph.add_nodes_and_edges(edges=added_edges)

    # -------------------------------------------------------------------------------------------------------------- #
    #  Add information about arguments
    # -------------------------------------------------------------------------------------------------------------- #
    # This component takes no arguments, so there is nothing to record.

    return result, call_str, graph, g_res