예제 #1
0
def datagen_inner_join():
    """Generate a random pair of dataframes whose inner join is non-empty.

    Each attempt:
      1. Builds a random ``df1`` and picks a random non-empty subset of its
         columns as the join columns.
      2. Builds a random ``df2`` that is at least as wide as the number of
         join columns and renames that many of its columns to the join
         column names, so both frames share exactly those names.
      3. Overwrites the join-column cells of both frames with key tuples
         sampled from the combined pool of both frames' original keys, so
         that matching keys are likely.
      4. Runs ``RInterpreter.inner_join`` and retries if it fails or yields
         a result with zero rows or zero columns.

    Returns:
        A 2-tuple ``([df1, df2], {})``: the two generated frames and an
        empty dict (presumably positional / keyword arguments for the
        component -- confirm against the caller).
    """
    while True:
        df1 = generate_random_dataframe(
            DfConfig(min_width=MIN_COLS,
                     max_width=3,
                     max_index_levels=1,
                     max_column_levels=1))
        on_columns = random.sample(list(df1.columns),
                                   random.randint(1, df1.shape[1]))
        df2_width = random.randint(len(on_columns), MAX_COLS)
        df2 = generate_random_dataframe(
            DfConfig(num_cols=df2_width,
                     max_index_levels=1,
                     max_column_levels=1,
                     col_prefix="DF2"))

        #  Make df2 share the join column names with df1.
        replaced_cols = random.sample(list(df2.columns), len(on_columns))
        df2 = df2.rename(columns=dict(zip(replaced_cols, on_columns)))

        df1_items = [tuple(i) for i in df1.loc[:, on_columns].values]
        df2_items = [tuple(i) for i in df2.loc[:, on_columns].values]

        #  Both frames draw their new keys from the same pool. The pool has
        #  len(df1) + len(df2) entries, so each sample size is always valid.
        key_pool = df1_items + df2_items
        new_df1_items = random.sample(key_pool, df1.shape[0])
        new_df2_items = random.sample(key_pool, df2.shape[0])

        #  NOTE(review): rows are addressed by label 0..n-1, which assumes the
        #  generated frames carry a default integer index -- confirm.
        for idx, items in enumerate(new_df1_items):
            df1.loc[idx, on_columns] = items

        for idx, items in enumerate(new_df2_items):
            df2.loc[idx, on_columns] = items

        try:
            res = RInterpreter.inner_join(df1, df2)
            if res.shape[0] == 0 or res.shape[1] == 0:
                continue

        except Exception:
            #  Best-effort retry on any ordinary failure. Deliberately not a
            #  bare ``except`` so that KeyboardInterrupt / SystemExit can
            #  still break out of this otherwise endless loop.
            continue

        return [df1, df2], {}
예제 #2
0
def _group_index_by_key(df, key_cols):
    """Map each tuple of values in ``key_cols`` to the list of row labels of ``df`` holding it."""
    groups = collections.defaultdict(list)
    for idx, values in zip(df.index, df.loc[:, key_cols].values):
        groups[tuple(values)].append(idx)
    return groups


def gen_inner_join(df1: pd.DataFrame,
                   df2: pd.DataFrame,
                   g_df1: DataFrameGraph,
                   g_df2: DataFrameGraph,
                   datagen: bool = False):
    """
    INNER_JOIN
    ------
    Example:
      inner_join(df1, df2)

        c1 c2 c3      c4 c2 c5          c1 c2 c3 c4 c5
      0  a  b  c    0  x  g  z        0  a  b  c  w  u
      1  d  g  c    1  w  b  u   ->   1  f  b  h  w  u
      2  f  b  h ,  2  y  g  j        2  d  g  c  x  z
                                      3  d  g  c  y  j

    ---------------
    Graph Abstraction:
    - EQUAL edges from all the columns in df1 and df2 to the corresponding column in the output.
    - EQUAL edges from all the rows in df1 and df2 to the corresponding row in the output, if included.
    - EQUAL edge between the input deletion nodes and the output deletion node.
    - DELETE edges from all the non-included cells in both df1 and df2 to the deletion node of the output.

    Additionally, for every joined pair of rows, EQUAL edges are added in both
    directions between the df1 and df2 cells of each shared (merge) column.

    ---------------
    Args:
        df1: Left input dataframe.
        df2: Right input dataframe.
        g_df1: Graph built over ``df1``.
        g_df2: Graph built over ``df2``.
        datagen: Not used by this function.

    Returns:
        A 4-tuple ``(result, call_str, graph, g_res)``: the joined dataframe,
        the call template string, the assembled graph containing all added
        edges, and the graph built over the result.
    """

    result = RInterpreter.inner_join(df1, df2)
    #  Doubled braces evaluate to the literal text "{inp1}" / "{inp2}"
    #  (presumably filled in later by the caller).
    call_str = f"inner_join({{inp1}}, {{inp2}})"

    #  --------------------------------------------------------------------------------------------------------------  #
    #  Graph Construction
    #  --------------------------------------------------------------------------------------------------------------  #

    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df1, g_df2, g_res])
    added_edges: List[Edge] = []

    #  Maps from column label to column node for df1, df2 and the result.
    col_map_df1 = {c.value: c for c in g_df1.columns}
    col_map_df2 = {c.value: c for c in g_df2.columns}
    col_map_res = {c.value: c for c in g_res.columns}

    #  - EQUAL edges from all the columns in df1 and df2 to the corresponding column in the output.
    for c, node in itertools.chain(col_map_df1.items(), col_map_df2.items()):
        added_edges.append(Edge(node, col_map_res[c], ELabel.EQUAL))

    #  Get the merge cols: the labels present in both inputs. They are taken in
    #  df1 column order rather than from a set intersection, so the order of
    #  the emitted edges is reproducible across runs (set iteration order of
    #  strings depends on hash randomization).
    merge_cols = [c for c in col_map_df1 if c in col_map_df2]

    #  Get the indices for each value tuple for df1 and df2 and result
    value_dict_df1 = _group_index_by_key(df1, merge_cols)
    value_dict_df2 = _group_index_by_key(df2, merge_cols)
    value_dict_res = _group_index_by_key(result, merge_cols)

    #  - EQUAL edges from all the rows in df1 and df2 to the corresponding row in the output, if included.
    #  - DELETE edges from all the non-included cells in both df1 and df2 to the deletion node of the output.
    #  Start by assuming every input row is dropped; rows whose key appears in
    #  the result are removed from these sets below.
    deleted_df1 = set(df1.index)
    deleted_df2 = set(df2.index)

    for value, res_indices in value_dict_res.items():
        df1_indices = value_dict_df1[value]
        df2_indices = value_dict_df2[value]
        deleted_df1.difference_update(df1_indices)
        deleted_df2.difference_update(df2_indices)

        #  NOTE(review): pairing result rows with product(df1, df2) assumes
        #  that, for a given key, the join emits rows in df1-major order --
        #  confirm against RInterpreter.inner_join.
        row_pairs = itertools.product(df1_indices, df2_indices)
        for idx_res, (idx1, idx2) in zip(res_indices, row_pairs):
            for c in df1.columns:
                added_edges.append(
                    Edge(g_df1.loc[idx1, c], g_res.loc[idx_res, c],
                         ELabel.EQUAL))
            for c in df2.columns:
                added_edges.append(
                    Edge(g_df2.loc[idx2, c], g_res.loc[idx_res, c],
                         ELabel.EQUAL))

            #  The matched key cells of the two inputs are equal to each
            #  other; record that in both directions.
            for c in merge_cols:
                added_edges.append(
                    Edge(g_df1.loc[idx1, c], g_df2.loc[idx2, c], ELabel.EQUAL))
                added_edges.append(
                    Edge(g_df2.loc[idx2, c], g_df1.loc[idx1, c], ELabel.EQUAL))

    for idx1 in deleted_df1:
        for c in df1.columns:
            added_edges.append(
                Edge(g_df1.loc[idx1, c], g_res.deletion_node, ELabel.DELETE))

    for idx2 in deleted_df2:
        for c in df2.columns:
            added_edges.append(
                Edge(g_df2.loc[idx2, c], g_res.deletion_node, ELabel.DELETE))

    #  - EQUAL edge between the input deletion nodes and the output deletion node.
    added_edges.append(
        Edge(g_df1.deletion_node, g_res.deletion_node, ELabel.EQUAL))
    added_edges.append(
        Edge(g_df2.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    #  Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    #  --------------------------------------------------------------------------------------------------------------  #
    #  Add information about arguments
    #  --------------------------------------------------------------------------------------------------------------  #

    #  No arguments for this component.
    return result, call_str, graph, g_res