def datagen_filter():
    """Generate a random dataframe plus replay kwargs for a filter task.

    Two modes, chosen at random:
      * "equality-inequality": any column, ``==`` / ``!=`` against a value
        that actually occurs in that column.
      * "relop": a numeric column, ``<`` / ``>`` against an occurring value.

    Returns:
        ([df], kwargs) — kwargs values are single-element lists naming the
        chosen mode, column, value, and operator.
    """
    mode = random.choice(["equality-inequality", "relop"])
    if mode == "equality-inequality":
        while True:
            try:
                df = generate_random_dataframe(
                    DfConfig(min_width=2,
                             max_width=MAX_COLS,
                             min_height=MIN_ROWS,
                             max_height=MAX_ROWS,
                             max_index_levels=1,
                             max_column_levels=1))
                col = random.choice(list(df.columns))
                value = random.choice(list(set(df.loc[:, col])))
                op = random.choice(["==", "!="])
                return [df], {
                    "filter_mode": [mode],
                    "filter_column_eq": [col],
                    "filter_value_eq": [value],
                    "filter_eq_op": [op]
                }
            except Exception:
                # Retry with a fresh dataframe (e.g. set() fails on
                # unhashable column values). Was a bare except; narrowed so
                # KeyboardInterrupt/SystemExit still propagate.
                pass
    else:
        while True:
            try:
                df = generate_random_dataframe(
                    DfConfig(min_width=2,
                             max_width=MAX_COLS,
                             min_height=MIN_ROWS,
                             max_height=MAX_ROWS,
                             max_index_levels=1,
                             max_column_levels=1))
                numeric_cols = df.select_dtypes('number').columns
                if len(numeric_cols) < 1:
                    # Relational ops need at least one numeric column.
                    continue
                col = random.choice(list(numeric_cols))
                value = random.choice(list(set(df.loc[:, col])))
                op = random.choice(["<", ">"])
                return [df], {
                    "filter_mode": [mode],
                    "filter_column_relop": [col],
                    "filter_value_relop": [value],
                    "filter_relop": [op]
                }
            except Exception:
                pass
def datagen_fillna(seed: int):
    """Generate a dataframe (with NaNs) plus kwargs for a fillna task.

    The seed deterministically selects the fillna mode ("method" vs "value")
    and, for method mode, the (axis, method) pair.

    Args:
        seed: Integer used to index into the mode/axis/method combinations.

    Returns:
        ([df], args, kwargs) — value mode yields ``args=[0]``; method mode
        yields the method/axis in kwargs instead.
    """
    mode = ["method", "value"][seed % 2]
    axis, method = list(
        itertools.product(["index", "columns"],
                          ["backfill", "pad"]))[(seed // 2) % 4]
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1,
                         nan_prob=0.5))
            if mode == "value":
                # Result intentionally discarded: this is a smoke check that
                # fillna(0) works on this df before returning it — presumably
                # to retry on incompatible dtypes. TODO confirm intent.
                df.fillna(0)
                return [df], [0], {"fillna_mode": ["value"]}
            else:
                return [df], [], {
                    "fillna_method": [method],
                    "fillna_axis": [axis],
                }
        except Exception:
            # Was a bare except; narrowed so interrupts still propagate.
            pass
def datagen_combine_first(seed: int):
    """Generate two overlapping (transposed) dataframes for combine_first.

    Builds df1 and df2 sharing a random subset of column labels, shuffles the
    shared-column row tuples between them so they overlap, transposes both,
    and keeps the pair only if ``df1.combine_first(df2)`` yields a non-empty
    result.

    Args:
        seed: Unused; kept for signature consistency with sibling datagens.

    Returns:
        ([df1, df2], [], {}) with both frames transposed.
    """
    while True:
        df1 = generate_random_dataframe(
            DfConfig(min_width=MIN_COLS,
                     max_width=MAX_COLS,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     nan_prob=0.5,
                     max_column_levels=1))
        on_columns = random.sample(list(df1.columns),
                                   random.randint(1, df1.shape[1]))
        df2_width = random.randint(len(on_columns), MAX_COLS)
        df2 = generate_random_dataframe(
            DfConfig(num_cols=df2_width,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     max_column_levels=1,
                     nan_prob=0.5,
                     col_prefix="DF2"))
        # Give df2 the same labels as df1's chosen columns so the frames share keys.
        replaced_cols = random.sample(list(df2.columns), len(on_columns))
        df2 = df2.rename(columns=dict(zip(replaced_cols, on_columns)))
        # Mix the shared-column row tuples across both frames to force overlap.
        df1_items = [tuple(i) for i in df1.loc[:, on_columns].values]
        df2_items = [tuple(i) for i in df2.loc[:, on_columns].values]
        new_df1_items = random.sample(df1_items + df2_items, df1.shape[0])
        new_df2_items = random.sample(df1_items + df2_items, df2.shape[0])
        for idx, items in enumerate(new_df1_items):
            df1.loc[idx, on_columns] = items
        for idx, items in enumerate(new_df2_items):
            df2.loc[idx, on_columns] = items
        try:
            df1 = df1.T
            df2 = df2.T
            res = df1.combine_first(df2)
            if res.shape[0] == 0 or res.shape[1] == 0:
                continue
        except Exception:
            # Was a bare except; narrowed. Retry on any combine failure
            # (e.g. unalignable indexes after transpose).
            continue
        return [df1, df2], [], {}
def datagen_default():
    """Produce a single random flat-indexed dataframe and empty kwargs."""
    config = DfConfig(min_width=MIN_COLS,
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1)
    frame = generate_random_dataframe(config)
    return [frame], {}
def datagen_separate():
    """Generate a dataframe with an '@'-joined column for a separate task.

    Picks 2 or 3 columns, fuses their stringified values into a new
    "NEW-VALS" column joined by '@', and drops the originals.

    NOTE(review): a second ``datagen_separate(seed)`` is defined later in
    this file and shadows this one at import time — confirm which definition
    is intended to win.

    Returns:
        ([df], kwargs) with the fused column name under "separate_split_col".
    """
    df = generate_random_dataframe(
        DfConfig(min_width=3, max_index_levels=1, max_column_levels=1))
    unite_cols = random.sample(list(df.columns), random.choice([2, 3]))
    df["NEW-VALS"] = [
        "@".join(map(str, vals))
        for vals in zip(*[df[c] for c in unite_cols])
    ]
    df.drop(columns=unite_cols, inplace=True)
    # Removed dead code: a shuffled copy of the column list was built here
    # but never used.
    return [df], {"separate_split_col": ["NEW-VALS"]}
def datagen_dropna(seed: int):
    """Produce a NaN-heavy dataframe (nan_prob=0.5) for a dropna task.

    Args:
        seed: Unused; kept for signature consistency with sibling datagens.

    Returns:
        ([df], [], {}).
    """
    config = DfConfig(min_width=MIN_COLS,
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1,
                      nan_prob=0.5)
    frame = generate_random_dataframe(config)
    return [frame], [], {}
def datagen_mutate():
    """Generate a dataframe plus kwargs for a mutate task.

    Randomly picks "normalize" (needs one numeric column) or "div" (needs
    two numeric columns), regenerating dataframes until the requirement is
    met.

    Returns:
        ([df], kwargs) describing the operation and its column argument(s).
    """
    operation = random.choice(["normalize", "div"])
    config = DfConfig(min_width=2,
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1)
    if operation == 'normalize':
        while True:
            frame = generate_random_dataframe(config)
            numeric_cols = frame.select_dtypes('number').columns
            if len(numeric_cols) == 0:
                continue  # need at least one numeric column
            kwargs = {
                "mutate_operation": ["normalize"],
                "mutate_col_args_normalize": [random.choice(numeric_cols)]
            }
            return [frame], kwargs
    while True:
        frame = generate_random_dataframe(config)
        numeric_cols = frame.select_dtypes('number').columns
        if len(numeric_cols) < 2:
            continue  # div needs two distinct numeric columns
        first, second = random.sample(list(numeric_cols), 2)
        kwargs = {
            "mutate_operation": ["div"],
            "mutate_col_arg1": [first],
            "mutate_col_arg2": [second],
        }
        return [frame], kwargs
def datagen_separate(seed: int):
    """Generate a dataframe with a '_'-joined column for a separate task.

    Picks 2..N columns, concatenates them into "MY_NEW_COL" with
    ``str.cat(sep='_')`` (retrying on non-string dtypes), and drops the
    originals.

    NOTE(review): this shadows the earlier zero-arg ``datagen_separate``
    definition in this file, and the kwargs key here ("separate_col") differs
    from the earlier one ("separate_split_col") — confirm which is intended.

    Args:
        seed: Unused; kept for signature consistency with sibling datagens.

    Returns:
        ([df], [], kwargs) with the fused column name under "separate_col".
    """
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=2,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))
            unite_cols = random.sample(list(df.columns),
                                       random.randint(2, df.shape[1]))
            df = df.drop(columns=unite_cols).assign(
                MY_NEW_COL=df[unite_cols[0]].str.cat(df[unite_cols[1:]],
                                                     sep='_'))
            return [df], [], {"separate_col": ["MY_NEW_COL"]}
        except Exception:
            # Retry with a fresh dataframe (str.cat fails on non-string
            # columns). Was a bare except; narrowed so interrupts propagate.
            pass
def datagen_filtering_expr(seed: int):
    """Generate a dataframe plus a query expression for a filtering task.

    Builds an expression ``\`col\` OP value`` from a random column and one of
    its existing values, with the operator chosen deterministically from the
    seed. The expression is validated with ``df.query`` before returning;
    invalid combinations trigger a retry with a fresh dataframe.

    Args:
        seed: Selects the comparison operator (seed % 4).

    Returns:
        ([df], [expr], kwargs) with the expression string repeated in kwargs.
    """
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))
            column = random.choice(list(df.columns))
            value = random.choice(list(df[column]))
            op = [">", "<", "==", "!="][seed % 4]
            expr = f"`{column}` {op} {value!r}"
            # Validate the expression; raises for incomparable dtypes etc.
            df.query(expr)
            return [df], [expr], {"filtering_expr_expression": [expr]}
        except Exception:
            # Was a bare except; narrowed so interrupts still propagate.
            pass
def datagen_groupby_transform(seed: int):
    """Generate a dataframe plus kwargs for a groupby-transform task.

    The seed deterministically selects the transform op and whether to group
    by one or two columns. Dataframes are regenerated until some combination
    of group columns produces at least one duplicate group key (i.e. fewer
    groups than rows).

    Args:
        seed: Indexes the op and the group-column count.

    Returns:
        ([df], [], kwargs) naming the group columns and the op.
    """
    n_ops = len(candidates_groupby_transform_op)
    op = candidates_groupby_transform_op[seed % n_ops]
    num_group_cols = (seed // n_ops) % 2 + 1
    config = DfConfig(min_width=max(MIN_COLS, num_group_cols + 1),
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1)
    while True:
        frame = generate_random_dataframe(config)
        # Accept the first column combination whose grouping collapses at
        # least two rows into one group; otherwise regenerate the frame.
        candidate_combos = itertools.combinations(list(frame.columns),
                                                  num_group_cols)
        for combo in candidate_combos:
            if len(frame.groupby(list(combo)).groups) < frame.shape[0]:
                kwargs = {
                    'groupby_transform_by_cols': [list(combo)],
                    'groupby_transform_op': [op]
                }
                return [frame], [], kwargs
def datagen_filtering_contains(seed: int):
    """Generate a dataframe plus a value set for a membership-filter task.

    Picks a random column and samples a set of its values to filter on,
    retrying with a fresh dataframe on failure.

    Args:
        seed: Unused; kept for signature consistency with sibling datagens.

    Returns:
        ([df], [values], kwargs) with the column and the value set.
    """
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))
            column = random.choice(list(df.columns))
            # NOTE(review): the sample size is bounded by the column COUNT
            # (df.shape[1] - 1) while the population is the column's rows
            # (df.shape[0]); when the bound exceeds the row count, sample()
            # raises and the loop retries. Possibly meant df.shape[0] — TODO
            # confirm before changing, as it alters the value distribution.
            values = set(
                random.sample(list(df[column]),
                              random.randint(1, df.shape[1] - 1)))
            return [df], [values], {
                "filtering_contains_filter_col": [column],
                "filtering_contains_collection": [values],
            }
        except Exception:
            # Was a bare except; narrowed so interrupts still propagate.
            pass