def drop_table(tbl_name: str, eng: sa.engine.Engine, dispose_eng=False):
    """Drop the table named *tbl_name* from the database, if it exists.

    Args:
        tbl_name: Name of the table to drop. NOTE(review): interpolated
            directly into the SQL — callers must not pass untrusted input.
        eng: Engine connected to the target database.
        dispose_eng: When True, dispose of the engine afterwards
            (even if the drop raised).

    Returns:
        None
    """
    try:
        # Only issue the DROP when the table is actually present.
        exists = eng.has_table(tbl_name)
        if exists:
            with eng.begin() as conn:
                conn.execute(f"DROP TABLE {tbl_name}")
    finally:
        # Cleanup runs regardless of whether the drop succeeded.
        if dispose_eng:
            eng.dispose()
def merge_to_table(eng: sa.engine.Engine, df: pd.DataFrame, table_name: str,
                   compare_columns: List[str], update_columns: List[str],
                   dtypes: dict, temp_table_name: str = None, dispose_eng=False):
    """
    Merges the dataframe into an existing table by creating a temp table
    for the df and then merging it into the existing one.
    For rows with matching $compare_columns we UPDATE the values in $update_columns.

    Args:
        eng: An engine object connecting the db
        df: The dataframe we want to insert to the DB
        table_name: The existing table's name
        compare_columns: The columns we want to compare existing rows with
        update_columns: The columns we want to update in case a matching row is found
        dtypes: The data-types for each column in the DB
        temp_table_name: optional, a name for the temp table for the DB
        dispose_eng: Whether to dispose of the engine after the merge

    Returns:
        None

    Raises:
        Exception: on Oracle, if either table name exceeds MAX_TABLE_NAME_ORACLE.
        RuntimeError: if the dialect supports neither Oracle MERGE nor
            postgres ON CONFLICT and the target table already exists.
    """
    # BUGFIX: the engine-dispose cleanup used to live only in the small-df
    # branch, so dispose_eng was ignored on the empty-df early return and on
    # the chunked path. The outer try/finally guarantees it in every path.
    try:
        table_name = table_name.lower()  # fixes stuff for postgres (unquoted names fold to lowercase)
        if df.empty:
            return
        if dtypes is None:
            dtypes = {}
        if temp_table_name is None:
            temp_table_name = get_temp_table_name()
        if eng.dialect.name.lower() == "oracle" and (len(temp_table_name) > MAX_TABLE_NAME_ORACLE or
                                                     len(table_name) > MAX_TABLE_NAME_ORACLE):
            raise Exception('table name is too long')

        if len(df) > 200_000:
            # Large frames are merged recursively in fixed-size chunks so each
            # temp table stays small; each recursive call generates its own
            # temp table name.
            chunk_size = 100_000
            for i in tqdm(range(0, len(df), chunk_size),
                          desc=f"Merging into {table_name}", unit="100_000 chunk"):
                df_chunk = df.iloc[i:min(len(df), i + chunk_size)]
                merge_to_table(eng, df_chunk, table_name, compare_columns,
                               update_columns, dtypes=dtypes)
            return

        try:
            if not eng.has_table(table_name):
                # No target table yet: a plain insert creates it, no merge needed.
                insert_into_table(eng, df, table_name, dtypes, compare_columns)
            else:
                if eng.dialect.name.lower() not in ("oracle", "postgresql"):
                    raise RuntimeError(
                        f"merge into does not work for {eng.dialect.name}")
                # Stage the new rows in a temp table, then merge it in.
                insert_into_table(eng, df, temp_table_name, dtypes, compare_columns)
                if eng.dialect.name.lower() == "oracle":
                    on_statement = "\nAND ".join(
                        [f"curr.{col} = tmp.{col}" for col in compare_columns])
                    set_statement = "\n,".join(
                        [f"curr.{col} = tmp.{col}" for col in update_columns])
                    all_columns = compare_columns + update_columns
                    all_columns_names = ",".join(all_columns)
                    all_columns_values = ",".join(
                        [f"tmp.{col}" for col in all_columns])
                    sql = f"""
                        merge into {table_name} curr
                        using (select {all_columns_names} from {temp_table_name}) tmp
                        on ({on_statement})
                        when matched then update set {set_statement}
                        when not matched then insert ({all_columns_names}) values ({all_columns_values})
                        """
                else:  # postgresql
                    set_statement = ",".join([
                        f"{col} = EXCLUDED.{col}" for col in update_columns
                    ])  # postgres syntax
                    all_columns = compare_columns + update_columns
                    all_columns_names = ",".join(all_columns)
                    from coord2vec.common.db.postgres import get_index_str_for_unique
                    on_statement = get_index_str_for_unique(compare_columns, dtypes)
                    sql = f"""
                        INSERT INTO {table_name} ({all_columns_names})
                        SELECT {all_columns_names}
                        FROM {temp_table_name} tmp
                        ON CONFLICT ({on_statement}) DO UPDATE
                        SET {set_statement};
                        """
                # can fail if no key is saved on the on_statement columns
                with eng.begin() as con:
                    con.execute(sql)
                    con.execute(f"drop table {temp_table_name}")
        finally:
            # Best-effort cleanup: the temp table may survive if the merge
            # raised before the in-transaction drop ran.
            if eng.has_table(temp_table_name):
                with eng.begin() as con:
                    con.execute(f"drop table {temp_table_name}")
    finally:
        if dispose_eng:
            eng.dispose()