def insert_data(conn: 'psycopg2.extensions.connection', df: pd.DataFrame) -> None:
    '''
    Bulk insert dataframe into advertisementdata table.

    This function was inspired by Naysan Saran's article "Pandas to PostgreSQL
    using Psycopg2: Bulk Insert Performance Benchmark", in which the author
    chose a variety of bulk insert methods and compared their execution time.
    Saving the dataframe to a StringIO object and then copying this to the
    database proved to be the most efficient when dealing with millions of
    records.
    Source: https://naysan.ca/2020/05/09/pandas-to-postgresql-using-psycopg2-bulk-insert-performance-benchmark/
    '''
    set_index(conn, df)

    # Serialise the dataframe to an in-memory CSV and COPY it in one shot.
    buffer = StringIO()
    df.to_csv(buffer, index_label='id', header=False)
    buffer.seek(0)

    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, 'advertisementdata', sep=",")
        conn.commit()
    except Exception as error:
        # psycopg2.DatabaseError subclasses Exception, so one clause suffices.
        logging.error(f"Error inserting data: {error}")
        conn.rollback()
    finally:
        # Close exactly once — the original closed the cursor twice on errors.
        cursor.close()
def create_table_weeks(conn: 'psycopg2.extensions.connection', overwrite: bool = False) -> None:
    '''
    Creates the columns and relationships of the WEEKS table.

    Parameters
    ----------
    conn : psycopg2 connection
        Open database connection (annotation stringified; `psycopg2.connect`
        is a function, the actual type is psycopg2.extensions.connection).
    overwrite : bool
        When True, drops any existing WEEKS table before creating it.
    '''
    table_name = 'WEEKS'
    # Only emit a DROP when the caller explicitly asks to overwrite.
    drop_table_statement = f'DROP TABLE IF EXISTS {table_name};' if overwrite else ''
    create_table_statement = f'''
        {drop_table_statement}
        CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT
        , SEASON_ID SMALLINT
        , WEEK_NUMBER SMALLINT
        , MATCHUP_PERIOD SMALLINT
        , REG_SEASON_FLAG SMALLINT
        , CONSTRAINT WEEKS_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, WEEK_NUMBER)
        );
    '''
    cursor = conn.cursor()
    cursor.execute(create_table_statement)
    conn.commit()
    cursor.close()
def insert_mood(conn: 'psycopg2.extensions.connection', update: 'Update', user_data: Dict[str, Union[str, List[str]]]) -> None:
    """Insert one moodtracker row per selected reason for a Telegram update.

    When the user picked no reasons, a single row with a NULL reason is
    stored. All rows of one submission share the same message_id.

    Parameters
    ----------
    conn : psycopg2 connection
    update : telegram.Update
        Incoming update; only `update.message.chat_id` is read.
    user_data : dict
        Expects keys 'mood' (str), 'note' (str), 'reasons' (list of str).
    """
    sql = """
    INSERT INTO moodtracker (message_id, chat_id, mood, reason, note, date)
    VALUES(%s, %s, %s, %s, %s, %s)
    """
    cur = conn.cursor()
    chat_id = update.message.chat_id
    mood = user_data['mood']
    note = user_data['note']
    reasons = user_data['reasons']
    date = datetime.now()
    # Unique per submission: chat id + microsecond timestamp (strftime
    # already returns str — the original wrapped it in a redundant str()).
    message_id = str(chat_id) + date.strftime("%Y%m%d%H%M%S%f")
    if not reasons:
        # No reasons selected: store a single row with a NULL reason.
        cur.execute(sql, (message_id, chat_id, mood, None, note, date))
    else:
        for reason in reasons:
            cur.execute(sql, (message_id, chat_id, mood, reason, note, date))
    conn.commit()
    cur.close()
def create_table_teams(conn: 'psycopg2.extensions.connection', overwrite: bool = False) -> None:
    '''
    Creates the columns and relationships of the TEAMS table.

    Parameters
    ----------
    conn : psycopg2 connection
    overwrite : bool
        When True, drops any existing TEAMS table before creating it.
    '''
    table_name = 'TEAMS'
    # Only emit a DROP when the caller explicitly asks to overwrite.
    drop_table_statement = f'DROP TABLE IF EXISTS {table_name};' if overwrite else ''
    create_table_statement = f'''
        {drop_table_statement}
        CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT
        , SEASON_ID SMALLINT
        , TEAM_ID SMALLINT
        , MANAGER_ID VARCHAR(50)
        , TEAM_NAME VARCHAR(50)
        , MANAGER_NAME VARCHAR(50)
        , ESPN_NAME VARCHAR(50)
        , CONSTRAINT TEAMS_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, TEAM_ID)
        );
    '''
    cursor = conn.cursor()
    cursor.execute(create_table_statement)
    conn.commit()
    cursor.close()
def upsert_rows(conn: 'psycopg2.extensions.connection', df: pd.DataFrame, table: str, pkeys: list) -> None:
    """
    Bulk upsert of a dataframe into `table`.

    Values are rendered with cursor.mogrify() — which this function's
    docstring always claimed but the code never did. The previous
    ', '.join(map(str, tuples)) rendering broke on strings containing
    quotes and inserted None as the literal string 'None'; mogrify lets
    psycopg2 quote/escape each value correctly.

    Returns None on success, 1 on failure (legacy error signal kept for
    existing callers despite the -> None annotation).
    """
    cursor = conn.cursor()
    # One "(%s, ..., %s)" template per row, expanded safely by psycopg2.
    row_template = '(' + ', '.join(['%s'] * len(df.columns)) + ')'
    tuples = [tuple(x) for x in df.to_numpy()]
    values_str = ', '.join(
        cursor.mogrify(row_template, t).decode() for t in tuples
    )
    cols = ','.join(df.columns)
    insert_statement = f"INSERT INTO {table}({cols}) VALUES {values_str}"
    on_conflict_statement = 'ON CONFLICT (' + ', '.join(map(str, pkeys)) + ')'
    do_update_statement = _create_update_set_statement(list(df.columns))
    query = ' '.join([insert_statement, on_conflict_statement, do_update_statement])
    try:
        cursor.execute(query)
        conn.commit()
    except Exception as error:
        # DatabaseError is an Exception subclass; one clause suffices.
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()
def create_table_scores(conn: 'psycopg2.extensions.connection', overwrite: bool = False) -> None:
    '''
    Creates the columns and relationships of the WEEKLY_SCORES table.

    Parameters
    ----------
    conn : psycopg2 connection
    overwrite : bool
        When True, drops any existing SCORES table before creating it.
    '''
    table_name = 'SCORES'
    # Only emit a DROP when the caller explicitly asks to overwrite.
    drop_table_statement = f'DROP TABLE IF EXISTS {table_name};' if overwrite else ''
    create_table_statement = f'''
        {drop_table_statement}
        CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT
        , SEASON_ID SMALLINT
        , WEEK_NUMBER BIGINT
        , TEAM_ID SMALLINT
        , TEAM_ID_OPP SMALLINT
        , SCORE NUMERIC(5, 2)
        , SCORE_OPP NUMERIC(5, 2)
        , WLT_POINTS NUMERIC(2, 1)
        , WIN_IND SMALLINT
        , LOSS_IND SMALLINT
        , TIE_IND SMALLINT
        , ALL_PLAY_WLT_POINTS NUMERIC(3, 1)
        , ALL_PLAY_WINS SMALLINT
        , ALL_PLAY_LOSSES SMALLINT
        , ALL_PLAY_TIES SMALLINT
        , CUM_SCORE NUMERIC(6, 2)
        , CUM_SCORE_OPP NUMERIC(6, 2)
        , CUM_WLT_POINTS NUMERIC(3, 1)
        , CUM_WINS SMALLINT
        , CUM_LOSSES SMALLINT
        , CUM_TIES SMALLINT
        , CUM_ALL_PLAY_WLT_POINTS NUMERIC(4, 1)
        , CUM_ALL_PLAY_WINS SMALLINT
        , CUM_ALL_PLAY_LOSSES SMALLINT
        , CUM_ALL_PLAY_TIES SMALLINT
        , CUM_SCORE_PER_WEEK NUMERIC(5, 2)
        , CUM_SCORE_OPP_PER_WEEK NUMERIC(5, 2)
        , CUM_ALL_PLAY_WLT_POINTS_PER_WEEK NUMERIC(3, 1)
        , RECORD VARCHAR(10)
        , ALL_PLAY_RECORD VARCHAR(10)
        , STANDINGS SMALLINT
        , HOME_OR_AWAY VARCHAR(10)
        , CONSTRAINT WEEKLY_SCORES_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID, WEEK_NUMBER, TEAM_ID)
        );
    '''
    cursor = conn.cursor()
    cursor.execute(create_table_statement)
    conn.commit()
    cursor.close()
def add_timer_to_db(conn: 'psycopg2.extensions.connection', cursor, user_id: int, chat_id: int, time: datetime):
    """Insert a timer (alert) row into the alerts table.

    The INSERT is parameterized — the original built the SQL with an
    f-string, which is SQL-injection-prone. The debug print was dropped.
    """
    # Shift to the stored timezone (code assumes server local time is
    # UTC+5 — TODO confirm this offset instead of hard-coding it).
    time = time - timedelta(hours=5)
    cursor.execute(
        "INSERT INTO alerts (user_id, chat_id, time) VALUES (%s, %s, %s)",
        (user_id, chat_id, time),
    )
    conn.commit()
def add_chat_to_db(conn: 'psycopg2.extensions.connection', cursor, chat_name: str, chat_id: int):
    """Add a chat to the chats table.

    Returns False when the chat already exists (the original docstring
    promised this but the code always returned None) and True once a new
    chat has been inserted. Queries are parameterized instead of built
    with f-strings, which were SQL-injection-prone via chat_name.
    """
    cursor.execute("SELECT * FROM chats WHERE chat_id = %s", (int(chat_id),))
    if cursor.fetchone():
        return False
    cursor.execute(
        "INSERT INTO chats (chat_name, chat_id) VALUES (%s, %s)",
        (chat_name, int(chat_id)),
    )
    conn.commit()
    return True
def copy_s3_to_staging(cur: psycopg2, conn: psycopg2.connect) -> None:
    """
    Copy the contents of the S3 buckets into the staging tables.

    Each COPY statement from `copy_table_queries` is executed and committed
    individually, so earlier loads survive a later failure.

    :param cur: PostgreSQL cursor
    :param conn: PostgreSQL connection object
    :return: None
    """
    for copy_query in copy_table_queries:
        logger.debug("Copying data to staging table as\n{}".format(copy_query))
        cur.execute(copy_query)
        conn.commit()
def create_staging_tables(cur: psycopg2, conn: psycopg2.connect) -> None:
    """
    Create the temporary staging tables.

    Each DDL statement from `insert_temp_table_queries` is executed and
    committed on its own.

    :param cur: PostgreSQL cursor
    :param conn: PostgreSQL connection object
    :return: None
    """
    for ddl_query in insert_temp_table_queries:
        logger.debug("Creating staging table as \n{}".format(ddl_query))
        cur.execute(ddl_query)
        conn.commit()
def upload_comment(self, comment_data: Comment, conn: psycopg2.connect):
    """Insert a single Comment row into the comment table and commit."""
    fields = ("message", "author_name", "thumbnails", "timestamp_msec",
              "timestamp_text", "purchase_amount", "movie_id")
    # Attribute names mirror the column names, so pull them generically.
    row = tuple(getattr(comment_data, field) for field in fields)
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO comment(message,author_name,thumbnails,timestamp_msec,timestamp_text,purchase_amount,movie_id) VALUES (%s,%s,%s,%s,%s,%s,%s);",
        row)
    conn.commit()
    cur.close()
def execute_query(conn: 'psycopg2.extensions.connection', query: str) -> None:
    """Execute a single SQL statement: commit on success, roll back and log
    on failure.

    Fixes in this revision: the cursor was closed twice on the error path
    (now closed exactly once via ``finally``), and the redundant
    ``(Exception, psycopg2.DatabaseError)`` tuple was collapsed —
    DatabaseError is already an Exception subclass.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except Exception as error:
        logging.error(f"Unable to execute query. Error: {error}")
        conn.rollback()
    finally:
        cursor.close()
def upload_comments(self, comment_datas: List[Comment], conn: psycopg2.connect):
    """Bulk-insert a batch of Comment rows with executemany and commit."""
    fields = ("message", "author_name", "thumbnails", "timestamp_msec",
              "timestamp_text", "purchase_amount", "movie_id")
    # Attribute names mirror the column names, so build each row generically.
    rows = [tuple(getattr(comment, field) for field in fields)
            for comment in comment_datas]
    cur = conn.cursor()
    cur.executemany(
        "INSERT INTO comment(message,author_name,thumbnails,timestamp_msec,timestamp_text,purchase_amount,movie_id) VALUES (%s,%s,%s,%s,%s,%s,%s);",
        rows)
    conn.commit()
    cur.close()
def create_table(conn: psycopg2.connect) -> None:
    """Create the moodtracker table when it does not already exist."""
    ddl = """
    CREATE TABLE IF NOT EXISTS moodtracker (
        message_id varchar(40) NOT NULL,
        chat_id varchar(10) NOT NULL,
        mood varchar(10),
        reason varchar(25),
        note varchar(255),
        date timestamp
    )
    """
    cur = conn.cursor()
    cur.execute(ddl)
    cur.close()
    conn.commit()
def drop_tables(cur: psycopg2.connect, conn: psycopg2.connect) -> None:
    """
    Drops each table using the queries in `drop_table_queries` list.

    Every drop statement is committed individually.

    Parameters
    ----------
    cur : psycopg2.cursor
        Cursor for accessing database with psycopg.
    conn : psycopg2.connect
        Database connection instance.
    """
    for drop_query in drop_table_queries:
        cur.execute(drop_query)
        conn.commit()
def drop_rows(conn: 'psycopg2.extensions.connection', table: str, where_condition: str) -> None:
    '''
    Drops rows from a table based on a set of conditions.

    NOTE: `table` and `where_condition` are interpolated into the SQL text,
    so callers must pass trusted values only (identifiers/conditions cannot
    be bound as query parameters).

    Returns None on success, 1 on failure (legacy error signal kept for
    existing callers despite the -> None annotation).
    '''
    query = f'''
        DELETE FROM {table}
        WHERE {where_condition}
    '''
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except Exception as error:
        # DatabaseError is an Exception subclass; one clause suffices.
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # Close exactly once on both paths.
        cursor.close()
def run_sql_etl(sql: str, conn: pg.connect, table_name: str):
    """Runs given SQL query on the provided PostgreSQL connection obj.

    Args:
        sql (str): SQL script to run
        conn (psycopg2.connect): PostgreSQL database connection
        table_name (str): Table name for logging purposes
    """
    started_at = time()
    log.debug('Running SQL ETL for "%s" table', table_name)
    with conn.cursor() as etl_cursor:
        etl_cursor.execute(sql)
        conn.commit()
    elapsed = round(time() - started_at, 3)
    log.info(
        'SQL ETL for table "%s" completed in: %s seconds',
        table_name,
        elapsed,
    )
def process_data(
    cur: psycopg2.connect,
    conn: psycopg2.connect,
    filepath: str,
    func: Callable,
) -> None:
    """
    Perform data processing for specific raw files.

    Parameters
    ----------
    cur : psycopg2.cursor
        Cursor for accessing database with psycopg.
    conn : psycopg2.connect
        Database connection instance.
    filepath : str
        Path for target file.
    func : Callable
        Function to use to process file.
    """
    # Collect the absolute path of every *.json file under filepath.
    json_files = []
    for root, _dirs, _files in os.walk(filepath):
        matched = glob.glob(os.path.join(root, '*.json'))
        json_files.extend(os.path.abspath(match) for match in matched)

    total = len(json_files)
    print('{} files found in {}'.format(total, filepath))

    # Process each file, committing after every one.
    for index, datafile in enumerate(json_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(index, total))
def bulk_load_df(data: pd.DataFrame, table_name: str, conn: pg.connect):
    """Bulk inserts a pandas dataframe into PostgreSQL table

    Args:
        data (pandas.Dataframe): Data for insertion
        table_name (str): Table name for logging purposes
        conn (psycopg2.connect): PostgreSQL database connection
    """
    # StringIO seeded with the CSV payload starts at position 0, ready to read.
    csv_payload = data.to_csv(index=None, header=None, na_rep='')
    csv_buffer = StringIO(csv_payload)
    with conn.cursor() as copy_cursor:
        copy_cursor.copy_from(
            csv_buffer,
            table_name,
            columns=data.columns,
            sep=',',
            null='',
        )
        conn.commit()
def add_curator_to_chat(conn: 'psycopg2.extensions.connection', cursor, chat_id: int, user_id: int, username: str):
    """Register a curator for a chat unless that user already curates it.

    Queries are parameterized (the original interpolated caller-supplied
    values straight into the SQL text) and the debug prints were dropped.
    """
    cursor.execute("SELECT user_id FROM curators WHERE chat_id = %s", (chat_id,))
    rows = cursor.fetchall()
    # Rows are mapping-like — matches the original's row['user_id'] access;
    # presumably a dict-row cursor_factory, confirm at the call site.
    already_registered = any(row['user_id'] == user_id for row in rows or [])
    if not already_registered:
        cursor.execute(
            "INSERT INTO curators (chat_id, user_id, username) VALUES (%s, %s, %s)",
            (chat_id, user_id, username),
        )
        conn.commit()
def insert_rows(conn: 'psycopg2.extensions.connection', df: pd.DataFrame, table: str) -> None:
    '''
    Inserts the df values into the DB table.

    Values are rendered with cursor.mogrify() so psycopg2 quotes/escapes
    them correctly — the previous ', '.join(map(str, tuples)) rendering
    broke on strings containing quotes and inserted None as the literal
    string 'None'.

    Returns None on success, 1 on failure (legacy error signal kept for
    existing callers despite the -> None annotation).
    '''
    cursor = conn.cursor()
    # One "(%s, ..., %s)" template per row, expanded safely by psycopg2.
    row_template = '(' + ', '.join(['%s'] * len(df.columns)) + ')'
    rows = [tuple(x) for x in df.to_numpy()]
    values_str = ', '.join(cursor.mogrify(row_template, r).decode() for r in rows)
    cols = ','.join(df.columns)
    query = f"INSERT INTO {table}({cols}) VALUES {values_str}"
    try:
        cursor.execute(query)
        conn.commit()
    except Exception as error:
        # DatabaseError is an Exception subclass; one clause suffices.
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()
def create_table_settings(conn: 'psycopg2.extensions.connection', overwrite: bool = False) -> None:
    '''
    Creates the columns and relationships of the SETTINGS table.

    Parameters
    ----------
    conn : psycopg2 connection
    overwrite : bool
        When True, drops any existing SETTINGS table before creating it.
    '''
    table_name = 'SETTINGS'
    # Only emit a DROP when the caller explicitly asks to overwrite.
    drop_table_statement = f'DROP TABLE IF EXISTS {table_name};' if overwrite else ''
    create_table_statement = f'''
        {drop_table_statement}
        CREATE TABLE {table_name} (
        LEAGUE_ID BIGINT
        , SEASON_ID SMALLINT
        , PLAYOFF_SEEDING_RULE VARCHAR(100)
        , PLAYOFF_SEEDING_RULE_BY SMALLINT
        , NUM_PLAYOFF_TEAMS SMALLINT
        , FIRST_SCORING_PERIOD SMALLINT
        , FINAL_SCORING_PERIOD SMALLINT
        , PLAYOFF_WEEK_START SMALLINT
        , SCORING_TYPE VARCHAR(50)
        , REG_SEASON_MATCHUP_TIEBREAKER VARCHAR(50)
        , PLAYOFF_MATCHUP_TIEBREAKER VARCHAR(50)
        , HOME_TEAM_BONUS SMALLINT
        , CONSTRAINT SETTINGS_PKEY PRIMARY KEY(LEAGUE_ID, SEASON_ID)
        );
    '''
    cursor = conn.cursor()
    cursor.execute(create_table_statement)
    conn.commit()
    cursor.close()
def parse_and_insert(cur: psycopg2, conn: psycopg2.connect) -> None:
    """
    Take the data in the staging tables and distribute them into their
    proper Redshift analytic tables.

    Songs and artists are loaded with set-based staging-to-table inserts
    ("fast"); the time/users/songplay tables are filled row-by-row from a
    dataframe built out of the staged events ("slow").

    :param cur: PostgreSQL cursor
    :param conn: PostgreSQL connection object
    :return:
    """
    # Fast path: set-based INSERT ... SELECT statements straight from staging.
    logger.info("Fast inserting records into songs table.")
    cur.execute(song_table_fast_insert)
    logger.info("Fast inserting records into artists table.")
    cur.execute(artist_table_fast_insert)
    conn.commit()

    # Slow path: pull the staged events into memory so they can be
    # transformed per row before insertion.
    cur.execute(events_select)
    data = cur.fetchall()
    # Column order must match the SELECT list of `events_select`.
    columns = [
        'user_id', 'first_name', 'last_name', 'gender', 'level', 'ts',
        'artist_name', 'song_name', 'length', 'location', 'session_id',
        'user_agent'
    ]
    df = pd.DataFrame(data, columns=columns)
    logger.info(
        "Slow inserting records into time, users, and songplay tables.")
    # Helper performs the row-by-row inserts from the dataframe.
    _insert_df_to_time_user_songplay(cur, df)
    conn.commit()
def commit(connection: 'psycopg2.extensions.connection') -> None:
    """Applies changes to the database.

    The annotation was corrected: ``psycopg2.connect`` names the function
    that opens connections, while the object passed here is a
    ``psycopg2.extensions.connection`` (stringified to avoid importing
    psycopg2 at annotation-evaluation time).
    """
    connection.commit()