def __stock_tags_bulk_insert(df: pd.DataFrame, column_name: str,
                             conn: psycopg2.extensions.connection,
                             cursor: psycopg2.extensions.cursor) -> None:
    """
    Inserts one stock_tag row per tagged stock in df, recording the
    relationship between stock and tag in the DB.

    :param df: df with three columns identifying the stock (stock_name,
        exchange, ticker) and a column of tag.
    :param column_name: str. The name of the tag column.
    :param conn: psycopg2.extensions.connection. Connection to DB.
    :param cursor: cursor of DB.
    :return: None.
    """
    for index, series in df.iterrows():
        if not pd.isnull(series[column_name]):
            # First, search for the stock id of the row. Parameterized
            # queries avoid the SQL injection risk of str.format().
            stock_id_query = ("SELECT id FROM stock "
                              "WHERE stock_name = %s AND exchange = %s AND ticker = %s")
            cursor.execute(stock_id_query,
                           (series['stock_name'], series['exchange'], series['ticker']))
            stock_id = cursor.fetchone()[0]

            # Next, search for the tag id of the row.
            tag_id_query = "SELECT id FROM tag WHERE tag_name = %s AND category = %s"
            cursor.execute(tag_id_query, (series[column_name], column_name))
            tag_id = cursor.fetchone()[0]

            # Insert the stock_tag relation using stock_id and tag_id.
            query = ("INSERT INTO stock_tag (stock_id, tag_id) "
                     "VALUES (%s, %s) ON CONFLICT DO NOTHING")
            cursor.execute(query, (stock_id, tag_id))
            conn.commit()
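# Usage sketch (hypothetical): the DataFrame contents and DSN below are
# assumptions for illustration; only __stock_tags_bulk_insert comes from the
# code above. The double-underscore prefix marks the function module-private
# by convention, so real callers live in the same module.
def _demo_stock_tags_bulk_insert() -> None:
    df = pd.DataFrame({
        'stock_name': ['Acme Corp'],
        'exchange': ['NYSE'],
        'ticker': ['ACME'],
        'sector': ['industrials'],  # assumed tag column
    })
    conn = psycopg2.connect("dbname=demo")  # assumed connection string
    with conn.cursor() as cursor:
        __stock_tags_bulk_insert(df, 'sector', conn, cursor)
    conn.close()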
def iine_country(cur: psycopg2.extensions.cursor, country: str) -> int:
    # Insert-if-not-exists: look up the country id, creating the row when absent.
    cur.execute("select id from countries where name=%s", (country, ))
    country_id = cur.fetchone()
    if country_id is None:
        cur.execute("insert into countries (name) values (%s) returning id", (country, ))
        country_id = cur.fetchone()
        cur.connection.commit()
    # fetchone() returns a tuple, so unpack the single id column.
    return country_id[0]
def iine_person_type(cur: psycopg2.extensions.cursor, actor_type: str) -> int:
    # Insert-if-not-exists: look up the person-type id, creating the row when absent.
    cur.execute("select id from person_types where type=%s", (actor_type, ))
    type_id = cur.fetchone()
    if type_id is None:
        cur.execute("insert into person_types (type) values (%s) returning id", (actor_type, ))
        type_id = cur.fetchone()
        cur.connection.commit()
    # fetchone() returns a tuple, so unpack the single id column.
    return type_id[0]
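# Usage sketch (hypothetical): the DSN and values below are assumptions for
# illustration; only iine_country and iine_person_type come from the code
# above. "iine" reads as insert-if-not-exists: a repeated lookup returns the
# id created by the first call.
def _demo_iine() -> None:
    conn = psycopg2.connect("dbname=demo")  # assumed connection string
    with conn.cursor() as cur:
        first_id = iine_country(cur, "Norway")   # inserts the row
        second_id = iine_country(cur, "Norway")  # finds the existing row
        assert first_id == second_id
        iine_person_type(cur, "director")
    conn.close()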
def ExportSession(self, cursor: psycopg2.extensions.cursor):
    """Export the session data to timescale, caching the returned session id."""
    if not self.session_id:
        args = (self.session_time.ToJsonString(), self.track, self.live_data)
        cursor.execute(SESSION_INSERT, args)
        self.session_id = cursor.fetchone()[0]
        self.timescale_conn.commit()
def check_and_mark_data_quality(cursor: psycopg2.extensions.cursor,
                                log: logger.Log) -> None:
    """
    Using `cursor`, mark rows in the stage.game_data table that satisfy data
    quality rules. Log to `log` a warning if any rows fail data quality.
    """
    # A raw string keeps the \D regex character class intact; in a normal
    # string literal, Python treats \D as an invalid escape sequence.
    quality_check_sql = r'''
        UPDATE stage.game_data
        SET passed_data_quality_check = True
        WHERE game_id NOT IN (
            SELECT distinct(game_id)
            FROM stage.game_data
            WHERE move_number ~ '\D'                 -- Must be an integer
               OR "column" ~ '\D'                    -- Must be an integer
               OR "column"::int > 4                  -- The grid is 4x4
               OR result NOT IN ('', 'win', 'draw')  -- Valid values for result
        )
        AND game_id NOT IN (
            SELECT game_id
            FROM stage.game_data
            GROUP BY game_id
            HAVING COUNT(DISTINCT(player_id)) <> 2   -- A game should have 2 players
        );
    '''
    cursor.execute(quality_check_sql)
    cursor.execute("SELECT COUNT(*) FROM stage.game_data WHERE "
                   "passed_data_quality_check = false;")
    number_bad = cursor.fetchone()[0]
    if number_bad > 0:
        log.write_warning(
            f'Rejected {number_bad} game records due to data quality.')
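# Usage sketch (hypothetical): a minimal stand-in for logger.Log so the check
# can be exercised in isolation. Only the write_warning method is known from
# the code above; everything else here is an assumption for illustration.
class _StubLog:
    def write_warning(self, message: str) -> None:
        print(f"WARNING: {message}")

def _demo_quality_check() -> None:
    conn = psycopg2.connect("dbname=demo")  # assumed connection string
    with conn.cursor() as cursor:
        check_and_mark_data_quality(cursor, _StubLog())
    conn.commit()
    conn.close()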
def _get_business_id(cur: psycopg2.extensions.cursor, business_identifier: str) -> str:
    """Return the business id for the given identifier."""
    # A parameterized query avoids the SQL injection risk of an f-string.
    cur.execute("select id from businesses where identifier=%s", (business_identifier, ))
    business_id = cur.fetchone()
    if not business_id:
        return ''
    return str(business_id[0])
def select_DiscordUser_from_database(member_id: int, cursor: psycopg2.extensions.cursor):
    try:
        cursor.execute(sql_select_DiscordUser, (member_id, ))
        return cursor.fetchone()
    except psycopg2.Error as error:
        # psycopg2.Error is the base class for all psycopg2 database errors.
        print(f"Error while selecting from DiscordUser on database: {error}")
def process_log_file(cur: psycopg2.extensions.cursor, filepath: str):
    """
    Given a connection (cursor) to a PostgreSQL database and a path to a
    JSON (log) file, load the file, filter its content, derive datetime
    attributes from the timestamp column and insert valid subsets of its
    data into the time, user and songplay tables.

    :param cur: Cursor
    :param filepath: Path to JSON file
    """
    # open log file
    df = pd.read_json(filepath, orient="records", lines=True)

    # filter by NextSong action
    df = df.loc[(df["page"] == "NextSong"), :]

    # convert timestamp column to datetime
    df["ts"] = df["ts"].astype("datetime64[ms]")

    # insert time data records
    time_dict = {
        "timestamp": df.ts,
        "hour": df.ts.dt.hour,
        "day": df.ts.dt.day,
        # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
        # is the supported replacement.
        "week_of_year": df.ts.dt.isocalendar().week,
        "month": df.ts.dt.month,
        "year": df.ts.dt.year,
        "weekday": df.ts.dt.weekday
    }
    time_df = pd.DataFrame.from_dict(time_dict)

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[["userId", "firstName", "lastName", "gender", "level"]]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (row.ts, row.userId, row.level, songid, artistid,
                         row.sessionId, row.location, row.userAgent)
        cur.execute(songplay_table_insert, songplay_data)
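# Usage sketch (hypothetical): the DSN and file path are assumptions;
# time_table_insert, user_table_insert, song_select and songplay_table_insert
# are assumed to be SQL constants defined elsewhere in the module.
def _demo_process_log_file() -> None:
    conn = psycopg2.connect("dbname=sparkifydb")  # assumed connection string
    cur = conn.cursor()
    process_log_file(cur, "data/log_data/sample-events.json")  # assumed path
    conn.commit()
    conn.close()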
def verify_tables(cur: psycopg2.extensions.cursor) -> bool:
    """
    Check if the products table already exists.

    :return: boolean, True if the products table exists
    """
    cur.execute(
        "select exists(select * from information_schema.tables where table_name=%s)",
        (DB_TABLE, ),
    )
    return cur.fetchone()[0]
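# Usage sketch (hypothetical): DB_TABLE is assumed to be a module-level
# constant naming the products table; the DSN is likewise an assumption.
DB_TABLE = "products"

def _demo_verify_tables() -> None:
    conn = psycopg2.connect("dbname=demo")  # assumed connection string
    with conn.cursor() as cur:
        if not verify_tables(cur):
            print(f"table {DB_TABLE!r} does not exist yet")
    conn.close()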
def source_at_offset(
    cursor: psycopg2.extensions.cursor, source_info: SourceInfo
) -> typing.Union[None, int]:
    """Return the mz timestamp from a source if it has reached the desired offset."""
    query = (
        'SELECT timestamp FROM mz_source_info WHERE source_name = %s and "offset" = %s'
    )
    try:
        cursor.execute(query, (source_info.topic_name, source_info.offset))  # type: ignore
        if cursor.rowcount > 1:
            print("ERROR: More than one row returned when querying source offsets:")
            for row in cursor:
                print(f"\t{row}")
            sys.exit(1)

        if not cursor.rowcount:
            return None

        return int(cursor.fetchone()[0])
    except psycopg2.errors.InternalError_:  # type: ignore
        # The view is not yet ready to be queried
        return None
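# Usage sketch (hypothetical): poll until the source reaches the desired
# offset. The wrapper and its poll interval are assumptions; only
# source_at_offset comes from the code above.
import time

def _wait_for_offset(cursor: psycopg2.extensions.cursor,
                     source_info: SourceInfo) -> int:
    while True:
        ts = source_at_offset(cursor, source_info)
        if ts is not None:
            return ts
        time.sleep(0.5)  # assumed poll interval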
def check_and_mark_data_quality(cursor: psycopg2.extensions.cursor,
                                log: logger.Log) -> None:
    """
    Using `cursor`, mark rows in the stage.player_info table that satisfy data
    quality rules. Log to `log` a warning if any rows fail data quality.
    """
    quality_check_sql = """
        UPDATE stage.player_info
        SET passed_data_quality_check = true
        WHERE details ? 'data';
    """
    cursor.execute(quality_check_sql)
    cursor.execute("SELECT COUNT(*) FROM stage.player_info WHERE "
                   "passed_data_quality_check = false;")
    number_bad = cursor.fetchone()[0]
    if number_bad > 0:
        log.write_warning(
            f'Rejected {number_bad} player records due to data quality.')
def get_song_artist(cur: psycopg2.extensions.cursor, song: str, artist: str,
                    length: float) -> Tuple[Optional[str], Optional[str]]:
    """Fetches the Song ID and Artist ID associated with the provided song
    (name), artist (name) and song length.

    Args:
        cur (psycopg2.extensions.cursor): The active cursor used to fetch the data
        song (str): The name of the song to search
        artist (str): The name of the artist to search
        length (float): The length of the song to be searched

    Returns:
        Tuple[Optional[str], Optional[str]]: (None, None) if no match is found,
        or a Tuple containing the retrieved Song ID and Artist ID
    """
    # get songid and artistid from song and artist tables
    cur.execute(song_select, (song, artist, length))
    results = cur.fetchone()

    if results:
        songid, artistid = results
    else:
        songid, artistid = None, None

    return songid, artistid
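# Usage sketch (hypothetical): the DSN and track details are assumptions;
# song_select is assumed to be a SQL constant defined elsewhere in the module.
def _demo_get_song_artist() -> None:
    conn = psycopg2.connect("dbname=demo")  # assumed connection string
    with conn.cursor() as cur:
        songid, artistid = get_song_artist(cur, "Some Song", "Some Artist", 215.0)
        if songid is None:
            print("no matching song/artist found")
    conn.close()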
def ExportLap(self, lap: gps_pb2.Lap, cursor: psycopg2.extensions.cursor):
    """Export the lap data to timescale."""
    args = (self.session_id, lap.number)
    cursor.execute(LAP_INSERT, args)
    self.lap_number_ids[lap.number] = cursor.fetchone()[0]
    self.timescale_conn.commit()
def process_queue(*, curs: psycopg2.extensions.cursor, area_root: pathlib.Path) -> None:
    # loop until the queue is empty
    while True:
        curs.execute('BEGIN;')

        curs.execute('''
            DELETE
            FROM public.queue
            WHERE id = (
                SELECT id
                FROM public.queue
                ORDER BY priority DESC, ts
                FOR UPDATE SKIP LOCKED
                LIMIT 1
            )
            RETURNING id, run, student_id, area_catalog, area_code, input_data::text, expires_at, link_only;
        ''')

        # fetch the next available queued item
        row = curs.fetchone()

        # if there are no more, return to waiting
        if row is None:
            curs.execute('COMMIT;')
            break

        try:
            queue_id, run_id, student_id, area_catalog, area_code, input_data, expires_at, link_only = row
        except ValueError as exc:
            curs.execute('COMMIT;')
            logger.exception(
                'unexpected exception: wrong number of items in tuple from queue table - %s', exc)
            break

        area_id = area_catalog + '/' + area_code

        try:
            logger.info(f'[q={queue_id}] begin {student_id}::{area_id}')

            catalog_int = int(area_catalog.split('-')[0])
            area_file = find_area(root=area_root, area_catalog=catalog_int, area_code=area_code)

            if not area_file:
                logger.error(
                    'could not find area spec for %s at or below catalog %s (%s), under %s',
                    area_code, area_catalog, catalog_int, area_root)
                # commit the DELETE so the unmatched item doesn't stay in the queue
                curs.execute('COMMIT;')
                continue

            area_spec = load_area(area_file)

            # run the audit
            audit(
                curs=curs,
                student=json.loads(input_data),
                area_spec=area_spec,
                area_catalog=area_catalog,
                area_code=area_code,
                run_id=run_id,
                expires_at=expires_at,
                link_only=link_only,
            )

            # once the audit is done, commit the queue's DELETE
            curs.execute('COMMIT;')
            logger.info(f'[q={queue_id}] commit {student_id}::{area_id}')
        except Exception as exc:
            # commit the deletion, just so it doesn't endlessly re-run itself
            curs.execute('COMMIT;')

            # log the exception
            logger.error(f'[q={queue_id}] error {student_id}::{area_id}; %s', exc)

    logger.info('queue is empty')
def process_queue(*, curs: psycopg2.extensions.cursor, area_root: str) -> None:
    # loop until the queue is empty
    while True:
        curs.execute('BEGIN;')

        curs.execute('''
            DELETE
            FROM public.queue
            WHERE id = (
                SELECT id
                FROM public.queue
                ORDER BY priority DESC, ts
                FOR UPDATE SKIP LOCKED
                LIMIT 1
            )
            RETURNING id, run, student_id, area_catalog, area_code, input_data::text;
        ''')

        # fetch the next available queued item
        row = curs.fetchone()

        # if there are no more, return to waiting
        if row is None:
            curs.execute('COMMIT;')
            break

        try:
            queue_id, run_id, student_id, area_catalog, area_code, input_data = row
        except Exception:
            curs.execute('COMMIT;')
            break

        try:
            area_id = area_catalog + '/' + area_code
            area_path = os.path.join(area_root, area_catalog, area_code + '.yaml')

            logger.info(f'[q={queue_id}] begin {student_id}::{area_id}')

            area_spec = load_areas(area_path)[0]

            # run the audit
            audit(
                curs=curs,
                student=json.loads(input_data),
                area_spec=area_spec,
                area_catalog=area_catalog,
                area_code=area_code,
                run_id=run_id,
            )

            # once the audit is done, commit the queue's DELETE
            curs.execute('COMMIT;')
            logger.info(f'[q={queue_id}] commit {student_id}::{area_id}')
        except Exception as exc:
            # commit the deletion, just so it doesn't endlessly re-run itself
            curs.execute('COMMIT;')

            # record the exception in Sentry for debugging
            sentry_sdk.capture_exception(exc)

            # log the exception
            logger.error(f'[q={queue_id}] error {student_id}::{area_id}')

    logger.info('queue is empty')
def process_log_file(cur: psycopg2.extensions.cursor, filepath: str):
    """Process log file and insert into database

    Args:
        cur (psycopg2.extensions.cursor): postgres cursor
        filepath (str): log filepath
    """
    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df.page == "NextSong"]

    # convert timestamp column to datetime
    t = pd.to_datetime(df.ts, unit="ms")

    # insert time data records
    time_data = list((
        list(t.values),
        list(t.dt.hour.values),
        list(t.dt.day.values),
        # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
        # supported replacement.
        list(t.dt.isocalendar().week.values),
        list(t.dt.month.values),
        list(t.dt.year.values),
        list(t.dt.weekday.values),
    ))
    column_labels = ("start_time", "hour", "day", "week", "month", "year", "weekday")

    # create time df from dictionary
    time_df = pd.DataFrame({k: v for k, v in zip(column_labels, time_data)})

    for _, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[["userId", "firstName", "lastName", "gender", "level"]]

    # insert user records
    for _, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for _, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (
            pd.to_datetime(row.ts),
            row.userId,
            row.level,
            songid,
            artistid,
            row.sessionId,
            row.location,
            row.userAgent,
        )
        cur.execute(songplay_table_insert, songplay_data)