Example #1
def __stock_tags_bulk_insert(df: pd.DataFrame, column_name: str,
                             conn: psycopg2.extensions.connection,
                             cursor: psycopg2.extensions.cursor) -> None:
    """
    Inserts a stock/tag relationship row into the DB for every tagged row of df.
    :param df: df with three stock-identity columns (stock_name, exchange, ticker) and one tag column.
    :param column_name: str. The name of the tag column.
    :param conn: psycopg2.extensions.connection. Connection to DB.
    :param cursor: cursor of DB.
    :return: None.
    """
    for _, series in df.iterrows():
        if not pd.isnull(series[column_name]):
            # first, look up the stock id of the row (parameterized to guard against SQL injection)
            stock_id_query = ("SELECT id FROM stock "
                              "WHERE stock_name = %s AND exchange = %s AND ticker = %s")
            cursor.execute(stock_id_query, (series['stock_name'],
                                            series['exchange'],
                                            series['ticker']))
            stock_id = cursor.fetchone()[0]

            # next, look up the tag id of the row
            tag_id_query = "SELECT id FROM tag WHERE tag_name = %s AND category = %s"
            cursor.execute(tag_id_query, (series[column_name], column_name))
            tag_id = cursor.fetchone()[0]

            # insert the stock_tag relation using stock_id and tag_id
            query = ("INSERT INTO stock_tag (stock_id, tag_id) "
                     "VALUES (%s, %s) ON CONFLICT DO NOTHING")
            cursor.execute(query, (stock_id, tag_id))
    conn.commit()
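A minimal usage sketch for the function above, assuming a hypothetical `sector` tag column and already-populated stock and tag tables; the DataFrame contents and the connection DSN are illustrative, not part of the original snippet.

import pandas as pd
import psycopg2

# Hypothetical input: three stock-identity columns plus one tag column ("sector").
df = pd.DataFrame({
    "stock_name": ["Acme Corp"],
    "exchange": ["NYSE"],
    "ticker": ["ACME"],
    "sector": ["Industrials"],  # passed as column_name below
})

conn = psycopg2.connect("dbname=stocks")  # assumed DSN
with conn.cursor() as cursor:
    __stock_tags_bulk_insert(df, "sector", conn, cursor)
conn.close()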
Example #2
def iine_country(cur: psycopg2.extensions.cursor, country: str) -> int:
    """Insert-if-not-exists: return the id of `country`, inserting it first if missing."""
    cur.execute("select id from countries where name=%s", (country, ))
    row = cur.fetchone()
    if row is None:
        cur.execute("insert into countries (name) values (%s) returning id",
                    (country, ))
        row = cur.fetchone()
    cur.connection.commit()
    # fetchone() returns a one-element tuple; unpack it to match the int annotation
    return row[0]
Example #3
def iine_person_type(cur: psycopg2.extensions.cursor, actor_type: str) -> int:
    """Insert-if-not-exists: return the id of `actor_type`, inserting it first if missing."""
    cur.execute("select id from person_types where type=%s", (actor_type, ))
    row = cur.fetchone()
    if row is None:
        cur.execute("insert into person_types (type) values (%s) returning id",
                    (actor_type, ))
        row = cur.fetchone()
    cur.connection.commit()
    # fetchone() returns a one-element tuple; unpack it to match the int annotation
    return row[0]
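Examples #2 and #3 repeat the same lookup-or-insert pattern, so a generic helper could factor it out. The sketch below is an assumption rather than code from either example; because table and column names cannot be bound as %s parameters, it composes them with psycopg2.sql.

from psycopg2 import sql

def iine(cur: psycopg2.extensions.cursor, table: str, column: str, value: str) -> int:
    """Generic insert-if-not-exists returning the row id (hypothetical helper)."""
    select_q = sql.SQL("select id from {} where {} = %s").format(
        sql.Identifier(table), sql.Identifier(column))
    cur.execute(select_q, (value, ))
    row = cur.fetchone()
    if row is None:
        insert_q = sql.SQL("insert into {} ({}) values (%s) returning id").format(
            sql.Identifier(table), sql.Identifier(column))
        cur.execute(insert_q, (value, ))
        row = cur.fetchone()
    cur.connection.commit()
    return row[0]

# iine_country(cur, "Norway") then becomes iine(cur, "countries", "name", "Norway").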
Example #4
def ExportSession(self, cursor: psycopg2.extensions.cursor):
    """Insert the session row once and remember the id Postgres returns."""
    if not self.session_id:
        args = (self.session_time.ToJsonString(), self.track,
                self.live_data)
        cursor.execute(SESSION_INSERT, args)
        self.session_id = cursor.fetchone()[0]
        self.timescale_conn.commit()
Example #5
def check_and_mark_data_quality(cursor: psycopg2.extensions.cursor,
                                log: logger.Log) -> None:
    """
    Using `cursor`, mark rows in the stage.game_data table that
    satisfy data quality rules. Log to `log` a warning if any rows
    fail data quality.
    """
    # raw string so the '\D' regexes are not treated as Python escape sequences
    quality_check_sql = r'''
    UPDATE stage.game_data 
    SET passed_data_quality_check = True 
    WHERE game_id NOT IN
    (
    SELECT distinct(game_id)
    FROM stage.game_data
    WHERE move_number ~ '\D' -- Must be an integer
    OR "column" ~ '\D' -- Must be an integer
    OR "column"::int > 4  -- The grid is 4x4
    OR result NOT IN ('', 'win', 'draw') -- Valid values for result
    )
    AND game_id NOT IN
    (
    SELECT game_id
    FROM stage.game_data
    GROUP BY game_id
    HAVING COUNT(DISTINCT(player_id)) <> 2  -- A game should have 2 players
    );
    '''
    cursor.execute(quality_check_sql)

    cursor.execute("SELECT COUNT(*) FROM stage.game_data WHERE "
                   "passed_data_quality_check = false;")
    number_bad = cursor.fetchone()[0]
    if number_bad > 0:
        log.write_warning(
            f'Rejected {number_bad} game records due to data quality.')
Example #6
def _get_business_id(cur: psycopg2.extensions.cursor, business_identifier: str) -> str:
    """Return the business id for the given identifier."""
    # bind the identifier as a %s parameter to guard against SQL injection
    cur.execute("select id from businesses where identifier = %s", (business_identifier, ))
    business_id = cur.fetchone()
    if not business_id:
        return ''
    return str(business_id[0])
Example #7
def select_DiscordUser_from_database(member_id: int,
                                     cursor: psycopg2.extensions.cursor):
    """Fetch the DiscordUser row for member_id; returns None on error or no match."""
    try:
        cursor.execute(sql_select_DiscordUser, (member_id, ))
        return cursor.fetchone()
    except (Exception, psycopg2.Error) as error:
        print(f"Error while selecting from DiscordUser on database: {error}")
Example #8
def process_log_file(cur: psycopg2.extensions.cursor, filepath: str):
    """
    Given a connection (cursor) to a PostgreSQL database and a path to a JSON-(log)-file,
    load the file, filter its content, derive datetime attributse from the timestamp column
    and insert valid subsets of its data into the time, user and songplay tables.
    
    :param cur: Cursor
    :param filepath: Path to JSON file
    """
    # open log file
    df = pd.read_json(filepath, orient="records", lines=True)

    # filter by NextSong action
    df = df.loc[(df["page"] == "NextSong"), :]

    # convert timestamp column (ms since the epoch) to datetime
    df["ts"] = pd.to_datetime(df["ts"], unit="ms")

    # insert time data records
    time_dict = {
        "timestamp": df.ts,
        "hour": df.ts.dt.hour,
        "day": df.ts.dt.day,
        # isocalendar().week replaces Series.dt.weekofyear, which was removed in pandas 2.0
        "week_of_year": df.ts.dt.isocalendar().week,
        "month": df.ts.dt.month,
        "year": df.ts.dt.year,
        "weekday": df.ts.dt.weekday
    }

    time_df = pd.DataFrame.from_dict(time_dict)

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[["userId", "firstName", "lastName", "gender", "level"]]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():

        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (row.ts, row.userId, row.level, songid, artistid,
                         row.sessionId, row.location, row.userAgent)
        cur.execute(songplay_table_insert, songplay_data)
Example #9
def verify_tables(cur: psycopg2.extensions.cursor) -> bool:
    """
    Check if the products table already exists.
    :return: True if a table named DB_TABLE exists
    """
    cur.execute(
        "select exists(select * from information_schema.tables where table_name=%s)",
        (DB_TABLE, ),
    )

    return cur.fetchone()[0]
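A hedged sketch of how verify_tables might gate table creation on startup; DB_TABLE and the DDL below are placeholders, not taken from the source.

DB_TABLE = "products"  # assumed table name

CREATE_TABLE_SQL = """
CREATE TABLE products (
    id serial PRIMARY KEY,
    name text NOT NULL
);
"""  # hypothetical DDL

def ensure_tables(conn: psycopg2.extensions.connection) -> None:
    """Create the products table only if verify_tables reports it missing."""
    with conn.cursor() as cur:
        if not verify_tables(cur):
            cur.execute(CREATE_TABLE_SQL)
    conn.commit()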
Example #10
def source_at_offset(
    cursor: psycopg2.extensions.cursor, source_info: SourceInfo
) -> typing.Optional[int]:
    """Return the mz timestamp from a source if it has reached the desired offset."""
    query = (
        'SELECT timestamp FROM mz_source_info WHERE source_name = %s and "offset" = %s'
    )
    try:
        cursor.execute(query, (source_info.topic_name, source_info.offset))  # type: ignore
        if cursor.rowcount > 1:
            print("ERROR: More than one row returned when querying source offsets:")
            for row in cursor:
                print(f"\t{row}")
            sys.exit(1)
        if not cursor.rowcount:
            return None

        return int(cursor.fetchone()[0])
    except psycopg2.errors.InternalError_:  # type: ignore
        # The view is not yet ready to be queried
        return None
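A usage sketch of the polling loop that source_at_offset implies: keep asking until the source reaches the desired offset or a deadline passes. The poll interval and timeout are assumptions.

import time

def wait_for_offset(cursor: psycopg2.extensions.cursor,
                    source_info: SourceInfo,
                    timeout_secs: float = 60.0) -> typing.Optional[int]:
    """Poll source_at_offset until it returns a timestamp or the timeout elapses."""
    deadline = time.monotonic() + timeout_secs
    while time.monotonic() < deadline:
        ts = source_at_offset(cursor, source_info)
        if ts is not None:
            return ts
        time.sleep(1.0)  # assumed poll interval
    return None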
Example #11
def check_and_mark_data_quality(cursor: psycopg2.extensions.cursor,
                                log: logger.Log) -> None:
    """
    Using `cursor`, mark rows in the stage.player_info table that
    satisfy data quality rules. Log to `log` a warning if any rows
    fail data quality.
    """
    quality_check_sql = """
    UPDATE stage.player_info
    SET passed_data_quality_check = true
    WHERE details ? 'data';
    """

    cursor.execute(quality_check_sql)

    cursor.execute("SELECT COUNT(*) FROM stage.player_info WHERE "
                   "passed_data_quality_check = false;")
    number_bad = cursor.fetchone()[0]
    if number_bad > 0:
        log.write_warning(
            f'Rejected {number_bad} player records due to data quality.')
Example #12
def get_song_artist(cur: psycopg2.extensions.cursor, song: str, artist: str,
                    length: float) -> Optional[Tuple[str, str]]:
    """Fetches the Song ID and Artist ID for the provided song name, artist name and song length.

    Args:
        cur (psycopg2.extensions.cursor): The active cursor used to fetch the data
        song (str): The name of the song to search
        artist (str): The name of the artist to search
        length (float): The length of the song to be searched

    Returns:
        Optional[Tuple[str, str]]: None if no matching song is found, or a Tuple
        containing the retrieved Song ID and Artist ID
    """

    # get songid and artistid from the song and artist tables
    cur.execute(song_select, (song, artist, length))
    results = cur.fetchone()

    # return None (rather than a (None, None) tuple) so the annotation holds
    if results is None:
        return None

    songid, artistid = results
    return songid, artistid
Example #13
def ExportLap(self, lap: gps_pb2.Lap, cursor: psycopg2.extensions.cursor):
    """Export the lap data to timescale."""
    args = (self.session_id, lap.number)
    cursor.execute(LAP_INSERT, args)
    self.lap_number_ids[lap.number] = cursor.fetchone()[0]
    self.timescale_conn.commit()
Example #14
def process_queue(*, curs: psycopg2.extensions.cursor,
                  area_root: pathlib.Path) -> None:
    # loop until the queue is empty
    while True:
        curs.execute('BEGIN;')

        curs.execute('''
            DELETE
            FROM public.queue
            WHERE id = (
                SELECT id
                FROM public.queue
                ORDER BY priority DESC, ts
                    FOR UPDATE
                        SKIP LOCKED
                LIMIT 1
            )
            RETURNING id, run, student_id, area_catalog, area_code, input_data::text, expires_at, link_only;
        ''')

        # fetch the next available queued item
        row = curs.fetchone()

        # if there are no more, return to waiting
        if row is None:
            curs.execute('COMMIT;')
            break

        try:
            queue_id, run_id, student_id, area_catalog, area_code, input_data, expires_at, link_only = row
        except ValueError as exc:
            curs.execute('COMMIT;')
            logger.exception(
                'unexpected exception: wrong number of items in tuple from queue table - %s',
                exc)
            break

        area_id = area_catalog + '/' + area_code
        try:
            logger.info(f'[q={queue_id}] begin  {student_id}::{area_id}')

            catalog_int = int(area_catalog.split('-')[0])

            area_file = find_area(root=area_root,
                                  area_catalog=catalog_int,
                                  area_code=area_code)
            if not area_file:
                logger.error(
                    'could not find area spec for %s at or below catalog %s (%s), under %s',
                    area_code, area_catalog, catalog_int, area_root)
                # commit the DELETE so the item is not left in an open transaction
                curs.execute('COMMIT;')
                continue

            area_spec = load_area(area_file)

            # run the audit
            audit(
                curs=curs,
                student=json.loads(input_data),
                area_spec=area_spec,
                area_catalog=area_catalog,
                area_code=area_code,
                run_id=run_id,
                expires_at=expires_at,
                link_only=link_only,
            )

            # once the audit is done, commit the queue's DELETE
            curs.execute('COMMIT;')

            logger.info(f'[q={queue_id}] commit {student_id}::{area_id}')

        except Exception as exc:
            # commit the deletion, just so it doesn't endlessly re-run itself
            curs.execute('COMMIT;')

            # log the exception
            logger.error(f'[q={queue_id}] error  {student_id}::{area_id}; %s',
                         exc)

    logger.info('queue is empty')
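The RETURNING clause implies the shape of the queue table. The DDL below is reconstructed from the columns the two process_queue variants read; the types are informed guesses, not a schema from the source.

# Hypothetical DDL inferred from the columns process_queue reads.
QUEUE_DDL = '''
CREATE TABLE IF NOT EXISTS public.queue (
    id           bigserial PRIMARY KEY,
    run          integer,
    student_id   text,
    area_catalog text,
    area_code    text,
    input_data   jsonb,        -- cast to ::text on the way out
    expires_at   timestamptz,  -- read only by the first variant
    link_only    boolean,      -- read only by the first variant
    priority     integer NOT NULL DEFAULT 0,
    ts           timestamptz NOT NULL DEFAULT now()
);
'''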
Example #15
def process_queue(*, curs: psycopg2.extensions.cursor, area_root: str) -> None:
    # loop until the queue is empty
    while True:
        curs.execute('BEGIN;')

        curs.execute('''
            DELETE
            FROM public.queue
            WHERE id = (
                SELECT id
                FROM public.queue
                ORDER BY priority DESC, ts
                    FOR UPDATE
                        SKIP LOCKED
                LIMIT 1
            )
            RETURNING id, run, student_id, area_catalog, area_code, input_data::text;
        ''')

        # fetch the next available queued item
        row = curs.fetchone()

        # if there are no more, return to waiting
        if row is None:
            curs.execute('COMMIT;')
            break

        try:
            queue_id, run_id, student_id, area_catalog, area_code, input_data = row
        except Exception:
            curs.execute('COMMIT;')
            break

        try:
            area_id = area_catalog + '/' + area_code
            area_path = os.path.join(area_root, area_catalog,
                                     area_code + '.yaml')

            logger.info(f'[q={queue_id}] begin  {student_id}::{area_id}')

            area_spec = load_areas(area_path)[0]

            # run the audit
            audit(
                curs=curs,
                student=json.loads(input_data),
                area_spec=area_spec,
                area_catalog=area_catalog,
                area_code=area_code,
                run_id=run_id,
            )

            # once the audit is done, commit the queue's DELETE
            curs.execute('COMMIT;')

            logger.info(f'[q={queue_id}] commit {student_id}::{area_id}')

        except Exception as exc:
            # commit the deletion, just so it doesn't endlessly re-run itself
            curs.execute('COMMIT;')

            # record the exception in Sentry for debugging
            sentry_sdk.capture_exception(exc)

            # log the exception
            logger.error(f'[q={queue_id}] error  {student_id}::{area_id}')

    logger.info('queue is empty')
Example #16
def process_log_file(cur: psycopg2.extensions.cursor, filepath: str):
    """Process log file and insert into database

    Args:
        cur (psycopg2.extensions.cursor): postgres cursor
        filepath (str): log filepath
    """
    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df.page == "NextSong"]

    # convert timestamp column to datetime
    t = pd.to_datetime(df.ts, unit="ms")

    # insert time data records
    time_data = [
        list(t.values),
        list(t.dt.hour.values),
        list(t.dt.day.values),
        # isocalendar().week replaces Series.dt.week, which was removed in pandas 2.0
        list(t.dt.isocalendar().week.values),
        list(t.dt.month.values),
        list(t.dt.year.values),
        list(t.dt.weekday.values),
    ]
    column_labels = ("start_time", "hour", "day", "week", "month", "year",
                     "weekday")

    # create time df from the labels and columns
    time_df = pd.DataFrame(dict(zip(column_labels, time_data)))

    for _, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[["userId", "firstName", "lastName", "gender", "level"]]

    # insert user records
    for _, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for _, row in df.iterrows():

        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (
            pd.to_datetime(row.ts),
            row.userId,
            row.level,
            songid,
            artistid,
            row.sessionId,
            row.location,
            row.userAgent,
        )
        cur.execute(songplay_table_insert, songplay_data)
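Examples #8 and #16 both assume a driver that walks a directory of log files and hands each path to process_log_file. A minimal sketch of such a driver follows; the DSN and the autocommit choice are assumptions, since neither example commits on its own.

import glob
import os
import psycopg2

def run_etl(data_dir: str, dsn: str = "dbname=sparkifydb") -> None:  # assumed DSN
    """Walk data_dir for JSON log files and process each with process_log_file."""
    conn = psycopg2.connect(dsn)
    conn.autocommit = True  # assumed: the examples leave committing to the caller
    cur = conn.cursor()
    try:
        for root, _dirs, _files in os.walk(data_dir):
            for filepath in glob.glob(os.path.join(root, "*.json")):
                process_log_file(cur, filepath)
    finally:
        cur.close()
        conn.close()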