Exemplo n.º 1
0
def init_tables(cur: psycopg2.connect) -> None:
    """Execute every SQL fixture stored in the ``init`` fixtures directory.

    Each regular file directly under ``FIXTURES_PATH/init`` is read, stripped
    of surrounding whitespace and handed to ``cur.execute``. Files are
    processed in sorted filename order so the outcome does not depend on the
    arbitrary ordering returned by ``listdir``. Sub-directories and files
    that contain only whitespace are skipped.

    Parameters
    ----------
    cur
        Object whose ``execute(sql)`` method runs one SQL string. It is used
        like a database cursor, although the annotation names
        ``psycopg2.connect``.
    """
    fixtures_init_path = os.path.join(FIXTURES_PATH, "init")
    for name in sorted(listdir(fixtures_init_path)):
        fixture = os.path.join(fixtures_init_path, name)
        if not os.path.isfile(fixture):
            # Only plain files can hold a fixture; ignore nested directories.
            continue
        with open(fixture, 'r') as f:
            sql = f.read().strip()
        # The file is closed before the query runs; a blank file has nothing
        # to execute.
        if sql:
            cur.execute(sql)
Exemplo n.º 2
0
def drop_tables(cur: psycopg2.connect) -> None:
    """Send one ``DROP TABLE`` statement per known table through ``cur``.

    Table names are lower-cased before being placed in the statement, and
    the statements are executed in the order the names are listed.
    """
    table_names = (
        "Employees",
        "OrderDetails",
        "Categories",
        "Customers",
        "Orders",
        "Products",
        "Shippers",
        "Suppliers",
    )
    statements = [f"DROP TABLE {name.lower()};" for name in table_names]
    for statement in statements:
        cur.execute(statement)
Exemplo n.º 3
0
def process_song_file(cur: psycopg2.connect, filepath: str) -> None:
    """Load the song and artist records from one song file.

    The file is read as line-delimited JSON. Only the first row of the
    resulting frame is used: its song columns are inserted with
    ``song_table_insert`` and its artist columns with
    ``artist_table_insert``.

    Parameters
    ----------
    cur: psycopg2.connect
        Object used as a database cursor; only ``execute`` is called on it.

    filepath: str
        Path of the song file.

    Raises
    ------
    IndexError
        If the file yields no rows, because the first row is indexed
        unconditionally.
    """

    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record (first row only)
    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    song_data = df[song_columns].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record (first row only)
    artist_columns = [
        'artist_id',
        'artist_name',
        'artist_location',
        'artist_latitude',
        'artist_longitude',
    ]
    artist_data = df[artist_columns].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)
Exemplo n.º 4
0
def clear_tables(cur: psycopg2.connect) -> None:
    """Delete all rows from each known table, leaving the tables in place.

    One ``DELETE FROM <table> WHERE TRUE ;`` statement is executed per
    table, using the lower-cased table name, in the order listed.
    """
    table_names = (
        "Employees",
        "OrderDetails",
        "Categories",
        "Customers",
        "Orders",
        "Products",
        "Shippers",
        "Suppliers",
    )
    for name in table_names:
        cur.execute(f"DELETE FROM {name.lower()} WHERE TRUE ;")
Exemplo n.º 5
0
def drop_tables(cur: psycopg2.connect, conn: psycopg2.connect) -> None:
    """
    Run every statement in the `drop_table_queries` list, committing each one.

    Parameters
    ----------

    cur : psycopg2.cursor
        Cursor on which each statement is executed.

    conn : psycopg2.connect
        Connection whose `commit()` is called after every statement.
    """
    for statement in drop_table_queries:
        cur.execute(statement)
        # Commit immediately so each drop is finalised on its own.
        conn.commit()
Exemplo n.º 6
0
def clear_tables(cur: psycopg2.connect) -> None:
    """Delete all rows from the ``tasks`` and ``dates`` tables.

    One ``DELETE FROM <table> WHERE TRUE ;`` statement is executed per
    table, using the lower-cased table name, in the order listed.
    """
    for name in ("Tasks", "Dates"):
        cur.execute(f"DELETE FROM {name.lower()} WHERE TRUE ;")
Exemplo n.º 7
0
def drop_tables(cur: psycopg2.connect) -> None:
    """Send a ``DROP TABLE`` statement for the ``tasks`` and ``dates`` tables.

    Table names are lower-cased before being placed in the statement, and
    the statements are executed in the order the names are listed.
    """
    statements = [f"DROP TABLE {name.lower()};" for name in ("Tasks", "Dates")]
    for statement in statements:
        cur.execute(statement)
Exemplo n.º 8
0
def process_log_file(cur: psycopg2.connect, filepath: str) -> None:
    """Load time, user and songplay records from one log file.

    The file is read as line-delimited JSON and reduced to the rows whose
    ``page`` is ``'NextSong'``. From those rows the function:

    1. inserts one time record per row (timestamp plus its hour, day, week,
       month, year and weekday) using ``time_table_insert``;
    2. inserts the de-duplicated user columns using ``user_table_insert``;
    3. looks up the song and artist ids for each row with ``song_select``
       and inserts a songplay record using ``songplay_table_insert``,
       storing ``None`` for both ids when the lookup returns nothing.

    Parameters
    ----------
    cur: psycopg2.connect
        Object used as a database cursor; ``execute`` and ``fetchone`` are
        called on it.

    filepath: str
        Path of the log file.

    """

    # open log file
    df = pd.read_json(filepath, lines=True, convert_dates=['ts'])

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')

    # insert time data records
    # (start_time, hour, day, week, month, year, weekday)
    for x in t:
        cur.execute(
            time_table_insert,
            [x, x.hour, x.day, x.week, x.month, x.year, x.dayofweek],
        )

    # load user table
    user_df = df.filter(['userId', 'firstName', 'lastName', 'gender',
                         'level']).drop_duplicates()

    # insert user records
    for _, row in user_df.iterrows():
        # Pass a plain list: binding positional placeholders from a Series
        # would rely on integer lookups against its string labels.
        cur.execute(user_table_insert, list(row))

    # insert songplay records
    for _, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        # no match: store the play without song/artist ids
        songid, artistid = results if results else (None, None)

        # insert songplay record
        songplay_data = [
            row.ts,
            row.userId,
            row.level,
            songid,
            artistid,
            row.sessionId,
            row.location,
            row.userAgent,
        ]
        cur.execute(songplay_table_insert, songplay_data)