Exemplo n.º 1
def schema_song_data():
    """Schema design for song datasets.

    Builds the explicit field list (name + type) for one song record so that
    the reader does not have to infer types.

    Returns:
        The schema object produced by ``R([...])``.

    NOTE(review): ``R``, ``Fld``, ``Str``, ``Dbl`` and ``Int`` are not defined
    in this block; presumably they are import aliases for pyspark's
    StructType / StructField / StringType / DoubleType / IntegerType - confirm.
    """
    print("schema_song_data fuction is statrting.")

    schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    print("schema_song_data is successfull created")
    # The extra print that used to follow the return could never run and was removed.
    return schema
Exemplo n.º 2
def process_song_data(spark, input_data, output_data):
    """Read the song JSON files and write the songs and artists tables as parquet.

    Args:
        spark: session object; only ``spark.read.json`` is used here.
        input_data: path prefix; ``song_data/*/*/*/*.json`` is appended to it.
        output_data: path prefix; ``songs/`` and ``artists/`` are appended to it.
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # NOTE(review): the field list of this schema was missing from the source.
    # It is rebuilt from the columns this function actually selects; add any
    # further fields the real schema carried.
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table; song_id is generated, not read
    song_fields = ["title", "artist_id", "year", "duration"]
    songs_table = (
        df.select(song_fields)
        .dropDuplicates()
        .withColumn("song_id", monotonically_increasing_id())
    )

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')

    # extract columns to create artists table (SQL expressions rename the columns)
    artists_fields = [
        "artist_id",
        "artist_name as name",
        "artist_location as location",
        "artist_latitude as latitude",
        "artist_longitude as longitude",
    ]
    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Exemplo n.º 3
def process_song_data(spark, input_data, output_data):
    """Load the song data from the input location and write parquet tables.

    Description: This function can be used to load the song data from the
                 input S3 bucket and write the parquet files to the output
                 S3 bucket.

    Args:
        spark: SparkSession
        input_data: location for the input data
        output_data: location for the output data
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    # show() prints the rows itself and returns None; wrapping it in print()
    # only emitted an extra "None" line.
    df.show(5, truncate=False)

    # extract columns to create songs table
    # NOTE(review): the tail of this select was lost in the source; "duration"
    # is assumed as the final column.
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration")
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the opening of this call was lost; overwrite mode is
    # assumed to match the artists write below.
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    # The query names a table "df", so the DataFrame has to be registered
    # under that name before spark.sql can resolve it.
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    )
    print('artists', artists_table.count())

    # write artists table to parquet files
    # NOTE(review): the call that took "artist_id" was lost in the source;
    # repartition is assumed because it leaves the rows unchanged - confirm.
    artists_table.repartition(
        "artist_id").write.mode('overwrite').parquet(output_data + "artists")
Exemplo n.º 4
def process_song_data(spark, input_data, output_data):
    """ Process song_data json files which located in S3
        Create table songs_table and artists_table
        Store the table in parque format in S3
        Return the table to be used in process_log_data function

    Args:
      spark                 : Spark Session
      input_data  (string)  : location json files (input)
      output_data (string)  : location parque files (output)

    Returns:
      songs_data    (Spark Dataframe) : Song Data tables
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # set schema song data
    songSchema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    # NOTE(review): the source ended this chain with a dangling continuation;
    # a trailing dropDuplicates() is assumed.
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration") \
                    .where("song_id is not null") \
                    .dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        os.path.join(output_data, 'songs'), 'overwrite')

    # extract columns to create artists table
    # NOTE(review): only the first and last select arguments survived in the
    # source; the name/location/latitude columns and the final
    # dropDuplicates() are reconstructed - confirm.
    artists_table = df.select(col("artist_id"),
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")) \
                      .where("artist_id is not null") \
                      .dropDuplicates()

    # write artists table to parquet files
    # NOTE(review): the mode argument was lost; 'overwrite' mirrors the songs write.
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')

    # return song_data table to be used in process_log_data
    return df
Exemplo n.º 5
def get_log_schema():
    """
    Creates a schema for log data.

    Every field is declared as a string except ``length`` and
    ``registration`` (double) and ``ts`` (long).

    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
Exemplo n.º 6
def process_song_data(spark, input_data, output_data):
    """This function loads song_data from S3 and processes it by extracting the songs and artist tables
        and then again loaded back to S3

    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Raises:
        Exception: any error from reading or writing is printed and re-raised.
    """
    print("**** Starting to process song data *****")
    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'

    # read song data file
    # NOTE(review): the schema's field list was missing from the source; it is
    # rebuilt from the columns this function selects.
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])
    # NOTE(review): the `try:` lines and the except bodies were lost in the
    # source. Each handler now reports the error and re-raises, so the
    # "complete" messages below are never printed after a failure.
    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)
        raise

    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])

    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite")
    except Exception as e:
        print(e)
        raise
    print("**** songs table data load is complete *****")

    # extract columns to create artists table
    # NOTE(review): "lattitude" looks like a typo for "latitude", but it is the
    # output column name, so it is kept to avoid breaking readers of the files.
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as lattitude", "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])

    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
        raise
    print("**** artists table data load is complete *****")
    print("**** song data processing is finished *****")
Exemplo n.º 7
def process_song_data(spark, input_data, output_data):
    """import Song dataset extract columns and create songs and artist tables
    write those tables to parquet files

    Args:
        spark: name of spark session
        input_data: location of the source data s3 bucket
        output_data: location of the destination data s3 bucket

    Output:
        writes songs table in parquet to output_data location + songs
        writes artist_table in parquet to output_data location + artists
    """
    # Setting up the JSON table structure for the Song dataset
    # (year is declared as a string here)
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])

    # get filepath to song data file
    # use "song_data/*/*/*/*.json" for full dataset
    # use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with dataset_schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the opening of this call was lost in the source; append
    # mode is assumed to match the artists write below.
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
Exemplo n.º 8
def process_song_data(spark, input_data, output_data):
    """
    load song data in json format from S3 bucket and process these data by extracting
    songs table and artists table, and save these tables back to S3 bucket

    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema (no song_id field: the id is generated below)
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
        .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    # NOTE(review): the opening of this call was lost in the source; 'year' is
    # assumed as the first partition column.
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]
    # NOTE(review): the two chains below ended in dangling continuations in
    # the source. De-duplication and the column renames are assumptions -
    # confirm against the intended artists table layout.
    artists_table = df_songs.select(artists_attr)\
        .dropDuplicates()

    artists_table = artists_table\
        .withColumnRenamed('artist_name', 'name')\
        .withColumnRenamed('artist_location', 'location')\
        .withColumnRenamed('artist_latitude', 'latitude')\
        .withColumnRenamed('artist_longitude', 'longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
Exemplo n.º 9
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3,
    then uses the data to create the songs and artists tables

    Input:
    spark = SparkSession object
    input_data = Start of path variable for input files
    output_data = Start of path variable for output files

    Output: None
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    # NOTE(review): the end of this select was lost in the source; "duration"
    # is assumed as the last column.
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration")

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the write call itself was lost; it is rebuilt in the same
    # style as the artists write, with the partitioning the comment describes.
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.parquet(output_path,
                              partitionBy=["year", "artist_id"],
                              mode="overwrite")

    # extract columns to create artists table
    # NOTE(review): the end of this select was lost; latitude and longitude
    # are assumed as the remaining columns.
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude", "artist_longitude")

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    #export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
Exemplo n.º 10
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        session used to read the JSON files
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    # third Fld argument False marks the field as non-nullable
    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    # (the closing of this list was lost in the source; "artist_longitude" is
    # required by the rename chain that follows)
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
Exemplo n.º 11
def process_song_data(spark, input_data, output_data):
    """Read song JSON files, write the songs and artists tables, return the raw data.

    Args:
        spark: session object; only ``spark.read.json`` is used here.
        input_data: path prefix prepended to the song-file glob.
        output_data: path prefix; ``songs`` and ``artist`` are appended to it.

    Returns:
        The DataFrame read from the song files, before any column selection.
    """
    print('%%%%% Starting up the SONG data process')

    # get filepath to song data file (only the "A" top-level folder is read)
    song_data = 'song_data/A/*/*/*.json'

    # setting up the schema for the data that we're about to pull
    songSchema = ST([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file using the explicit schema
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # extract columns to create songs table
    # NOTE(review): the last select argument was lost in the source (the list
    # ended on a trailing comma); duration is assumed.
    songs_table = raw_song_df.select(raw_song_df.song_id,
                                     raw_song_df.title,
                                     raw_song_df.artist_id,
                                     raw_song_df.year.cast(Int()),
                                     raw_song_df.duration)

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the opening of this call was lost; overwrite mode is
    # assumed to match the artists write below.
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(output_data + 'songs')

    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # extract columns to create artists table
    # NOTE(review): the last select argument was lost in the source (the list
    # ended on a trailing comma); the artist name is assumed.
    artists_table = raw_song_df.select(raw_song_df.artist_id,
                                       raw_song_df.artist_latitude.alias('latitude'),
                                       raw_song_df.artist_location.alias('location'),
                                       raw_song_df.artist_longitude.alias('longitude'),
                                       raw_song_df.artist_name.alias('name'))

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artist')

    print('%%%%% Artists table has been created and written to the S3 Bucket')
    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
Exemplo n.º 12
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from song_data dataset and extract columns
    for songs and artist tables and write the data into parquet
    files which will be loaded on s3.

    Args:
        spark: session object; only ``spark.read.json`` is used here.
        input_data: base path joined with ``song_data/*/*/*/*.json``.
        output_data: base path joined with the table sub-paths.
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    # (the second argument was lost in the source; song_schema is otherwise unused)
    df = spark.read.json(os.path.join(input_data, song_data),
                         schema=song_schema)

    # extract columns to create songs table
    # NOTE(review): the end of this select was lost; "duration" plus
    # de-duplication is assumed.
    songs_table = df.select('song_id', 'title', 'artist_id', 'year',
                            'duration').dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    # NOTE(review): the chain ended in a dangling continuation in the source;
    # a final dropDuplicates() is assumed.
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
Exemplo n.º 13
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data) and extracts the songs and artist tables
    and then loaded the processed data back to S3 (output_data)

    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format
    """
    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # no song_id field: the id is generated below
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    # NOTE(review): the source had a further argument on a lost continuation
    # line; only the surviving arguments are passed.
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema)
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
                    .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist.
    # Overwrite mode matches the artists write, so a re-run no longer fails
    # on an already existing songs/ directory.
    songs_table.write.parquet(output_data + "songs/",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # Extract columns to create artists table
    # NOTE(review): the chain ended in a dangling continuation in the source;
    # a final dropDuplicates() is assumed.
    artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \
                      .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
Exemplo n.º 14
def process_song_data(spark, input_data, output_data):
    """
    read song data from s3 and then create the songs_table and artists_table. load them back to s3.

    spark: spark session
    input_data: path of song data
    output_data: path of output table
    """
    # get filepath to song data file
    # NOTE: only a single sample file is read here; the full-dataset glob is
    # input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema (no song_id field: the id is generated below)
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table, drop if year and artist_id are missing and year should not equal to 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates().withColumn("song_id", F.monotonically_increasing_id())\
        .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table, drop if artist_id and name containing any null values
    # (the closing of this list was lost in the source; "artist_longitude" is assumed)
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the first and last lines of this call were lost; "year" as
    # first partition column and overwrite mode (as used for artists) are assumed.
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + "songs/",
                                                       mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists

    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file (only the A/A/B sub-folder is read)
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    # (write_parquet is a helper defined outside this function)
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files (no partition columns are passed)
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
Exemplo n.º 16
def process_song_data(spark, input_data, output_data):
    """
    Description: This function fetches song_data from S3 into a staging dataframe,
    then extracts the songs and artist tables,
    and eventually exports data back to S3

    Parameters:
        spark       : object for Spark Session
        input_data  : location of song_data
        output_data : location of target S3 bucket
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # define schema
    # NOTE(review): the field list was missing from the source; it is rebuilt
    # from the columns this function selects.
    songdata_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    # NOTE(review): this table carries artist_name and no song title - kept as
    # in the source, but verify that this is intended.
    songs_columns = ['artist_name', 'artist_id', 'year', 'duration']
    songs_table = (
        df.select(songs_columns)
        .dropDuplicates()
        .withColumn('song_id', monotonically_increasing_id())
        .select(['song_id'] + songs_columns)
    )

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name',
                 'artist_location as location', 'artist_latitude as latitude',
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()

    # write artists table to parquet files
    # The source partitioned this write by ('year', 'artist_id'), but
    # artists_table has no 'year' column, so that write could not succeed.
    # The table is written unpartitioned instead.
    artists_table.write.parquet(output_data + 'artists/')
Exemplo n.º 17
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, and saves the song information to a parquet file
    (parititioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file.

    Args:
        spark: session object; only ``spark.read.json`` is used here.
        input_data: base path joined with ``song_data/*/*/*/*.json``.
        output_data: base path joined with the parquet file names.
    """
    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    # NOTE(review): this statement was lost in the source; it is rebuilt from
    # the comment above and the artists file naming - confirm path and mode.
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    # NOTE(review): the source lost 'artist_longitude' and the tail of the
    # rename chain; latitude/longitude renames and distinct() are reconstructed.
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
        'artist_latitude', 'latitude').withColumnRenamed(
        'artist_longitude', 'longitude').distinct()

    # Write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists.parquet'))
Exemplo n.º 18
def process_song_data(spark, input_data, output_data):
    """Read the song JSON files and write the songs and artists tables as parquet.

    Args:
        spark: session object; only ``spark.read.json`` is used here.
        input_data: path prefix; ``./data/song_data/*/*/*/*.json`` is appended.
        output_data: path prefix; ``songs`` and ``artists`` are appended to it.
    """
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"

    # Creating the song_data file schema that we are going to add to spark
    # (no song_id field: the id is generated below)
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)

    # Filtering out only the needed columns for the songs table
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )
    # NOTE(review): the statement that displayed the rows was lost in the
    # source; show(5) is assumed.
    print('Printing some rows from the songs_table')
    songs_table.show(5)

    print('Saving the songs table to the s3 bucket')
    # NOTE(review): the opening of this call was lost; 'year' is assumed as
    # the first partition column.
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")

    # Filtering out only the needed columns for the artists table
    # (the closing of this list was lost; 'artist_longitude' is assumed)
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    # NOTE(review): as above, the display statement was lost; show(5) is assumed.
    print('Printing some rows from the artists_table')
    artists_table.show(5)

    # writing the artists table to the parquets file
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")
Exemplo n.º 19
def process_song_data(spark, input_data, output_data):
    """
    Loads song_data from S3, extracting needed columns for 'song_table' and 'artist_table'
    and writting their parquet format on S3

    Parameters:
        spark       : Spark Session
        input_data  : Location of song_data json files with the songs metadata
        output_data : S3 bucket were tables in parquet format store
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Making right type for input json structure
    # "song_id" is declared here because the songs table below selects it; a
    # column missing from an explicit schema cannot be selected.
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    # NOTE(review): the end of this call was lost in the source; "duration"
    # is assumed as the last column.
    songs_table = df.selectExpr(
        "song_id", "title", "artist_id", "year",
        "duration")

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    # NOTE(review): the orderBy argument was lost in the source; ordering by
    # artist_id is assumed.
    artists_table = df.selectExpr("artist_id", "artist_name as name",
                                  "artist_location as location",
                                  "artist_latitude as latitude",
                                  "artist_longitude as longitude").orderBy(
                                      "artist_id")

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'))
Exemplo n.º 20
def create_log_schema():
    """
    Schema structure for log data

    NOTE(review): ``R``, ``Fld``, ``St``, ``LInt``, ``Fl`` and ``Dbl`` are not
    defined in this block; presumably they are import aliases for pyspark
    type classes - confirm against the file's imports.

    :return: StructType
    """
    log_schema = R([
        Fld('artist', St()),
        Fld('auth', St()),
        Fld('firstName', St()),
        Fld('gender', St()),
        Fld('itemInSession', LInt()),
        Fld('lastName', St()),
        Fld('length', Fl()),
        Fld('level', St()),
        Fld('location', St()),
        Fld('method', St()),
        Fld('page', St()),
        Fld('registration', Dbl()),
        Fld('sessionId', LInt()),
        Fld('song', St()),
        Fld('status', LInt()),
        Fld('ts', LInt()),
        Fld('userAgent', St()),
        Fld('userId', St())
    ])

    return log_schema
Exemplo n.º 21
def get_log_src_schema():
    """
    Get the source spark schema definition

    ``registration`` and ``ts`` are declared as strings in this schema;
    ``itemInSession``, ``sessionId`` and ``status`` use ``Int()`` and
    ``length`` uses ``Dbl()``.

    :return: The schema definition
    """
    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
Exemplo n.º 22
def create_log_data():
    """
    Create schema for log data.

    Despite its name, this function builds and returns a schema object; it
    does not read or create any log records.

    return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()),
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        # NOTE(review): userId is typed as an integer here; confirm the
        # source logs never carry an empty-string user id.
        StructField('userId', Int()),
    ])
    return log_schema
Exemplo n.º 23
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.

    :param spark: session object used to read the JSON input
    :param input_data_songs: location of the song JSON files; passed
        unchanged to ``spark.read.json``
    :param output_data: output location prefix; table paths are built by
        plain string concatenation (presumably it ends with a path
        separator - confirm against the caller)
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long()),
    ])

    song_data = input_data_songs

    # time how long the read call takes
    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))

    # songs table: keep only rows that carry a song id
    songs_table = dfSongs.filter(dfSongs.song_id != '') \
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])

    # NOTE(review): the head of this statement and the file name in the
    # path were missing from the source; restored by analogy with the
    # artists write below - confirm the intended output path.
    songs_table.write.partitionBy(
        "year",
        "artist_id").mode('overwrite').parquet(output_data +
                                               'songs/songs_table.parquet')

    # artists table: keep only rows that carry an artist id
    # NOTE(review): the original expression continued past the select with
    # a call that was lost; dropDuplicates() is assumed - confirm.
    artists_table = dfSongs.filter(dfSongs.artist_id != '') \
        .select(col("artist_id"),
                col("artist_name").alias("name"),
                col("artist_location").alias("location"),
                col("artist_longitude").alias("longitude"),
                col("artist_latitude").alias("latitude")) \
        .dropDuplicates()

    # NOTE(review): write mode assumed to match the songs table - confirm.
    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
Exemplo n.º 24
def process_song_data(spark, input_data, output_data):
    """
    Reads from song files,
    transforms them into songs and artists data,
    and writes them in parquet format.

    - spark: spark session object
    - input_data: input data path
    - output_data: output data path
    """
    # get filepath to song data file
    song_data = input_data + "/song_data/*/*/*/*.json"

    # use schema when read json files
    song_schema = St([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_schema)

    # extract columns to create songs table
    # NOTE(review): the source was cut after "year"; the remaining
    # "duration" column and the de-duplication are assumed - confirm.
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    # NOTE(review): the input path inserts a "/" after input_data but the
    # output paths do not; output_data presumably ends with a separator.
    songs_table.write.parquet(output_data + "songs", mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    # NOTE(review): the original expression continued with a call that was
    # lost; dropDuplicates() is assumed - confirm.
    artists_table = df.selectExpr(
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists", mode="overwrite")
Exemplo n.º 25
def get_song_src_schema():
    """
    Get the source spark schema definition for the song data.

    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])
Exemplo n.º 26
def process_song_data(spark, input_data, output_data):
    """
    Read the song JSON files and write the songs and artists tables.

    - spark: session object used to read the JSON input
    - input_data: location prefix of the song files; the glob
      '*/*/*/*.json' is appended directly to it
    - output_data: location prefix for the parquet output; table directory
      names are appended directly to it
    """
    # get filepath to song data file
    song_data = input_data + '*/*/*/*.json'

    # creating schema for our song_data
    songs_model = St([
        Sfld("num_songs", Int()),
        Sfld("artist_id", Str()),
        Sfld("artist_latitude", Dbl()),
        Sfld("artist_longitude", Dbl()),
        Sfld("artist_location", Str()),
        Sfld("artist_name", Str()),
        Sfld("song_id", Str()),
        Sfld("title", Str()),
        Sfld("duration", Dbl()),
        Sfld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songs_model)

    # extract columns to create songs table
    # NOTE(review): the middle of this statement was missing from the
    # source ("duration", the de-duplication and the withColumn call are
    # restored from the surrounding fragments) - confirm.
    songs_table = df.select(["title", "artist_id", "year",
                             "duration"]).dropDuplicates().withColumn(
                                 "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + 'songs_table/', mode='overwrite')

    # extract columns to create artists table
    # NOTE(review): "artist_longitude", the last two renames and the
    # de-duplication were cut from the source and are assumed - confirm.
    artists_fields = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude",
    ]
    artists_table = df.select(artists_fields).withColumnRenamed(
        'artist_name', 'name').withColumnRenamed(
            'artist_location', 'location').withColumnRenamed(
                'artist_latitude', 'latitude').withColumnRenamed(
                    'artist_longitude', 'longitude').dropDuplicates()

    # write artists table to parquet files
    # NOTE(review): mode assumed to match the songs write above.
    artists_table.write.parquet(output_data + 'artists_table/',
                                mode='overwrite')
Exemplo n.º 27
def process_song_data(spark, input_data, output_data):
    """
    Extract data from song_data and write songs and artists table

    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # schema for song_data
    # NOTE(review): the "year" field was missing from the source, but the
    # songs query and the partitioning below both need it; its type is
    # assumed to be Int - confirm.
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*"
    # read song data file, keeping one row per song_id
    df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"])

    # Both SQL queries below read from a table named "song_data", so the
    # DataFrame must be registered under that name first.
    # NOTE(review): this registration was not visible in the source.
    df.createOrReplaceTempView("song_data")

    # extract columns to create songs table
    songs_table = spark.sql("""
        SELECT song_id, title, artist_id, year, duration FROM song_data
    """)
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs_table", 'overwrite')

    # extract columns to create artists table
    # NOTE(review): rows are unique per song_id only, so an artist with
    # several songs appears several times here - confirm whether the
    # artists table should be de-duplicated.
    artists_table = spark.sql("""
        SELECT artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude 
        FROM song_data
    """)
    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table")
Exemplo n.º 28
def process_song_data(spark, input_data, output_data):
    """
    Process the song data with Spark.

    Reads the song files in .json format from below ``input_data`` and
    writes the songs and artists tables in parquet format below
    ``output_data`` (per the original notes, both locations are on S3).

    - spark: Spark session
    - input_data: location prefix; 'song_data/*/*/*/*.json' is appended
    - output_data: location prefix; 'songs/' and 'artists/' are appended
    """
    song_data = input_data + 'song_data/*/*/*/*.json'

    # The schema declares no song_id field; the songs table gets a
    # generated id instead (see below).
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    # de-duplicate first, then attach a generated song_id
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    # NOTE(review): the head of this statement was missing from the
    # source; restored from the surviving tail - confirm.
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.parquet(output_data + 'artists/')
Exemplo n.º 29
def process_song_data(spark, input_data, output_data):
    """
    Function that processes the raw data from the S3 bucket

    :spark:       uses the earlier instantiated spark session
    :input_data:  path of the location where the files are residing
    :output_data: path of the location where the files will be saved after processing
    :return:      none
    """
    # Only the A/A/A subtree of the song data is read here.
    song_data = input_data + 'song_data/A/A/A/*.json'

    # The schema declares no song_id field; the songs table gets a
    # generated id instead (see below).
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    # de-duplicate first, then attach a generated song_id
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    # NOTE(review): the head of this statement was missing from the
    # source; the overwrite mode is assumed to match the artists write
    # below - confirm.
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.mode('overwrite').parquet(output_data + 'artists/')
Exemplo n.º 30
def process_song_data(spark, input_data, output_data):
    This function processes the song data of sparkify and creates
    facts/dimensions via spark and saves them to our data lake afterwards
	    spark {SparkSession}: Spark session to launch the program
	    input_data {str}: location (local/s3) where the (root) input song data resides
	    output_data {str}: location (local/s3) where the (root) output files should be written
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),

    # since schema can not infered automatically, we need to specify it beforehand
    df_song = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
                 'duration']) \

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
                     'artist_name as name',
                     'artist_location as location',
                     'artist_latitude as latitude',
                     'artist_longitude as longitude']) \

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"