def schema_song_data():
    """
    Description: Schema design for song datasets.
    """
    try:
        print("schema_song_data function is starting.")
        print("**************************************")
        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])
        print("schema_song_data was successfully created")
        print("***************************************")
        return schema
    except Exception as e:
        print("schema_song_data function failed to create the schema.")
        print(e)
        print("************************************************")
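# Note: the snippets in this section rely on short aliases for the PySpark type
# classes (R, Fld, Str, Dbl, Int, Long, and variants such as ST, St, Sfld, LInt,
# Fl, Dec) plus a few helper functions, but the import lines themselves are not
# shown. A minimal sketch of the module-level imports that would satisfy most of
# the snippets is given below; the exact alias names are assumptions inferred
# from usage, not taken from any one source file.
import os
from time import time

from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql.types import (
    StructType as R,       # some snippets call this ST, St, or StructType directly
    StructField as Fld,    # some snippets call this Sfld or StructField directly
    StringType as Str,
    DoubleType as Dbl,
    IntegerType as Int,
    LongType as Long,
)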
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')

    artists_fields = ["artist_id", "artist_name as name", "artist_location as location",
                      "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data):
    '''
    Description: This function can be used to load the song data from the input
    S3 bucket and write the parquet files to the output S3 bucket.

    Arguments:
        spark: SparkSession
        input_data: location for the input data
        output_data: location for the output data

    Returns:
        None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    df.show(5, truncate=False)
    df.printSchema()

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, "
        "artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3, "artist_id").write.mode('overwrite').parquet(output_data + "artists")
def process_song_data(spark, input_data, output_data): """ Process song_data json files which located in S3 Create table songs_table and artists_table Store the table in parque format in S3 Return the table to be used in process_log_data function Args: spark : Spark Session input_data (string) : location json files (input) output_data (string) : location parque files (output) Returns: songs_data (Spark Dataframe) : Song Data tables """ # get filepath to song data file song_data = input_data + 'song_data/*/*/*/*.json' # set schema song data songSchema = StructType([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("song_id", Str()), Fld("title", Str()), Fld("year", Int()), ]) # read song data file df = spark.read.json(song_data, schema=songSchema) # extract columns to create songs table songs_table = df.select("song_id", "title", "artist_id", "year", "duration") \ .where("song_id is not null") \ .dropDuplicates(['song_id']) # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy("year", "artist_id").parquet( os.path.join(output_data, 'songs'), 'overwrite') # extract columns to create artists table artists_table = df.select(col("artist_id"), col("artist_name").alias("name"), col("artist_location").alias("location"), col("artist_latitude").alias("latitude"), col("artist_longitude").alias("longitude")) \ .where("artist_id is not null") \ .dropDuplicates(['artist_id']) # write artists table to parquet files artists_table.write.parquet(os.path.join(output_data, 'artists'), 'overwrite') # return song_data table to be used in process_log_data return df
def get_log_schema():
    """
    Creates a schema for log data.

    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
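# Example usage of get_log_schema(); a minimal sketch assuming an active spark
# session and an input_data prefix like the ones used elsewhere in this section.
# The log_data path pattern is an assumption, and the NextSong filter mirrors
# the usual handling of these event logs.
log_df = spark.read.json(input_data + "log_data/*/*/*.json", schema=get_log_schema())
log_df = log_df.filter(log_df.page == "NextSong")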
def process_song_data(spark, input_data, output_data): """This function loads song_data from S3 and processes it by extracting the songs and artist tables and then again loaded back to S3 Args: spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession input_data (str): S3 bucket where song files are stored output (str): S3 bucket file path to store resulting files Returns: None """ print("**** Starting to process song data *****") # get filepath to song data file song_data = input_data+'song_data/*/*/*/*.json' # read song data file songSchema = R([ Fld("artist_id",Str()), Fld("artist_latitude",Dbl()), Fld("artist_location",Str()), Fld("artist_longitude",Dbl()), Fld("artist_name",Str()), Fld("song_id",Str()), Fld("duration",Dbl()), Fld("num_songs",Int()), Fld("title",Str()), Fld("year",Int()), ]) try: df = spark.read.json(song_data, schema=songSchema) except Exception as e: print(e) # extract columns to create songs table songs_fields = ["song_id", "title", "artist_id", "year", "duration"] songs_table = df.select(songs_fields).dropDuplicates(["song_id"]) # write songs table to parquet files partitioned by year and artist try: songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite") except Exception as e: print(e) print("**** songs table data load is complete *****") # extract columns to create artists table artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as lattitude", "artist_longitude as longitude"] artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"]) # write artists table to parquet files try: artists_table.write.parquet(output_data + "artists.parquet", mode="overwrite") except Exception as e: print(e) print("**** artists table data load is complete *****") print("**** song data processing is finished *****")
def process_song_data(spark, input_data, output_data): """import Song dataset extract columns and create songs and artist tables write those tables to parquet files Parameters: spark: name of spark session input_data: location of the source data s3 bucket output_data: location of the destination data s3 bucket Returns: writes songs table in parquet to output_data location + songs writes artist_table in parquet to output_dat location + artists """ # Setting up the JSON table structure for the Song dataset song_dataset_schema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Str()), ]) """get filepath to song data file use "song_data/*/*/*/*.json" for full dataset use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record """ song_data = input_data + "song_data/*/*/*/*.json" # read song data file with dataset_schema df = spark.read.json(song_data, schema=song_dataset_schema) # extract columns to create songs table songs_table = df.select('song_id', 'artist_id', 'year', 'duration') # drop duplicate rows in songs table songs_table = songs_table.dropDuplicates() # write songs table to parquet files partitioned by year and artist songs_table.write.mode('append').partitionBy( 'year', 'artist_id').parquet(output_data + "songs") # extract columns to create artists table artists_table = df.select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude') # drop duplicate rows in artists table artists_table = artists_table.dropDuplicates() # write artists table to parquet files artists_table.write.mode('append').parquet(output_data + "artists")
def process_song_data(spark, input_data, output_data):
    '''
    load song data in json format from the S3 bucket and process it by extracting
    the songs table and artists table, then save these tables back to the S3 bucket

    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
        .dropDuplicates()\
        .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = ['artist_id', 'artist_name', 'artist_location',
                    'artist_latitude', 'artist_longitude']
    artists_table = df_songs.select(artists_attr)\
        .dropDuplicates()

    artists_table = artists_table\
        .withColumnRenamed('artist_name', 'name')\
        .withColumnRenamed('artist_location', 'location')\
        .withColumnRenamed('artist_latitude', 'latitude')\
        .withColumnRenamed('artist_longitude', 'longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data): """ This function loads the songs JSON dataset from S3, then uses the data to create the songs and artists tables Input: spark = SparkSession object input_data = Start of path variable for input files output_data = Start of path variable for output files Output: None """ # get filepath to song data file song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') # Define schema SongSchema = R([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Int()) ]) # read song data file df = spark.read.json(song_data, schema=SongSchema) # extract columns to create songs table songs_table = df.select("song_id", "title", "artist_id", "year", "duration").dropduplicates() # write songs table to parquet files partitioned by year and artist output_path = os.path.join(output_data, 'songs_table.parquet') songs_table.write.partitionBy("year", "artist_id").parquet(output_path, mode="overwrite") # extract columns to create artists table artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude").dropduplicates() # write artists table to parquet files output_path = os.path.join(output_data, 'artists_table.parquet') artists_table.write.parquet(output_path, mode="overwrite") #export whole songs data file to parquet output_path = os.path.join(output_data, 'songs_data_table.parquet') df.write.parquet(output_path, mode="overwrite")
def process_song_data(spark, input_data, output_data): """Process song data, transform the data into songs and artists tables and store it in parquet files on S3. Parameters ---------- spark : SparkSession cursor to the sparkify database connection input_data : string input data prepend path output_data : string output data prepend path """ # get filepath to song data file song_data = os.path.join(input_data, "song_data/*/*/*/*.json") song_schema = R([ Fld("num_songs", Int()), Fld("artist_id", Str(), False), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str(), False), Fld("song_id", Str(), False), Fld("title", Str(), False), Fld("duration", Dbl(), False), Fld("year", Int()) ]) # read song data file df = spark.read.json(song_data, song_schema) # extract columns to create songs table songs_table = df.select( ["song_id", "title", "artist_id", "year", "duration"]) # write songs table to parquet files partitioned by year and artist songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \ .parquet(os.path.join(output_data, 'analytics/songs')) # extract columns to create artists table artists_table = df.select([ "artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude" ]) artists_table = artists_table.withColumnRenamed("artist_name", "name") \ .withColumnRenamed("artist_location", "location") \ .withColumnRenamed("artist_latitude", "latitude") \ .withColumnRenamed("artist_longitude", "longitude") # write artists table to parquet files artists_table.write.mode("overwrite") \ .parquet(os.path.join(output_data, 'analytics/artists'))
def process_song_data(spark, input_data, output_data):
    print('%%%%% Starting up the SONG data process')

    # get filepath to song data file
    song_data = 'song_data/A/*/*/*.json'

    # setting up the schema for the data that we're about to pull
    songSchema = ST([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file - uncomment the line below to read the data without the
    # explicit schema; the read that follows applies songSchema
    # df = spark.read.json(input_data + song_data)
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # extract columns to create songs table
    songs_table = raw_song_df.select(raw_song_df.song_id,
                                     raw_song_df.title,
                                     raw_song_df.artist_id,
                                     raw_song_df.year.cast(Int()),
                                     raw_song_df.duration.cast(Dbl()))

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(output_data + 'songs')
    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # extract columns to create artists table
    artists_table = raw_song_df.select(raw_song_df.artist_id,
                                       raw_song_df.artist_latitude.alias('latitude'),
                                       raw_song_df.artist_location.alias('location'),
                                       raw_song_df.artist_longitude.alias('longitude'),
                                       raw_song_df.artist_name.alias('name')) \
        .dropDuplicates(['artist_id', 'name'])

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artist')
    print('%%%%% Artists table has been created and written to the S3 Bucket')

    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
def process_song_data(spark, input_data, output_data): """ The function loads data from song_data dataset and extract columns for songs and artist tables and write the data into parquet files which will be loaded on s3. """ song_schema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Long()), Fld("song_id", Str()), Fld("title", Str()), Fld("year", Long()) ]) # get filepath to song data file song_data = 'song_data/*/*/*/*.json' # read song data file df = spark.read.json(os.path.join(input_data, song_data), schema=song_schema) # extract columns to create songs table songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration').dropDuplicates() songs_table.createOrReplaceTempView('songs') # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy('year', 'artist_id').parquet( os.path.join(output_data, 'songs/songs.parquet'), 'overwrite') # extract columns to create artists table artists_table = df.select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude') \ .withColumnRenamed('artist_name', 'name') \ .withColumnRenamed('artist_location', 'location') \ .withColumnRenamed('artist_latitude', 'latitude') \ .withColumnRenamed('artist_longitude', 'longitude') \ .dropDuplicates() artists_table.createOrReplaceTempView('artists') # write artists table to parquet files artists_table.write.parquet( os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
def process_song_data(spark, input_data, output_data): """ Loads the song_data from AWS S3 (input_data) and extracts the songs and artist tables and then loaded the processed data back to S3 (output_data) :param spark: Spark Session object :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format """ # Get filepath to song data file song_data = input_data + "song_data/*/*/*/*.json" songSchema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("title", Str()), Fld("year", Int()), ]) # Read song data file print("Reading song_data JSON files from S3") df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema, \ columnNameOfCorruptRecord='corrupt_record').dropDuplicates() print("Read completed") # Extract columns to create songs table songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \ .withColumn("song_id", monotonically_increasing_id()) print("Writing Songs table to S3 after processing") # Write songs table to parquet files partitioned by year and artist songs_table.write.parquet(output_data + "songs/", mode="overwrite", partitionBy=["year", "artist_id"]) print("Completed") # Extract columns to create artists table artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \ .dropDuplicates() print("Writing Artists table to S3 after processing") # Write artists table to parquet files artists_table.write.parquet(output_data + "artists/", mode="overwrite") print("Completed")
def process_song_data(spark, input_data, output_data): """ read song data from s3 and then create the songs_table and artists_table. load them back to s3. parameters: spark: spark session input_data: path of song data output_data: path of output table """ # get filepath to song data file # song_data = input_data + "song_data/*/*/*/*.json" song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json" # create song table schema songSchema = R([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Int()), ]) # read song data file df = spark.read.json(song_data, schema=songSchema) # extract columns to create songs table, drop if year and artist_id are missing and year should not equal to 0 song_field = ["title", "duration", "year", "artist_id"] songs_table = df.select(song_field).dropDuplicates().withColumn("song_id",F.monotonically_increasing_id())\ .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull()) # extract columns to create artists table, drop if artist_id and name containing any null values artist_field = [ "artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude" ] artists_table = df.select(artist_field).dropDuplicates().dropna( subset=["artist_id", "artist_name"]) # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs/", mode="overwrite") # write artists table to parquet files artists_table.write.parquet(output_data + "artists/", mode="overwrite")
def process_song_data(spark, input_data, output_data): """ Method to process song data and create tables: songs, artists :param spark: Spark session :param input_data: S3 bucket :param output_data: S3 bucket :return: Data frame of song data """ # get filepath to song data file song_data = input_data + '/song-data/A/A/B/*.json' songs_schema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("song_id", Str()), Fld("title", Str()), Fld("year", Int()) ]) # read song data file print('Reading song data.') df = spark.read.json(song_data, schema=songs_schema) song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration'] # extract columns to create songs table songs_table = df.selectExpr(song_columns).dropDuplicates() # write songs table to parquet files partitioned by year and artist print('Writing songs to parquet.') write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id') artist_columns = [ 'artist_id', 'artist_name as name', 'artist_location as location', 'artist_latitude as latitude', 'artist_longitude as longitude' ] # extract columns to create artists table artists_table = df.selectExpr(artist_columns).dropDuplicates() # write artists table to parquet files print('Writing artists to parquet.') write_parquet(artists_table, output_data, 'artists', None, None) return df
def process_song_data(spark, input_data, output_data): """ Description: This function fetches song_data from S3 into a staging dataframe, then extracts the songs and artist tables, and eventually exports data back to S3 Parameters: spark : object for Spark Session input_data : location of song_data output_data : location of target S3 bucket """ # get filepath to song data file song_data = input_data + 'song_data/*/*/*/*.json' # define schema songdata_schema = R([ Fld("artist_id",Str()), Fld("artist_latitude",Dbl()), Fld("artist_location",Str()), Fld("artist_longitude",Dbl()), Fld("artist_name",Str()), Fld("duration",Dbl()), Fld("num_songs",Int()), Fld("title",Str()), Fld("year",Int()), ]) # read song data file df = spark.read.json(song_data, schema=songdata_schema) # extract columns to create songs table songs_table = df.select(['artist_name', 'artist_id', 'year', 'duration']) songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()).\ select(['song_id', 'artist_name', 'artist_id', 'year', 'duration']) # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/') # extract columns to create artists table selection = ['artist_id', 'artist_name as name', \ 'artist_location as location', 'artist_latitude as latitude', \ 'artist_longitude as longitude'] artists_table = df.selectExpr(selection).dropDuplicates() # write artists table to parquet files artists_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data): """ process_song_data - Loads the song data files from S3, and saves the song information to a parquet file (parititioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file. """ # Get filepath to song data file song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') # song_data = os.path.join(input_data,'song_data/A/A/A/TRAAAAK128F9318786.json') songSchema = R([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Int()) ]) # Read song data file df = spark.read.json(song_data, schema=songSchema) # Extract columns to create songs table songs_table = df.select( ['song_id', 'title', 'artist_id', 'year', 'duration']) # Write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy("year", "artist_id").mode('overwrite').parquet( os.path.join(output_data, 'songs.parquet')) # Extract columns to create artists table, and find the distinct artists artists_table = df.select([ 'artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude' ]).withColumnRenamed('artist_name', 'name').withColumnRenamed( 'artist_location', 'location').withColumnRenamed( 'artist_latitude', 'latitude').withColumnRenamed('artist_longitude', 'longitude').distinct() # Write artists table to parquet files artists_table.write.mode('overwrite').parquet( os.path.join(output_data, 'artists.parquet'))
def process_song_data(spark, input_data, output_data): # get filepath to song data file song_data = input_data + "./data/song_data/*/*/*/*.json" """Creating the song_data file schema that we are going to add to spark""" songSchema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("title", Str()), Fld("year", Int()), ]) # reading song data file json structure df = spark.read.json(song_data, schema=songSchema) """Filtering out only the needed columns for the songs table""" song_fields = ["title", "artist_id", "year", "duration"] print('Creating the songs table and dropping duplicates') songs_table = df.select(song_fields).dropDuplicates().withColumn( "song_id", monotonically_increasing_id()) print( "--- All duplicate songs have been dropped and the songs table created ---" ) print('Printing some rows from the songs_table') songs_table.show(15) print('Saving the songs table to the s3 bucket') songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + "songs") print("--- songs.parquet completed ---") """Filtering out only the needed columns for the artists table""" artists_data = [ 'artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude' ] print("--- Starting to drop duplicate artists....") artists_table = df.selectExpr(artists_data).dropDuplicates() print("All duplicate artists have been dropped......") print('Printing some rows from the artists_table') artists_table.show(15) """writing the artists table to the parquets file""" artists_table.write.parquet(output_data + "artists") print("--- artists.parquet completed ---") print("*** process_song_data completed ***\n\n")
def process_song_data(spark, input_data, output_data): """ Loads song_data from S3, extracting needed columns for 'song_table' and 'artist_table' and writting their parquet format on S3 Parameters: spark : Spark Session input_data : Location of song_data json files with the songs metadata output_data : S3 bucket were tables in parquet format store """ # get filepath to song data file song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') # Making right type for input json structure songSchema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("title", Str()), Fld("year", Int()), ]) # read song data file df = spark.read.json(song_data, schema=songSchema) # extract columns to create songs table songs_table = songs_table = df.selectExpr( "song_id", "title", "artist_id", "year", "duration").orderBy("song_id").drop_duplicates() # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy('year', 'artist_id').parquet( os.path.join(output_data, 'songs')) # extract columns to create artists table artists_table = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude").orderBy( "artist_id").drop_duplicates() # write artists table to parquet files artists_table.write.parquet(os.path.join(output_data, 'artists'))
def create_log_schema():
    """
    Schema structure for log data
    :return: StructType
    """
    log_schema = R([
        Fld('artist', St()),
        Fld('auth', St()),
        Fld('firstName', St()),
        Fld('gender', St()),
        Fld('itemInSession', LInt()),
        Fld('lastName', St()),
        Fld('length', Fl()),
        Fld('level', St()),
        Fld('location', St()),
        Fld('method', St()),
        Fld('page', St()),
        Fld('registration', Dbl()),
        Fld('sessionId', LInt()),
        Fld('song', St()),
        Fld('status', LInt()),
        Fld('ts', LInt()),
        Fld('userAgent', St()),
        Fld('userId', St())
    ])
    return log_schema
def get_log_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
def create_log_data():
    """
    Create schema for log data.
    :return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()),
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))

    dfSongs.printSchema()
    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
        .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy(
        "year", "artist_id").mode('overwrite').parquet(output_data + 'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id != '') \
        .select(col("artist_id"), col("artist_name").alias("name"),
                col("artist_location").alias("location"),
                col("artist_longitude").alias("longitude"),
                col("artist_latitude").alias("latitude"))\
        .dropDuplicates()
    artists_table.show(5)
    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
def process_song_data(spark, input_data, output_data): """ Reads from song files, transforms them into songs and artists data, and writes them in parquet format. params: - spark: spark session object - input_data: input data path - output_data: output data path """ # get filepath to song data file song_data = input_data + "/song_data/*/*/*/*.json" # use schema when read json files song_schema = St([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Int()) ]) # read song data file df = spark.read.json(song_data, schema=song_schema) # extract columns to create songs table songs_table = df.select("song_id", "title", "artist_id", "year", "duration").dropDuplicates() # write songs table to parquet files partitioned by year and artist songs_table.write.parquet(output_data + "songs", mode="overwrite", \ partitionBy=["year", "artist_id"]) # extract columns to create artists table artists_table = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", \ "artist_latitude as latitude", "artist_longitude as longitude") \ .dropDuplicates() # write artists table to parquet files artists_table.write.parquet(output_data + "artists", mode="overwrite")
def get_song_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
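# Example usage of get_song_src_schema(); a minimal sketch assuming an active
# spark session and an input_data prefix. The path pattern mirrors the other
# snippets in this section.
song_df = spark.read.json(input_data + "song_data/*/*/*/*.json",
                          schema=get_song_src_schema())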
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + '*/*/*/*.json'

    # creating schema for our song_data
    songs_model = St([
        Sfld("num_songs", Int()),
        Sfld("artist_id", Str()),
        Sfld("artist_latitude", Dbl()),
        Sfld("artist_longitude", Dbl()),
        Sfld("artist_location", Str()),
        Sfld("artist_name", Str()),
        Sfld("song_id", Str()),
        Sfld("title", Str()),
        Sfld("duration", Dbl()),
        Sfld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songs_model)

    # extract columns to create songs table
    songs_table = df.select(["title", "artist_id", "year", "duration"]).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + 'songs_table/', mode='overwrite')

    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name", "artist_location",
                      "artist_latitude", "artist_longitude"]
    artists_table = df.select(artists_fields).withColumnRenamed(
        'artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
        'artist_latitude', 'latitude').withColumnRenamed(
        'artist_longitude', 'longitude').dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists_table/', mode='overwrite')
def process_song_data(spark, input_data, output_data): """ Extract data from song_data and write songs and artists table Arguments: - spark : SparkSession object - input_data : input data root dir path - output_data : output data root dir path """ # schema for song_data songSchema = R([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_longitude", Dbl()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year",Int()) ]) # get filepath to song data file song_data = input_data + "song_data/*/*/*" # read song data file df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"]) # extract columns to create songs table df.createOrReplaceTempView("song_data") songs_table = spark.sql(""" SELECT song_id, title, artist_id, year, duration FROM song_data """) # write songs table to parquet files partitioned by year and artist songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs_table", 'overwrite') # extract columns to create artists table artists_table = spark.sql(""" SELECT artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude FROM song_data """).dropDuplicates(["artist_id"]) # write artists table to parquet files artists_table.write.parquet(output_data + "artists_table")
def process_song_data(spark, input_data, output_data): """ The funtion process song data using spark on AWS. Input: song_log in .json format, output: Processed data in parquet format loaded back to S3 args: Spark session, input_data, output_data Return: none """ song_data = input_data + 'song_data/*/*/*/*.json' songSchema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("title", Str()), Fld("year", Int()), ]) df = spark.read.json(song_data, schema=songSchema) song_fields = ["title", "artist_id", "year", "duration"] songs_table = df.select(song_fields).dropDuplicates().withColumn( "song_id", monotonically_increasing_id()) songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/') artists_fields = [ "artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude" ] artists_table = df.selectExpr(artists_fields).dropDuplicates() artists_table.write.parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data): """ Description: Function that processes the raw data from the S3 bucket Parameters: :spark: uses the earlier instantiated spark session :input_data: path of the location where the files are residing :output_data: path of the location where the files will be saved after processing :return: none """ song_data = input_data + 'song_data/A/A/A/*.json' songSchema = R([ Fld("artist_id", Str()), Fld("artist_latitude", Dbl()), Fld("artist_location", Str()), Fld("artist_longitude", Dbl()), Fld("artist_name", Str()), Fld("duration", Dbl()), Fld("num_songs", Int()), Fld("title", Str()), Fld("year", Int()), ]) df = spark.read.json(song_data, schema=songSchema) song_fields = ["title", "artist_id", "year", "duration"] songs_table = df.select(song_fields).dropDuplicates().withColumn( "song_id", monotonically_increasing_id()) songs_table.write.mode('overwrite').partitionBy( "year", "artist_id").parquet(output_data + 'songs/') artists_fields = [ "artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude" ] artists_table = df.selectExpr(artists_fields).dropDuplicates() artists_table.write.mode('overwrite').parquet(output_data + 'artists/')
def process_song_data(spark, input_data, output_data): """ This function processes the song data of sparkify and creates facts/dimensions via spark and saves them to our data lake afterwards Arguments: spark {SparkSession}: Spark session to launch the program input_data {str}: location (local/s3) where the (root) input song data resides output_data {str}: location (local/s3) where the (root) output files should be written """ # get filepath to song data file # song_data = f"{input_data}song_data/A/A/A/*.json" song_data = f"{input_data}song_data/*/*/*/*.json" # read song data file songSchema = R([ Fld("num_songs", Int()), Fld("artist_id", Str()), Fld("artist_latitude", Str()), Fld("artist_longitude", Str()), Fld("artist_location", Str()), Fld("artist_name", Str()), Fld("song_id", Str()), Fld("title", Str()), Fld("duration", Dbl()), Fld("year", Int()), ]) # since schema can not infered automatically, we need to specify it beforehand df_song = spark.read.json(song_data, schema=songSchema) df_song.cache() # extract columns to create songs table songs_table = df_song.filter(df_song.song_id != '') \ .select(['song_id', 'title', 'artist_id', 'year', 'duration']) \ .dropDuplicates(['song_id']) # write songs table to parquet files partitioned by year and artist output_song_data = f"{output_data}song_data/" songs_table.write.mode('overwrite').partitionBy( "year", "artist_id").parquet(output_song_data) # extract columns to create artists table artists_table = df_song.filter(df_song.artist_id != '') \ .selectExpr(['artist_id', 'artist_name as name', 'artist_location as location', 'artist_latitude as latitude', 'artist_longitude as longitude']) \ .dropDuplicates(['artist_id']) # write artists table to parquet files output_artist_data = f"{output_data}artist_data/" artists_table.write.mode('overwrite').parquet(output_artist_data)