def parse_dates(df, format):
    """
    Split the ``transaction_date`` column into calendar components.

    ``transaction_date`` is parsed with ``format`` into a ``parsed_date``
    timestamp column; ``year``, ``month``, ``day`` and ``unix_ts`` are derived
    from it, and the raw ``transaction_date`` column is dropped.

    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    parsed = f.col('parsed_date')
    enriched = df.withColumn(
        'parsed_date', f.to_timestamp(f.col('transaction_date'), format))

    # (output column, expression) pairs, applied in this order.
    derived_columns = (
        ("year", f.year(parsed)),
        ("month", f.month(parsed)),
        ("day", f.dayofmonth(parsed)),
        ("unix_ts", f.unix_timestamp('parsed_date')),
    )
    for column_name, expression in derived_columns:
        enriched = enriched.withColumn(column_name, expression)

    return enriched.drop("transaction_date")
def process_log_data(spark, input_data, output_data):
    """
    Build the users, time and songplays tables from the event logs.

    Reads the JSON event logs and the JSON song metadata found below
    ``input_data``, keeps only ``NextSong`` events and writes three parquet
    datasets (``users``, ``time`` and ``songplays``) below ``output_data``.

    :param spark: SparkSession used for all reads
    :param input_data: root location holding ``log_data`` and ``song_data``
    :param output_data: root location the parquet datasets are written to
    """
    # Previously both locations were hard-coded and the parameters ignored.
    input_root = input_data.rstrip("/")
    output_root = output_data.rstrip("/")

    # get filepath to log data file
    log_data = input_root + "/log_data/*/*/*.json"

    # read log data file
    df_log = spark.read.json(log_data)

    # filter by actions for song plays
    df_log = df_log.where(df_log.page == "NextSong")

    # extract columns for users table
    users_table = df_log.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    ).dropDuplicates()

    # write users table to parquet files
    users_table.write.format("parquet").save(output_root + "/users")

    # create timestamp column from original timestamp column
    # (ts is divided by 1000 before conversion, i.e. treated as milliseconds)
    get_timestamp = udf(
        lambda x: datetime.fromtimestamp(x/1000).strftime('%Y-%m-%d %H:%M:%S'))
    df_log = df_log.withColumn("timestamp", get_timestamp(df_log.ts))

    # create datetime column from original timestamp column (date part only)
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x/1000).strftime('%Y-%m-%d'))
    df_log = df_log.withColumn("datetime", get_datetime(df_log.ts))

    # extract columns to create time table
    time_table = df_log.select(
        df_log.timestamp.alias('start_time'),
        # The hour must come from the full timestamp: "datetime" carries no
        # time of day, so hour() on it always returned 0.
        hour(df_log.timestamp).alias('hour'),
        dayofmonth(df_log.datetime).alias('day'),
        weekofyear(df_log.datetime).alias('week'),
        month(df_log.datetime).alias('month'),
        year(df_log.datetime).alias('year'),
        # NOTE(review): the 'u' pattern letter is rejected by newer Spark
        # datetime formatters -- confirm against the Spark version in use.
        date_format(df_log.datetime, 'u').alias('weekday'),
    ).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month") \
        .format("parquet").save(output_root + "/time")

    # read in song data to use for songplays table
    # (the original referenced the undefined names log_song / df_song)
    df_song = spark.read.json(input_root + "/song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays
    # table. A left join keeps every play and no longer adds rows for songs
    # that were never played, which the former full outer join did.
    cond = [df_song.title == df_log.song,
            df_song.artist_name == df_log.artist]
    df = df_log.join(df_song, cond, 'left') \
        .withColumn("songplay_id", monotonically_increasing_id())

    songplays_table = df.select(
        df.songplay_id,
        df.timestamp.alias('start_time'),
        df.userId.alias('user_id'),
        df.level,
        df.song_id,
        df.artist_id,
        df.sessionId.alias('session_Id'),
        df.location,
        df.userAgent.alias('user_agent'),
        year(df.datetime).alias('year'),
        month(df.datetime).alias('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month") \
        .format("parquet").save(output_root + "/songplays")
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data below ``input_data`` into the users, time and
    songplays dataframes and write each one out as parquet.

    Schemas and a five-row sample are printed along the way.

    Arguments :
    spark - session variable
    input_data - root location of the log-data and song-data JSON files
    output_data - prefix under which the output parquet files are stored
    '''
    logdata_path = os.path.join(input_data, "log-data/*/*/*.json")
    df_logdata = spark.read.json(logdata_path)
    df_logdata.printSchema()

    df_logdata_filter = df_logdata.filter(df_logdata.page == 'NextSong')

    # Process user table
    users_df = df_logdata_filter['userId', 'firstName', 'lastName',
                                 'gender', 'level']
    users_df = users_df.dropDuplicates(['userId'])
    users_df.printSchema()
    users_df.show(5, truncate=False)

    users_output = output_data + "users.parquet"
    users_df.write.mode("overwrite").parquet(users_output)

    # Process time table
    # create user defined functions to convert 'ts'
    # (ts is divided by 1000.0 before conversion, i.e. treated as milliseconds)
    @udf(t.TimestampType())
    def get_timestamp(ts):
        return datetime.fromtimestamp(ts/1000.0)

    df_logdata_filter = df_logdata_filter.withColumn(
        'timestamp', get_timestamp("ts"))

    @udf(t.StringType())
    def get_datetime(ts):
        return datetime.fromtimestamp(ts/1000.0).strftime('%Y-%m-%d %H:%M:%S')

    df_logdata_filter = df_logdata_filter.withColumn(
        'datetime', get_datetime("ts"))
    df_logdata_filter.printSchema()
    # The original call was missing the comma (a syntax error).
    df_logdata_filter.show(5, truncate=False)

    # extract columns to create time table
    time_df = df_logdata_filter.select(
        col('datetime').alias('start_time'),
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'),
    )
    time_df = time_df.dropDuplicates(['start_time'])
    time_df.printSchema()
    time_df.show(5, truncate=False)

    time_output = output_data + "time.parquet"
    time_df.write.mode("overwrite").partitionBy(
        "year", "month").parquet(time_output)

    songdata_path = os.path.join(input_data, "song-data/A/A/A/*.json")
    df_songdata = spark.read.json(songdata_path)

    # NOTE(review): plays are matched to songs on title alone; consider also
    # matching the artist to avoid false matches between same-titled songs.
    df_logdata_filter = df_logdata_filter.join(
        df_songdata, df_songdata.title == df_logdata_filter.song)

    songplays_df = df_logdata_filter.select(
        col('ts').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        # was 'ssessionId', a column that does not exist
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        # Derive the play year from the event time; a bare col('year') after
        # the join would resolve to a column of the song data instead.
        year('datetime').alias('year'),
        month('datetime').alias('month'),
    )

    # Attach the surrogate key as a column. The original computed it in a
    # throw-away select/collect and then reduced the frame to a single
    # start_time column, so the partitioned write below could not succeed.
    songplays_df = songplays_df.withColumn(
        'songplay_id', monotonically_increasing_id())

    songplays_output = output_data + "songplays.parquet"
    songplays_df.write.mode("overwrite").partitionBy(
        "year", "month").parquet(songplays_output)
def process_log_data(spark, input_data, output_data):
    """
    Description: Derive the users, time and songplays tables from the event
    log files and store each of them as parquet.

    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """
    # Load the raw events (exact duplicates removed), keep song plays only.
    events = spark.read.json(
        input_data + "log-data/*",
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record',
    ).drop_duplicates()
    events = events.filter(events.page == "NextSong")

    # users table
    user_columns = ["userId", "firstName", "lastName", "gender", "level"]
    users_table = events.select(*user_columns).drop_duplicates()
    users_table.write.parquet(
        os.path.join(output_data, "users/"), mode="overwrite")

    # start_time: the "ts" value divided by 1000, converted with
    # utcfromtimestamp
    to_start_time = udf(
        lambda millis: datetime.utcfromtimestamp(int(millis) / 1000),
        TimestampType())
    events = events.withColumn("start_time", to_start_time("ts"))

    # time table: one derived column per calendar part of start_time
    calendar_parts = [
        ("hour", hour),
        ("day", dayofmonth),
        ("week", weekofyear),
        ("month", month),
        ("year", year),
        ("weekday", dayofweek),
    ]
    time_table = events
    for part_name, extract in calendar_parts:
        time_table = time_table.withColumn(part_name, extract("start_time"))
    part_names = [part_name for part_name, _ in calendar_parts]
    time_table = time_table.select(
        "ts", "start_time", *part_names).drop_duplicates()

    time_table.write.parquet(
        os.path.join(output_data, "time_table/"),
        mode='overwrite',
        partitionBy=["year", "month"])

    # songs table previously written below output_data
    songs_root = os.path.join(output_data, "songs/")
    song_reader = spark.read.format("parquet").option("basePath", songs_root)
    song_df = song_reader.load(os.path.join(output_data, "songs/*/*/"))

    # match plays to songs by title and shape the fact columns
    matched = events.join(song_df, events.song == song_df.title, how='inner')
    plays = matched.select(
        monotonically_increasing_id().alias("songplay_id"),
        col("start_time"),
        col("userId").alias("user_id"),
        "level",
        "song_id",
        "artist_id",
        col("sessionId").alias("session_id"),
        "location",
        col("userAgent").alias("user_agent"))

    # pull year/month in from the time table for partitioning
    plays_with_time = plays.join(
        time_table, plays.start_time == time_table.start_time, how="inner")
    songplays_table = plays_with_time.select(
        "songplay_id", plays.start_time, "user_id", "level", "song_id",
        "artist_id", "session_id", "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"])
def _print_elapsed(started_at):
    """Print the wall-clock seconds elapsed since ``started_at``."""
    print("=== DONE IN: {0:.2f} sec\n".format(time() - started_at))


def process_log_data(spark, input_data, output_data):
    """
    Process Log Data procedure

    Reads the log and song JSON files below ``input_data``, derives the
    users, time and songplays datasets and saves each one as a ``*.parquet``
    dataset under the ``output_data`` prefix. Progress and elapsed times are
    printed for every step.

    :param spark: SparkSession used for reads and SQL
    :param input_data: prefix of the input location (log_data/, song_data/)
    :param output_data: prefix of the output location
    """
    print('======= Read log data json files to dfLog dataframe =======')
    log_data = input_data + "log_data/2018/11"
    t0 = time()
    print('Path: ' + log_data)
    print('dfLog = spark.read.json(log_data)')
    dfLog = spark.read.json(log_data)
    cnt = dfLog.count()
    print('Total count of log data: ' + str(cnt))
    _print_elapsed(t0)

    print('======= Read song data json files to dfSong dataframe =======')
    song_data = input_data + "song_data/*/*/*"
    # Built like song_data above; the former leading "/" produced a doubled
    # slash whenever input_data already ends with one.
    dataSong = input_data + "song_data/A/A/A/"
    print('Path: ' + song_data)
    print("Load schema a song file")
    print(
        "dfGetSampleSchema = spark.read.options(samplingRatio=0.1).json(dataSong).schema"
    )
    t0 = time()
    # Infer the schema once from a small sub-folder, then reuse it for the
    # full read.
    dfGetSampleSchema = spark.read.options(
        samplingRatio=0.1).json(dataSong).schema
    songschema = dfGetSampleSchema
    print('dfSong = spark.read.json(song_data, schema=songschema) ')
    dfSong = spark.read.json(song_data, schema=songschema)
    _print_elapsed(t0)

    # extract columns for users data and drop duplicate userId
    print('======= Users: Extract fields and drop duplicates data =======')
    print('dfLog.select("userId","firstName", "lastName", "gender", "level")')
    print('')
    t0 = time()
    users_table = dfLog.select("userId", "firstName", "lastName", "gender",
                               "level")
    users_table = users_table.dropDuplicates(['userId'])
    _print_elapsed(t0)

    # create users parquet file(s)
    print('======= Users: Create users parquet =======')
    print('users_table.write.mode(overwrite).parquet(users_parguet)')
    t0 = time()
    users_parguet = output_data + "users.parquet"
    users_table.write.mode('overwrite').parquet(users_parguet)
    _print_elapsed(t0)

    # create timestamp/datetime column and extract columns from original
    # timestamp column
    print(
        '======= Time: Create Time table from ts column and drop duplicates data ======='
    )
    print(
        'time_table.withColumn(datetime, from_unixtime((time_table.ts/1000) .........'
    )
    t0 = time()
    # One row per distinct ts. The songplays query below joins Log to Time on
    # this value, so duplicate ts rows would multiply the matching log rows.
    time_table = dfLog.select("ts").dropDuplicates()
    time_table = (
        time_table
        .withColumn('datetime',
                    from_unixtime((time_table.ts/1000),
                                  'yyyy-MM-dd HH:mm:ss.SSSS'))
        .withColumn('hour', hour('datetime'))
        .withColumn('day', dayofmonth('datetime'))
        .withColumn('week', weekofyear('datetime'))
        .withColumn('month', month('datetime'))
        .withColumn('year', year('datetime'))
        .withColumn('weekday', dayofweek('datetime'))
        .withColumnRenamed('ts', 'milliseconds')
        .withColumn('datetime', F.to_timestamp('datetime'))
    )
    _print_elapsed(t0)

    # Create Time parquet and partition by year and month
    print('======= Time: Create time parquet =======')
    print(
        'time_table.write.mode(overwrite).partitionBy("year","month").parquet(time_parquet)'
    )
    t0 = time()
    time_parquet = output_data + "time.parquet"
    time_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(time_parquet)
    _print_elapsed(t0)

    # extract columns from song and log json to create songplays
    print('======= SongPlays: Create SongPlays Time =======')
    print('Join logfiles and songfiles data to create the SongPlays dataset')
    print(
        'Create temp views to be used in sql statment to load songplays dataframe'
    )
    t0 = time()
    # Create temp views to be used in the songplays query needed to create
    # the parquet file
    print('>>> dfLog.createOrReplaceTempView(Log) ....')
    dfLog.createOrReplaceTempView("Log")
    print('>>> dfSong.createOrReplaceTempView(Songs) ....')
    dfSong.createOrReplaceTempView("Songs")
    print('>>> dfTimeTable.createOrReplaceTempView("Time") .....')
    time_table.createOrReplaceTempView("Time")
    print('Load dataframe songplays_table based on sql statement ')
    print('spark.sql(select t.year, t.month, datetime start_time,......)')
    # Use spark sql to create the necessary dataset to load songplays
    # table/parquet
    # NOTE(review): dfLog is never filtered by page, so every log event (not
    # only song plays) reaches this table -- confirm this is intended.
    songplays_table = spark.sql("""
        select t.year,
               t.month,
               datetime start_time,
               userid,
               level,
               s.song_id,
               s.artist_id,
               sessionId,
               location,
               userAgent
        from Log l
        inner join Time t on l.ts = t.milliseconds
        left join Songs s on s.artist_name = l.artist
                         and s.title = l.song
                         and s.duration = l.length
        """)
    _print_elapsed(t0)

    #print("Remove duplicates")
    #print("songplays_table = songplays_table.dropDuplicates(['userid','level','song_id','artist_id','sessionId'])")
    #songplays_table = songplays_table.dropDuplicates(['userid','level','song_id','artist_id','sessionId'])

    print("Add unique index id name called songplays_id")
    print(
        "songplays_table.withColumn('songplays_id',monotonically_increasing_id() +1)"
    )
    songplays_table = songplays_table.withColumn(
        "songplays_id", monotonically_increasing_id() + 1)

    # write songplays table to parquet files partitioned by year and month
    print('======= SongPlays: Create SongPlays parquet =======')
    print(
        'songplays_table.write.mode(overwrite).partitionBy("year","month").parquet(songplays_parquet)'
    )
    t0 = time()
    songplays_parquet = output_data + "songplays.parquet"
    songplays_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(songplays_parquet)
    _print_elapsed(t0)
# Glue job script: read the Xetra CSV files, add a combined Timestamp plus
# Year/Month/Day columns, and write the result to S3 as parquet.

# Set up the Spark/Glue contexts and initialise the job from its arguments.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# NOTE(review): glueContext is rebound to a second GlueContext here, while
# `spark` and `job` above were created from the first one -- confirm this
# re-creation is needed.
glueContext = GlueContext(SparkContext.getOrCreate())

# Read all CSV files two levels below the bucket root using customSchema
# (defined elsewhere). The "${user}" part of the path is a literal in this
# script; presumably it is substituted before the job runs -- TODO confirm.
df = spark.read.format("com.databricks.spark.csv").schema(customSchema).option(
    "quote", '"').option("header", "true").option("delimiter", ',').load('s3://fsi406-xetra-${user}/*/*.csv')

# Merge the Date and Time columns (via mergeCols, defined elsewhere) into one
# Timestamp column, then drop the two source columns.
df1 = df.withColumn("Timestamp", to_timestamp(mergeCols(("Date"), ("Time"))))
df2 = df1.drop("Date", "Time")

# Calendar parts of the Timestamp column.
df3 = df2.withColumn("Year", year("Timestamp")).withColumn(
    "Month", month("Timestamp")).withColumn("Day", dayofmonth("Timestamp"))

dynaframe = DynamicFrame.fromDF(df3, glueContext, "xetra")

# NOTE(review): partitionKeys are lower-case while the columns added above
# are "Year"/"Month"/"Day" -- verify the writer matches them as intended.
glueContext.write_dynamic_frame.from_options(
    frame=dynaframe,
    connection_type="s3",
    connection_options={
        "path": "s3://fsi406-parquet-${user}/",
        "partitionKeys": ["year", "month", "day"],
        "mode": "overwrite"
    },
    format="parquet")
def process_log_data(spark, input_data, output_data):
    """
    Load data from log_data dataset and extract columns
    for users and time tables, reads both the log_data and
    song_data datasets and extracts columns for songplays table
    with the data. It writes the data into parquet files below
    ``output_data``.

    Parameters
    ----------
    spark: session
          This is the spark session that has been created
    input_data: path
          Prefix of the location holding log_data/ and song_data/.
    output_data: path
          This is the path to where the parquet files will be written.
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong') \
        .select('ts', 'userId', 'level', 'song', 'artist',
                'sessionId', 'location', 'userAgent')

    # extract columns for users table
    # NOTE(review): taken from the unfiltered log, so users without any
    # NextSong event are included -- confirm this is intended.
    users_table = df.select('userId', 'firstName', 'lastName',
                            'gender', 'level').dropDuplicates()

    users_table.createOrReplaceTempView('users')

    # write users table to parquet files
    users_table.write.parquet(
        os.path.join(output_data, 'users/users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    # (ts divided by 1000, truncated to an integer, as a string)
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn(
        'timestamp', get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn(
        'datetime', get_datetime(actions_df.ts))

    # extract columns to create time table
    time_table = actions_df.select('datetime') \
        .withColumn('start_time', actions_df.datetime) \
        .withColumn('hour', hour('datetime')) \
        .withColumn('day', dayofmonth('datetime')) \
        .withColumn('week', weekofyear('datetime')) \
        .withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .withColumn('weekday', dayofweek('datetime')) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays
    # table
    actions_df = actions_df.alias('log_df')
    song_df = song_df.alias('song_df')

    # Match on artist AND song title. Joining on the artist alone paired
    # every play with every song of that artist, inflating the fact table
    # with wrong song ids.
    join_condition = (
        (col('log_df.artist') == col('song_df.artist_name'))
        & (col('log_df.song') == col('song_df.title'))
    )
    joined_df = actions_df.join(song_df, join_condition, 'inner')

    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())

    songplays_table.createOrReplaceTempView('songplays')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
# Drop the year, month, day, hour, minute, date and time columns; they are
# re-created from the timestamp column below.
df_nycflights = df_nycflights.drop(
    'year', 'month', 'day', 'hour', 'minute', 'date', 'time')
df_nycflights.show()

# Extract the calendar fields back out of the timestamp column.
df_nycflights = (
    df_nycflights
    .withColumn('year', year(df_nycflights.timestamp))
    .withColumn('month', month(df_nycflights.timestamp))
    .withColumn('day', dayofmonth(df_nycflights.timestamp))
    .withColumn('hour', hour(df_nycflights.timestamp))
    .withColumn('minute', minute(df_nycflights.timestamp))
)
df_nycflights.show()

# A few more operations on the timestamp column: shift by ten days either
# way, and the month difference of the timestamp with itself.
df_nycflights = (
    df_nycflights
    .withColumn('date_sub', date_sub(df_nycflights.timestamp, 10))
    .withColumn('date_add', date_add(df_nycflights.timestamp, 10))
    .withColumn(
        'months_between',
        months_between(df_nycflights.timestamp, df_nycflights.timestamp))
)
df_nycflights.show()
def process_log_data(spark, input_data, output_data):
    """
    Process the log data (users, time table, songplays) from the JSON files
    below ``input_data``; after normalization and transformation the tables
    are written as parquet files below ``output_data``.

    :param spark: SparkSession used for all reads
    :param input_data: prefix of the input location (log-data, song-data)
    :param output_data: prefix the parquet datasets are written under
    """
    # Explicit JSON structure of the log files, provided to Spark up front.
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])

    def to_start_time(ts_column):
        # ts is divided by 1000 before conversion (treated as milliseconds).
        # Cast to whole seconds and then to a timestamp. The former
        # format/parse round trip used "HH:MM:ss", where MM is the month,
        # so the minutes of every timestamp were lost.
        return (ts_column / 1000).cast("long").cast(TimestampType())

    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema=logdata_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table (one row per distinct combination)
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    ).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # Converting ts to a timestamp
    time_table = df.withColumn('ts', to_start_time(df.ts))

    # extract columns to create time table (one row per distinct start_time)
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"),
    ).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays
    # table. Match on artist AND title: joining on the artist alone paired
    # each play with every song of that artist.
    join_condition = (
        (song_df.artist_name == df.artist) & (song_df.title == df.song)
    )
    songplays_table = (
        song_df.join(df, join_condition)
        .withColumn("songplay_id", monotonically_increasing_id())
        .withColumn('start_time', to_start_time(col("ts")))
        .select(
            "songplay_id",
            "start_time",
            col("userId").alias("user_id"),
            "level",
            "song_id",
            "artist_id",
            col("sessionId").alias("session_id"),
            # NOTE(review): this is the artist's location from the song
            # data, not the "location" field of the log event -- confirm.
            col("artist_location").alias("location"),
            "userAgent",
            month(col("start_time")).alias("month"),
            year(col("start_time")).alias("year"),
        )
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "songplays")
def process_log_data(spark, input_data, output_data):
    """
    Function to process User, Time and Song Plays data from the json files
    under the log-data folder and write each table in parquet format below
    ``output_data``.

    :param spark: SparkSession used for all reads
    :param input_data: prefix of the input location (log-data, song-data)
    :param output_data: prefix the parquet datasets are written under
    """
    def to_start_time(ts_column):
        # ts is divided by 1000 before conversion (treated as milliseconds).
        # Cast to whole seconds and then to a timestamp. The former
        # format/parse round trip used "HH:MM:ss", where MM is the month,
        # so the minutes of every timestamp were lost.
        return (ts_column / 1000).cast("long").cast(TimestampType())

    # get filepath to log data file
    log_data = input_data + 'log-data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table (one row per distinct combination)
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    ).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # convert ts to a timestamp
    time_table = df.withColumn('ts', to_start_time(df.ts))

    # extract columns to create time table (one row per distinct start_time)
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"),
    ).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays
    # table. Match on artist AND title: joining on the artist alone paired
    # each play with every song of that artist.
    join_condition = (
        (song_df.artist_name == df.artist) & (song_df.title == df.song)
    )
    songplays_table = (
        song_df.join(df, join_condition)
        .withColumn("songplay_id", monotonically_increasing_id())
        .withColumn('start_time', to_start_time(col("ts")))
        .select(
            "songplay_id",
            "start_time",
            col("userId").alias("user_id"),
            "level",
            "song_id",
            "artist_id",
            col("sessionId").alias("session_id"),
            # NOTE(review): this is the artist's location from the song
            # data, not the "location" field of the log event -- confirm.
            col("artist_location").alias("location"),
            "userAgent",
            month(col("start_time")).alias("month"),
            year(col("start_time")).alias("year"),
        )
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "songplays")
def date_features(df, timestamp_column):
    """
    Add calendar feature columns derived from ``timestamp_column``.

    Columns added: ``date``, ``day_of_week`` (string, '1' = Sunday ...
    '7' = Saturday), ``month``, ``day_of_month``, ``week_of_year``,
    ``weekend`` ('weekend' for day numbers 1, 6 and 7, else 'Weekday'),
    ``day`` (English day name), ``month_end`` ('month_end' for days 25-31
    and 1-5, else 'not_month_end') and ``christmas`` ('christmas' for
    20-25 December, else 'not_christmas').

    :param df: input DataFrame
    :param timestamp_column: name of the column holding the date/timestamp
    :return: ``df`` with the feature columns appended
    """
    import pyspark.sql.functions as F

    source = F.col(timestamp_column)

    # Day number with 1 = Sunday ... 7 = Saturday, which is the numbering the
    # day-name mapping below assumes. The former date_format(..., 'u') counts
    # from Monday = 1, so every day name was shifted by one.
    day_number = F.dayofweek(source)

    df = (
        # 'MM' is the month; the former 'yyyy-mm-dd' parsed minutes instead.
        df.withColumn('date', F.to_date(source, 'yyyy-MM-dd'))
        # Stored as a string, as date_format produced before.
        .withColumn('day_of_week', day_number.cast('string'))
        .withColumn('month', F.month('date'))
        .withColumn('day_of_month', F.dayofmonth('date'))
        .withColumn('week_of_year', F.weekofyear('date'))
    )

    df = df.withColumn(
        'weekend',
        F.when(day_number.isin(1, 6, 7), 'weekend').otherwise('Weekday'))

    # Sunday (1) is the fallback, as in the original chain; a null day
    # number therefore also maps to 'Sunday'.
    day_name = F.when(day_number == 7, 'Saturday')
    for number, name in ((2, 'Monday'), (3, 'Tuesday'), (4, 'Wednesday'),
                         (5, 'Thursday'), (6, 'Friday')):
        day_name = day_name.when(day_number == number, name)
    df = df.withColumn('day', day_name.otherwise('Sunday'))

    day_of_month = F.col('day_of_month')

    # The last week of a month and the first five days of the next one.
    month_end_days = list(range(25, 32)) + list(range(1, 6))
    df = df.withColumn(
        'month_end',
        F.when(day_of_month.isin(*month_end_days),
               'month_end').otherwise('not_month_end'))

    df = df.withColumn(
        'christmas',
        F.when((F.col('month') == 12) & day_of_month.between(20, 25),
               'christmas').otherwise('not_christmas'))

    return df
# MAGIC # MAGIC **`daily_hosts_df`** # MAGIC # MAGIC A DataFrame with two columns: # MAGIC # MAGIC | column | explanation | # MAGIC | ------- | -------------------------------------------------- | # MAGIC | `day` | the day of the month | # MAGIC | `count` | the number of unique requesting hosts for that day | # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import dayofmonth day_to_host_pair_df = logs_df.select(logs_df.host, dayofmonth(logs_df.time).alias('day')) day_group_hosts_df = day_to_host_pair_df.distinct() daily_hosts_df = day_group_hosts_df.groupBy('day').count() daily_hosts_df.cache() print 'Unique hosts per day:' daily_hosts_df.show(30, False) # COMMAND ---------- # TEST Number of unique daily hosts (4c) daily_hosts_list = (daily_hosts_df .map(lambda r: (r[0], r[1])) .take(30)) Test.assertEquals(day_to_host_pair_df.count(), total_log_entries, 'incorrect row count for day_to_host_pair_df')
#print not200DF
not200DF.show(10)

# Sorted DataFrame containing all paths and the number of times they were
# accessed with non-200 return code
logs_sum_df = not200DF.groupBy('path').count().sort('count', ascending=False)

# A single parenthesised argument prints identically under Python 2 and is
# also valid Python 3.
print('Top Ten failed URLs:')
logs_sum_df.show(10, False)

# Find Number of Unique Hosts
unique_host_count = logs_df.select('host').distinct().count()
print('Unique hosts: {0}'.format(unique_host_count))

# Number of Unique Daily Hosts
from pyspark.sql.functions import dayofmonth

# One (host, day-of-month) row per log entry, then de-duplicated so each
# host counts once per day.
day_to_host_pair_df = logs_df.select('host', dayofmonth('time').alias('day'))
#day_to_host_pair_df.show(10,False)
day_group_hosts_df = day_to_host_pair_df.dropDuplicates()
#day_group_hosts_df.show(10,False)
daily_hosts_df = day_group_hosts_df.groupBy('day').count()

print('Unique hosts per day:')
daily_hosts_df.show(30, False)
daily_hosts_df.cache()

# Visualizing the Number of Unique Daily Hosts
# Collect once. The original called collect() on every loop iteration (twice
# in the body and once for the length), re-fetching the whole result each
# time.
_daily_host_rows = daily_hosts_df.collect()
days_with_hosts = [row[0] for row in _daily_host_rows]
hosts = [row[1] for row in _daily_host_rows]
# Populate the remaining columns of file3 from the split column
# (split_col3 is defined earlier in the script), then drop the raw '_c0'.
file3 = file3.withColumn('Age', split_col3.getItem(2))
file3 = file3.withColumn('Occupation', split_col3.getItem(3))
file3 = file3.withColumn('Zip-code', split_col3.getItem(4)).drop('_c0')
#file3.show()

#file4 = file1.join(file2, file1.MovieID == file2.MovieID,"inner") #alternate of it.
# Joining on the column name keeps a single MovieID column in the result.
file4 = file1.join(file2, ['MovieID'], "inner")
# Discard rows containing any null value.
file5 = file4.na.drop()

# Split 'Timestamp' on the space: item 0 is cast to a date, item 1 is kept
# as the 'Time' string.
split_date = f.split(file5['Timestamp'], ' ')
file6 = file5.withColumn('Date', split_date.getItem(0).cast('date')).withColumn(
    'Time', split_date.getItem(1))
# Calendar parts of 'Date'; the original 'Timestamp' column is dropped.
file6 = file6.withColumn('Year', f.year(f.col('Date'))).withColumn(
    'month', f.month(f.col('Date'))).withColumn(
        'day', f.dayofmonth(f.col('Date'))).drop('Timestamp')
#print(file5.count())
#print(file4.count())

#......module1
# NOTE(review): the aggregation below reads file5, not file6 -- the date
# columns added above are not used in this part of the script.
f1 = file5.groupBy(
    'MovieID', 'Rating', 'Title').count()  #how many times which movie get what rating
#for i in f1.take(10):
#    print(i)
#f1.orderBy(f.desc('Rating'),f.desc('count')).show(10,truncate=False)
# Top ten (movie, rating) groups by rating, then by count, as a pandas frame.
f1 = f1.orderBy(f.desc('Rating'), f.desc('count')).limit(10).toPandas()
print('converted into pandas')
#f2=f1.orderBy(f.desc('Rating'),f.desc('count')).toPandas()
plt.rcParams["figure.figsize"] = [30, 25]
f1.plot(x="Title", y='count', kind="bar")
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the log data below ``input_data`` to
    populate the users and time dimension tables as well as the songplays
    fact table (joined with the song data), and appends each of them as
    parquet below ``output_data``.

    Parameters:
        spark: the SparkSession used for all reads.
        input_data: prefix of the location containing log-data and song-data.
        output_data: prefix of the destination where parquet files are saved.

    Returns:
        None
    """
    print('Begin processing log data')

    # get filepath to log data file
    # log_data = input_data + 'log-data/*/*/*.json'
    log_data = input_data + 'log-data/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # NOTE(review): 'location' is selected here rather than 'level' --
    # confirm this is the intended users schema.
    users_table = df.select(
        ["userId", "firstName", 'lastName', 'location',
         'gender']).dropDuplicates()

    print('Save users table')

    # write users table to parquet files
    users_table.write.save(output_data + 'users_table', format='parquet',
                           mode='append')

    # create timestamp column from original timestamp column
    # Native column arithmetic yields a numeric column. The former udf had no
    # declared return type, so the column defaulted to a string and the
    # datetime conversion below then received a str instead of a number.
    df = df.withColumn("timestamp", df.ts / 1000.0)

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("date_sp", get_datetime("timestamp"))

    # extract columns to create time table
    # (start_time is the raw "ts" column, renamed)
    time_table = df.withColumn("hour", hour(df.date_sp)) \
        .withColumn("year", year(df.date_sp)) \
        .withColumn("day", dayofmonth(df.date_sp)) \
        .withColumn("week", weekofyear(df.date_sp)) \
        .withColumn("month", month(df.date_sp)) \
        .withColumn("weekday", dayofweek(df.date_sp)) \
        .withColumnRenamed('ts', 'start_time') \
        .select(['start_time', 'hour', 'day', 'week', 'month', 'year',
                 'weekday']) \
        .dropDuplicates()

    print('Save time table')

    # write time table to parquet files partitioned by year and month
    time_table.repartition("year", "month").write.mode("append").partitionBy(
        "year", "month").parquet(output_data + 'time_table')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song-data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays
    # table; a play matches a song on title, duration and artist name
    songplays_table = df.join(
        song_df,
        [df.song == song_df.title,
         df.length == song_df.duration,
         df.artist == song_df.artist_name]) \
        .select(df.ts, df.userId, df.level, song_df.song_id,
                song_df.artist_id, df.sessionId, df.location, df.userAgent,
                df.date_sp) \
        .withColumn("year", year(df.date_sp)) \
        .withColumn("month", month(df.date_sp))

    print('Save songplays table - Fact Table')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.repartition("year", "month").write.mode(
        "append").partitionBy("year", "month").parquet(
            output_data + 'songplays_table')

    print('Completed.')
def process_log_data(spark, input_data, output_data):
    '''
    Description: Processes the log data stored in JSON files via Spark and
    stores the users, time and songplays tables as parquet files.

    Parameters:
        spark (handle): handle to Spark Session
        input_data (string): path to input directory on S3
        output_data (string): path to output directory on S3; the
            previously written ``songs_table`` is read back from here

    Returns:
        None
    '''
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file and keep only song-play events
    df = spark.read.json(log_data)
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').distinct()

    # write users table to parquet files
    users_table.write.parquet(f'{output_data}/users_table', mode='overwrite')

    # create timestamp column from original timestamp column (ts is in ms)
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from the timestamp column. to_date is a Spark
    # column function and cannot be called on a raw value inside a Python
    # UDF, so it is applied to the column directly.
    df = df.withColumn('datetime', to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.select("ts", "timestamp").drop_duplicates() \
        .withColumn("hour", hour(col('timestamp'))) \
        .withColumn("day", dayofmonth(col('timestamp'))) \
        .withColumn("week", weekofyear(col('timestamp'))) \
        .withColumn("month", month(col('timestamp'))) \
        .withColumn("year", year(col('timestamp'))) \
        .withColumn("weekday", date_format(col('timestamp'), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(f'{output_data}/time_table', mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read.parquet(f'{output_data}/songs_table')

    # match log events to songs by title and keep the songplay columns
    plays = df.join(song_df, df.song == song_df.title, how='inner') \
        .select(monotonically_increasing_id().alias("songplay_id"),
                "ts",
                col("userId").alias("user_id"),
                "level",
                "song_id",
                "artist_id",
                col("sessionId").alias("session_id"),
                "location",
                col("userAgent").alias("user_agent"))

    # Pick up year/month from the time table. Both frames derive from the
    # same log frame, so the join is made on the column name instead of
    # comparing two references to the same `ts` column.
    songplays_table = plays.join(time_table, on="ts", how="inner") \
        .select("songplay_id", "user_id", "level", "song_id", "artist_id",
                "session_id", "location", "user_agent", "year", "month") \
        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(f'{output_data}/songplays_table',
                                  mode='overwrite',
                                  partitionBy=["year", "month"])
taxi_year = udf(taxi_year) ## Yellow Taxi Data 2011 ## read data and register as sql df taxi_2011 = spark.read.format("csv").options(header="true",\ inferschema="true").load(sys.argv[1]) taxi_2011.createOrReplaceTempView("taxi_2011") ## breakdown timestamp taxi_2011 = taxi_2011.withColumn("tripyear", year(taxi_2011.Trip_Pickup_DateTime)) taxi_2011 = taxi_2011.withColumn("tripmonth", month(taxi_2011.Trip_Pickup_DateTime)) taxi_2011 = taxi_2011.withColumn("tripday", dayofmonth(taxi_2011.Trip_Pickup_DateTime)) taxi_2011.createOrReplaceTempView("taxi_2011") ## group by day taxi_daygroups2011 = spark.sql("SELECT * FROM taxi_2011").\ groupby("tripyear","tripmonth","tripday").count() taxi_daygroups2011.createOrReplaceTempView("taxi_daygroups2011") ## sort by day taxi_daygroups2011 = spark.sql("SELECT * FROM taxi_daygroups2011").\ orderBy("tripyear","tripmonth","tripday") taxi_daygroups2011.createOrReplaceTempView("taxi_daygroups2011") ## Yellow Taxi Data 2012 ## read data and register as sql df taxi_2012 = spark.read.format("csv").options(header="true",\
def process_log_data(spark, input_data, output_data):
    """Read source log files and write users, time and songplays parquet files.

    Args:
        spark: Spark session used to read and write the data
        input_data: source location (S3 bucket) holding log_data and song_data
        output_data: destination location (S3 bucket) for the parquet output

    Output Files:
        <output_data>/users/users.parquet
        <output_data>/time/time.parquet          (partitioned by year, month)
        <output_data>/songplays/songplays.parquet (partitioned by year, month)
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays and also create songplay_id incremental key
    df = df.filter(df['page'] == "NextSong") \
        .withColumn('songplay_id', monotonically_increasing_id())

    # extract columns for users table
    users_table = df.select('userid',
                            'firstName',
                            'lastName',
                            'gender',
                            'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users/", "users.parquet"))

    # ts is in milliseconds; truncate to whole seconds with a native column
    # expression (no Python UDF needed) and render it as a timestamp string
    df = df.withColumn('start_time',
                       from_unixtime((df['ts'] / 1000).cast('long')))

    # extract columns to create time table
    time_table = df.select('start_time') \
        .withColumn('hour', hour(df['start_time'])) \
        .withColumn('day', dayofmonth(df['start_time'])) \
        .withColumn('week', weekofyear(df['start_time'])) \
        .withColumn('month', month(df['start_time'])) \
        .withColumn('year', year(df['start_time'])) \
        .withColumn('weekday', dayofweek(df['start_time'])) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month") \
        .parquet(os.path.join(output_data, "time/", "time.parquet"))

    # read in song data to use for songplays table
    # NOTE(review): only the song_data/A/A/A subset is read -- looks like a
    # development shortcut; confirm before running on the full dataset.
    song_df = spark.read.json(os.path.join(input_data, "song_data/A/A/A"))

    # extract columns from joined song and log datasets to create songplays
    # table; year and month are derived from start_time for partitioning
    songplays_table = df.join(song_df,
                              (df.song == song_df.title)
                              & (df.artist == song_df.artist_name),
                              'left_outer') \
        .select(df.songplay_id,
                df.start_time,
                df.userId,
                df.level,
                song_df.song_id,
                song_df.artist_id,
                df.sessionId,
                df.location,
                df.userAgent,
                year(df.start_time).alias('year'),
                month(df.start_time).alias('month')) \
        .dropDuplicates()
    songplays_table.show()

    # Write songplays partitioned by year and month. Partitioning by
    # start_time would create one directory per distinct second.
    songplays_table.write.partitionBy("year", "month") \
        .parquet(os.path.join(output_data, "songplays/", "songplays.parquet"))
def process_log_data(spark, input_data, output_data):
    """
    Loads log data from S3, transforms it into the users, time and
    songplays tables, and writes them to the sparkify S3 location.

    Arguments:
        spark {object}: spark session
        input_data {string}: S3 path prefix containing log_data and song_data
        output_data {string}: S3 path prefix where the tables are written
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table (rows with an empty userId are skipped)
    users_table = df.filter(df.userId != '').selectExpr(
        "userId as user_id",
        "firstName as first_name",
        "lastName as last_name",
        "gender",
        "level").dropDuplicates()

    # write users table to parquet files
    users_table_path = output_data + "users_table.parquet"
    users_table.write.mode("overwrite").parquet(users_table_path)

    # create timestamp column from original timestamp column (ts is in ms)
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(
        "start_time",
        hour("start_time").alias("hour"),
        dayofmonth("datetime").alias("day"),
        weekofyear("datetime").alias("week"),
        month("datetime").alias("month"),
        year("datetime").alias("year"),
        dayofweek("datetime").alias("weekday")).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table_path = output_data + "time_table.parquet"
    time_table.write.partitionBy("year", "month").mode("overwrite") \
        .parquet(time_table_path)

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(
        song_df,
        (df.song == song_df.title)
        & (df.length == song_df.duration)
        & (df.artist == song_df.artist_name),
        how='left').dropDuplicates()

    # monotonically_increasing_id gives unique but sparse ids; row_number
    # over that ordering turns them into a dense 1..N songplay_id
    songplays_table = songplays_table.withColumn(
        "id", monotonically_increasing_id())
    windowSpec = Window.orderBy("id")
    # DataFrames are immutable: the result must be assigned, otherwise the
    # songplay_id column selected below does not exist
    songplays_table = songplays_table.withColumn(
        "songplay_id", row_number().over(windowSpec))

    songplays_table = songplays_table.selectExpr(
        "songplay_id",
        "start_time",
        "userId as user_id",
        "level",
        "song_id",
        "artist_id",
        "sessionId as session_id",
        "location",
        "userAgent as user_agent",
        "year(start_time) as year",
        "month(start_time) as month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table_path = output_data + "songplays_table.parquet"
    songplays_table.write.partitionBy("year", "month").mode("overwrite") \
        .parquet(songplays_table_path)
def process_log_data(spark, input_data, output_data, songs_data):
    """
    Process the log_data JSON files and create the users, time and
    songplays tables, stored in parquet format.

    Args:
        spark : Spark Session
        input_data (string) : location of the JSON files (input)
        output_data (string) : location of the parquet files (output)
        songs_data (Spark Dataframe) : song data, joined with the log
            events to build the songplays table

    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # explicit schema for the log data
    logSchema = StructType([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str()),
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logSchema)

    # filter by actions for song plays
    df = df.filter("page == 'NextSong'")

    # create temporary View for Log
    df.createOrReplaceTempView("logView")

    # users table: keep, per user, the row of the latest event so that the
    # stored `level` is the most recent one
    users_table = spark.sql("""
        WITH latestChange AS (
            SELECT userId AS userIdLatest,
                   MAX(ts) AS maxTs
            FROM logView
            GROUP BY userId
        )
        SELECT userId AS user_id,
               ts AS tsTemp,
               firstName AS first_name,
               lastName AS last_name,
               gender,
               level
        FROM logView AS t1
        JOIN latestChange AS t2
          ON t1.userId = t2.userIdLatest AND t1.ts = t2.maxTs
        WHERE userId IS NOT NULL
    """).dropDuplicates(['user_id']).drop("tsTemp")

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: getDateTime(int(x)), TST())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: getDateTime(int(x)), Dat())
    df = df.withColumn("date_time", get_datetime(df.ts))

    # Extract columns to create time table. The weekday uses pattern "E"
    # (abbreviated day name): "W" is week-of-month, not a weekday, and is
    # rejected by Spark 3 as a week-based pattern letter.
    time_table = df.select(
        col("start_time"),
        hour(df.start_time).alias("hour"),
        dayofmonth(df.date_time).alias("day"),
        weekofyear(df.date_time).alias("week"),
        month(df.date_time).alias("month"),
        year(df.date_time).alias("year"),
        date_format(df.date_time, "E").alias("weekday")
    ) \
        .where("start_time is not null") \
        .dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    # re-register the log view (it now carries start_time/date_time) and
    # register the song view
    df.createOrReplaceTempView("logView")
    songs_data.createOrReplaceTempView("songView")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT start_time,
               year(date_time) AS year,
               month(date_time) AS month,
               userId AS user_id,
               level,
               song_id,
               artist_id,
               sessionId AS session_id,
               location,
               userAgent AS user_agent
        FROM logView AS t1
        JOIN songView AS t2
          ON (t1.artist = t2.artist_name)
         AND (t1.song = t2.title)
         AND (t1.length = t2.duration)
    """)
    songplays_table.show(10)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
# Replace nulls with 0 and drop to the RDD API for row-wise conversion
data = data.fillna(0)
data = data.rdd

parse_time = udf(lambda time: dt.strptime(time, '%m/%d/%Y %I:%M:%S %p'))

# Convert each raw row into typed values: ints for ids/codes/coordinates,
# parsed datetimes for the two date fields, floats for latitude/longitude
crime_data = data.map(lambda d: (
    int(d[0]), d[1], parse(d[2]), d[3], d[4], d[5], d[6], d[7], d[8], d[9],
    int(d[10]), int(d[11]), int(d[12]), int(d[13]), d[14], int(d[15]),
    int(d[16]), int(d[17]), parse(d[18]), float(d[19]), float(d[20]), d[21]))

crime_df = sqlContext.createDataFrame(crime_data, [
    "ID", "Case Number", "Date", "Block", "IUCR", "Primary Type",
    "Description", "Location Description", "Arrest", "Domestic", "Beat",
    "District", "Ward", "Community Area", "FBI Code", "X Coordinate",
    "Y Coordinate", "Year", "Updated On", "Latitude", "Longitude",
    "Location"])

# Reduced level dataset for analysis: date parts plus the descriptive columns
crime_detail = crime_df.select(
    year(crime_df.Date).alias("Year"),
    month(crime_df.Date).alias("Month"),
    dayofmonth(crime_df.Date).alias("DoM"),
    date_format(crime_df.Date, 'EEEE').alias("DoW"),
    hour(crime_df.Date).alias("Hour"),
    crime_df.Block,
    crime_df["Primary Type"].alias("CrimeType"),
    crime_df.Description,
    crime_df["Location Description"].alias("LocDesc"),
    crime_df.Arrest,
    crime_df.Domestic,
    crime_df.Beat,
    crime_df.District,
    crime_df.Ward,
    crime_df["Community Area"].alias("CommunityArea"),
    crime_df.Latitude,
    crime_df.Longitude,
    crime_df.Location)
crime_detail.registerTempTable("CrimeDetails")

# Top level analysis for the chicago crimes dataset.
# print(...) with a single pre-formatted argument behaves the same on
# Python 2 and Python 3.
# crime_detail.printSchema()
print("Total Records: %d" % (crime_detail.count()))
print("Distinct Year: %d" % (crime_detail.select('Year').distinct().count()))
print("Distinct Hours: %d" % (crime_detail.select('Hour').distinct().count()))
print("Distinct Type of crimes: %d" % (crime_detail.select('CrimeType').distinct().count()))
print("Distinct Desc: %d" % (crime_detail.select('Description').distinct().count()))
print("Distinct Blocks: %d" % (crime_detail.select('Block').distinct().count()))
print("Distinct Loc Desc: %d" % (crime_detail.select('LocDesc').distinct().count()))
SumLoad1 peakLoad1, SumLoad2 peakLoad2, SumLoad3 peakLoad3, SumLoad4 peakLoad4, SumLoad5 peakLoad5, SumLoadSecure peakLoadSecure FROM aggregatemaxdf """ featureeddf = spark.sql(sqlStatement) # Extract some time features from "SessionStartHourTime" column from pyspark.sql.functions import year, month, dayofmonth, hour featureeddf = featureeddf.withColumn('year', year(featureeddf['SessionStartHourTime'])) featureeddf = featureeddf.withColumn( 'month', month(featureeddf['SessionStartHourTime'])) featureeddf = featureeddf.withColumn( 'dayofmonth', dayofmonth(featureeddf['SessionStartHourTime'])) featureeddf = featureeddf.withColumn('hourofday', hour(featureeddf['SessionStartHourTime'])) featureeddf.write.mode('overwrite').partitionBy("dayofmonth").parquet( HourlyDFFile) # add day feature day = 3600 * 24 day_window = F.from_unixtime( F.unix_timestamp('SessionStartHourTime') - F.unix_timestamp('SessionStartHourTime') % day) featureeddf = featureeddf.withColumn('SessionStartDay', day_window) # aggregate daily featureeddf.createOrReplaceTempView("featureeddf") sqlStatement = """
def process_log_data(spark, input_data, output_data):
    """
    Read the log files into a Spark dataframe and write the users, time
    and songplays tables as parquet files.

    input: spark session, input path holding log_data/song_data, and the
        output path
    output:
        1. parquet files for users table
        2. parquet files for time table partitioned by 'year' and 'month'
        3. parquet files for songplays table partitioned by 'year' and 'month'
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df_log = spark.read.json(log_data)

    # filter by actions for song plays
    df_log = df_log[df_log['page'] == 'NextSong']

    # extract columns for users table; the log holds one row per event, so
    # de-duplicate to avoid writing the same user once per song play
    users_table = df_log.selectExpr("userId as user_id",
                                    "firstName as first_name",
                                    "lastName as last_name",
                                    'gender',
                                    'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # converting the time variable ts (milliseconds) into a timestamp
    df_log = df_log.withColumn(
        "timestamp", (col("ts").cast('bigint') / 1000).cast("timestamp"))

    # extract columns to create time table (one row per distinct timestamp)
    time_table = df_log.select(
        'timestamp',
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format('timestamp', 'EEEE').alias('day_of_week')).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_df'), 'overwrite')

    # read in song data to use for songplays table
    # NOTE(review): only the song_data/A/A/A subset is read -- looks like a
    # development shortcut; confirm before running on the full dataset.
    song_data = os.path.join(input_data, "song_data/A/A/A/*.json")
    df_song = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log.join(
        df_song,
        (df_log.length == df_song.duration)
        & (df_log.artist == df_song.artist_name)
        & (df_log.song == df_song.title),
        'left') \
        .select(col('userId').alias('user_Id'),
                df_log.location,
                col('userAgent').alias('user_agent'),
                col('sessionId').alias('session_id'),
                df_song.artist_id,
                df_song.song_id,
                df_log.level,
                df_log.timestamp,
                year('timestamp').alias('year'),
                month('timestamp').alias('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
def process_log_data(spark, input_data, output_data):
    """
    Read in log data from S3 to populate the users and time dimension tables
    and the songplays fact table. Write to S3 parquet files representing the
    tables.

    :param spark: SparkSession object
    :param input_data: S3 path containing the input log data to process
    :param output_data: S3 path to write parquet files; the previously
        written ``songs`` and ``artists`` tables are read back from here
    :return: None
    """
    # dayofweek is imported locally because the rest of the file is not
    # visible from this function
    from pyspark.sql.functions import dayofweek

    # logs staging dataframe
    log_data = input_data + "log_data/*/*/*.json"

    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", IntegerType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", IntegerType()),
        StructField("song", StringType()),
        StructField("status", IntegerType()),
        StructField("ts", LongType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType()),
    ])

    df = spark.read.json(log_data, schema=log_schema)
    df = df.where("page = 'NextSong'")

    # users dimension table
    # user_id, first_name, last_name, gender, level
    # keep, per user, only the row(s) of the latest event so that `level`
    # reflects the most recent value
    users_table = df.withColumn(
        'max_ts', max('ts').over(Window.partitionBy('userId'))
    ).where(col('ts') == col('max_ts')).drop('max_ts')
    users_table = users_table.selectExpr("userId as user_id",
                                         "firstName as first_name",
                                         "lastName as last_name",
                                         "gender",
                                         "level")
    users_table.write.parquet(output_data + "users")

    # time dimension table
    # start_time, hour, day, week, month, year, weekday
    get_timestamp = udf(lambda x: round(x / 1000), LongType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    get_datetime = udf(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("datetime", get_datetime(df.ts))

    time_table = df.select("timestamp", "datetime").distinct()
    time_table = time_table.withColumn("hour", hour("datetime"))
    time_table = time_table.withColumn("day", dayofmonth("datetime"))
    time_table = time_table.withColumn("week", weekofyear("datetime"))
    time_table = time_table.withColumn("month", month("datetime"))
    time_table = time_table.withColumn("year", year("datetime"))
    # ISO day number (1 = Monday ... 7 = Sunday), the same numbering the
    # "u" date pattern produced. That pattern is rejected by Spark 3, so
    # the value is derived from dayofweek (1 = Sunday ... 7 = Saturday).
    time_table = time_table.withColumn(
        "weekday",
        (((dayofweek("datetime") + 5) % 7) + 1).cast(IntegerType()))
    time_table = time_table.drop("datetime")
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # songplays fact table
    # songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
    songs_df = spark.read.parquet(output_data + "songs")
    artists_df = spark.read.parquet(output_data + "artists")

    songplays_df = df.join(songs_df, df.song == songs_df.title, how="left")
    # drop the columns that would clash with the artists table joined next
    # NOTE(review): this also drops the log's own `location`, so the
    # `location` selected below presumably comes from the artists table --
    # confirm that this is intended.
    songplays_df = songplays_df.drop("artist_id", "year", "location")
    songplays_df = songplays_df.join(
        artists_df, songplays_df.artist == artists_df.name, how="left")
    songplays_df = songplays_df.withColumn(
        "songplay_id", monotonically_increasing_id())
    songplays_df = songplays_df.withColumn("year", year("datetime"))
    songplays_df = songplays_df.withColumn("month", month("datetime"))

    songplays_table = songplays_df.selectExpr(
        "songplay_id", "timestamp AS start_time", "userId AS user_id",
        "level", "song_id", "artist_id", "sessionId AS session_id",
        "location", "userAgent AS user_agent", "year", "month")
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "songplays")
def process_log_data(spark, input_data, output_data):
    """
    Process the event logs found under ``input_data + 'log_data'``.

    The users and time tables are derived from the ``NextSong`` events and
    saved as parquet. The events are then joined with the song data to
    build the songplays table, which is saved as parquet as well.
    """
    # load the raw events and keep only the song plays
    events = spark.read.json(input_data + 'log_data')
    events = events.filter('page=="NextSong"')

    # users table: one row per distinct user record
    user_columns = ['userid', 'firstname', 'lastname', 'gender', 'level']
    users_table = events.select(*user_columns).distinct()
    users_table.write.mode('overwrite').parquet(output_data + 'users.parquet')

    # millisecond epoch -> timestamp
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        TimestampType())
    events = events.withColumn('timestamp', get_timestamp('ts'))

    # time table: distinct start times plus their calendar parts, added in
    # the order hour, day, week, month, year, weekday
    time_table = events.select('timestamp') \
        .withColumnRenamed('timestamp', 'start_time') \
        .distinct()
    calendar_parts = [
        ('hour', hour),
        ('day', dayofmonth),
        ('week', weekofyear),
        ('month', month),
        ('year', year),
        ('weekday', dayofweek),
    ]
    for part_name, part_fn in calendar_parts:
        time_table = time_table.withColumn(part_name,
                                           part_fn(col('start_time')))

    # time table is written partitioned by year and month
    time_writer = time_table.write.mode('overwrite').partitionBy('year', 'month')
    time_writer.parquet(output_data + 'time.parquet')

    # song data is needed to resolve song_id / artist_id for each play
    song_df = spark.read.json(input_data + 'song_data/*/*/*')

    # a play matches a song when title, duration and artist name all agree
    same_song = (
        (events.song == song_df.title)
        & (events.length == song_df.duration)
        & (events.artist == song_df.artist_name)
    )
    matched = events.join(song_df, same_song, 'inner')
    matched = matched.withColumn('songplays_id', monotonically_increasing_id())
    matched = matched.withColumn('start_time', get_timestamp('ts'))

    songplays_table = matched.select(
        'songplays_id',
        'start_time',
        col('userid').alias('user_id'),
        'level',
        'song_id',
        'artist_id',
        col('sessionid').alias('session_id'),
        'location',
        col('useragent').alias('user_agent'),
    )
    songplays_table = songplays_table.withColumn('month',
                                                 month(col('start_time')))
    songplays_table = songplays_table.withColumn('year',
                                                 year(col('start_time')))

    # songplays table is written partitioned by year and month
    plays_writer = songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month')
    plays_writer.parquet(output_data + 'songplays.parquet')
def process_log_data(spark, input_data, output_data):
    """
    Build the users, time and songplays tables from the log data.

    Reads the JSON logs under ``<input_data>/log_data`` and the song files
    under ``<input_data>/song_data``, then writes the three tables as
    parquet below ``output_data`` using the module-level write ``mode``.

    :param spark: active SparkSession
    :param input_data: root path of the input data
    :param output_data: root path for the parquet output
    """
    # input and output locations
    log_data = os.path.join(input_data, "log_data/*.json")
    song_data = os.path.join(input_data, "song_data", "*", "*", "*", "*.json")
    users_table_path = os.path.join(output_data, "users-table")
    time_table_path = os.path.join(output_data, "time-table")
    songsplay_table_path = os.path.join(output_data, "songsplay-table")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(F.col('page') == 'NextSong')

    # extract columns for users table; the log holds one row per event, so
    # de-duplicate to avoid writing the same user once per song play
    users_table = df.select(F.col("userId").alias("user_id"),
                            F.col("firstName").alias("first_name"),
                            F.col("lastName").alias("last_name"),
                            F.col("gender"),
                            F.col("level")).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode(mode).parquet(users_table_path)
    print("users_table saved")

    # create timestamp column from original timestamp column (ts is in ms)
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x / 1000),
                          TimestampType())
    df = df.withColumn("ts_timestamp", get_timestamp(F.col("ts")))

    # extract columns to create time table
    time_table = df.select(F.col("ts_timestamp").alias("start_time")).distinct()
    time_table = time_table.select(
        F.col("start_time"),
        F.hour(F.col("start_time")).alias("hour"),
        F.dayofmonth(F.col("start_time")).alias("day"),
        F.weekofyear(F.col("start_time")).alias("week"),
        F.month(F.col("start_time")).alias("month"),
        F.year(F.col("start_time")).alias("year"),
        F.dayofweek(F.col("start_time")).alias("weekday"))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode(mode).parquet(time_table_path)
    print("time_table saved")

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    joined = df.join(song_df,
                     (df.artist == song_df.artist_name)
                     & (df.length == song_df.duration)
                     & (df.song == song_df.title),
                     how="inner")

    # De-duplicate BEFORE assigning songplay_id: once every row carries a
    # unique id, no two rows can compare equal any more.
    plays = joined.select(
        F.col("ts_timestamp").alias("start_time"),
        F.col("userId").alias("user_id"),
        F.col("level"),
        F.col("song_id"),
        F.col("artist_id"),
        F.col("sessionID").alias("session_id"),
        F.col("location"),
        F.col("userAgent").alias("user_agent"),
        F.month(F.col("ts_timestamp")).alias("month"),
        F.year(F.col("ts_timestamp")).alias("year")).dropDuplicates()
    songplays_table = plays.select(
        F.monotonically_increasing_id().alias("songplay_id"), *plays.columns)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode(mode).parquet(songsplay_table_path)
    print("songplays_table saved")
def process_log_data(spark, input_data, output_data):
    '''
    Read log data and write users, time and songplays tables.

    Log data is read from any json files found under `input_data`/log_data.
    Data can be read and written from/to local files or S3 buckets
    ('s3a://'). Output data is written as parquet files. The songs and
    artists tables are read back from `output_data`.

    Exits the process with a non-zero status when no log files are found.
    '''
    # get filepath to log data file
    if input_data.startswith('s3a://'):
        # we are reading data from S3
        searched = "{} (prefix 'log_data/')".format(input_data)
        log_data = list_matching_in_bucket(input_data, 'log_data/')
    else:
        # we are reading local files
        import glob
        searched = "{}/log_data/*/*/*.json".format(input_data)
        log_data = glob.glob(searched)

    if not log_data:
        # `searched` is set on both branches so this message cannot fail
        # with a NameError when the S3 listing comes back empty
        print("[ERROR] could not find any log data files:'{}'".format(
            searched))
        # a non-zero status signals the failure to the caller
        raise SystemExit(1)

    # read log data file
    df = spark.read.json(log_data)
    print("[INFO] read {} events".format(df.count()))
    print("[INFO] detected schema:")
    df.printSchema()
    df.show(5)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')
    print("[INFO] selected {} 'NextSong' events".format(df.count()))

    # extract columns for users table
    # Note: first sort and then selectively drop duplicates to only store
    # the most recent user `level`.
    # NOTE(review): dropDuplicates is not documented to keep the first row
    # of a sorted frame -- confirm that the latest `level` really survives.
    users_table = df.sort('ts', ascending=False).select(
        df.userId.alias('user_id'),
        df.firstName.alias('first_name'),
        df.lastName.alias('last_name'),
        df.gender,
        df.level).dropDuplicates(
            ['user_id', 'first_name', 'last_name', 'gender'])
    print("[INFO] saving information for {} users".format(users_table.count()))
    print("[INFO] users_table schema:")
    users_table.printSchema()

    # write users table to parquet files
    users_table.write.parquet("{}/users".format(output_data), mode='overwrite')

    # create datetime column from original timestamp column (ts is in ms)
    import pyspark.sql.types as pstypes
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       pstypes.TimestampType())
    df = df.withColumn('datetime', get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select(
        df.datetime.alias('start_time'),
        hour(df.datetime).alias('hour'),
        dayofmonth(df.datetime).alias('day'),
        weekofyear(df.datetime).alias('week'),
        month(df.datetime).alias('month'),
        year(df.datetime).alias('year'),
        date_format(df.datetime, 'E').alias('weekday')).dropDuplicates()
    print("[INFO] saving information for {} timestamps".format(
        time_table.count()))
    print("[INFO] time_table schema:")
    time_table.printSchema()
    time_table.show(5)

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet("{}/times".format(output_data),
                             partitionBy=['year', 'month'],
                             mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet("{}/songs".format(output_data))
    artist_df = spark.read.parquet("{}/artists".format(output_data))

    # extract columns from joined song and log datasets to create songplays
    # table: match the artist by name first, then the song by artist id,
    # title and duration
    songplays_table = df.join(
        artist_df, artist_df.name == df.artist, 'inner').join(song_df, [
            song_df.artist_id == artist_df.artist_id,
            song_df.title == df.song,
            song_df.duration == df.length,
        ], 'inner').select(
            monotonically_increasing_id().alias('songplay_id'),
            df.datetime.alias('start_time'),
            df.userId.alias('user_id'),
            df.level.alias('level'),
            song_df.song_id,
            artist_df.artist_id,
            df.sessionId.alias('session_id'),
            df.location.alias('location'),
            df.userAgent.alias('user_agent'),
            # needed for writing the tables partitioned
            month(df.datetime).alias('month'),
            year(df.datetime).alias('year'),
        )
    print("[INFO] saving information for {} songplays".format(
        songplays_table.count()))
    print('[INFO] songplays_table schema:')
    songplays_table.printSchema()
    songplays_table.show(5)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet("{}/songplays".format(output_data),
                                  partitionBy=['year', 'month'],
                                  mode='overwrite')
# | GOOG| Sam|200.0| # | APPL| Linda|130.0| # | MSFT| Amy|124.0| # | GOOG|Charlie|120.0| # +-------+-------+-----+ # MISSING DATA # Display rows with at least 2 non-null values df.na.drop(thresh=2).show() # get rows with no null value df.na.drop(how='any').show() # don't drop any row df.na.drop(how='all').show() # drop row will null data in Sales df.na.drop(subset=['Sales']).show() # Fill in any string value df.na.fill('FILL VALUE').show() # Fill in any null num value df.na.fill(0).show() # Fill all null in Name column df.na.fill('No Name', subset=['Name']).show() mean_val = df.select(mean(df['Sales'])).collect() mean_sales = mean_val[0][0] df.na.fill(mean_sales, subset=['Sales']).show() # TIMESTAMP df.select(dayofmonth(df['Date'])).show()
# ---------------- error_df = logs_df.filter(logs_df['status'] == '200') error_paths_df = error_df.groupBy('path').count().sort('count', ascending=False) # error_paths_df.show(10, truncate=False) # Unique Hosts Count # ------------------- # unique_hosts_df = logs_df.dropDuplicates(['host']).count() # print(unique_hosts_df) # Unique host count by day # ------------------------- # logs_df.show(10, truncate=False) day_to_host_df = logs_df.select('host', dayofmonth('time').alias('day')).sort('day', ascending=False) # day_to_host_df.show(10, truncate=False) day_to_host_unique_df = day_to_host_df.dropDuplicates() daily_hosts_df = day_to_host_unique_df.groupBy('day').count() # daily_hosts_df.show(31, truncate=False) daily_hosts_df.cache() # Average requests per day total_req_per_day_df = logs_df.groupBy(dayofmonth('time').alias('day')).count() # total_req_per_day_df.show(31) avg_re_per_day_df = total_req_per_day_df.join(daily_hosts_df, ['day']).\ select('day', (total_req_per_day_df['count']/daily_hosts_df['count']).cast('integer').alias('Average_Requests')) avg_re_per_day_df.show(31, truncate=False)
def process_log_data(spark: SparkSession, input_data: str, output_data: str) -> None:
    """
    Build the user, time and songplay tables from the raw log data.

    Log events are read from ``input_data`` and reduced to "NextSong" events.
    The user and time tables are derived from them and written below
    ``output_data``. The song and artist tables already stored below
    ``output_data`` are then read back, joined with the log events and
    written out as the songplay table.

    :param spark: SparkSession
    :param input_data: Path to input data
    :param output_data: Path to store output data
    :return: None
    """
    # location of the raw log files
    log_path = f"{input_data}/log_data/*/*/"

    print("Loading log data")
    events = spark.read.json(log_path)

    # only song-play events are of interest
    events = events.filter(events.page == "NextSong")

    # users: one row per userId
    user_columns = ["userId", "firstName", "lastName", "gender", "level"]
    user_table = events.dropDuplicates(["userId"]).select(user_columns)

    print("Writing user table")
    user_table.write.parquet(f"{output_data}/users/", mode="overwrite")

    # the raw `ts` is divided by 1000 before being handed to from_unixtime
    events = events.withColumn("ts", from_unixtime(events.ts / 1000))

    # time: one row per distinct ts, with its calendar parts
    time_columns = [
        "ts",
        hour(events.ts).alias("hour"),
        dayofmonth(events.ts).alias("day"),
        weekofyear(events.ts).alias("week"),
        month(events.ts).alias("month"),
        year(events.ts).alias("year"),
        dayofweek(events.ts).alias("weekday"),
    ]
    time_table = events.dropDuplicates(["ts"]).select(time_columns)

    print("Writing time table")
    time_table.write \
        .partitionBy("year", "month") \
        .parquet(f"{output_data}/time/", mode="overwrite")

    # song and artist tables written earlier are needed for the songplays
    print("Loading song data")
    songs_table = spark.read.parquet(f"{output_data}/songs/")
    print("Loading artist data")
    artists_table = spark.read.parquet(f"{output_data}/artists/")

    print("Joining song, artist and log data")
    match_conditions = [
        events.song == songs_table.title,
        events.artist == artists_table.artist_name,
        events.length == songs_table.duration,
    ]
    songs_with_artists = songs_table.join(artists_table, "artist_id", "inner")
    joined_df = songs_with_artists.join(events, match_conditions, "inner")

    # songplays: generated id plus the year/month used for partitioning
    songplay_columns = [
        "songplay_id", "ts", "userId", "level", "song_id", "artist_id",
        "sessionId", "artist_location", "userAgent", "year", "month",
    ]
    songplays_table = (
        joined_df
        .withColumn("songplay_id", monotonically_increasing_id())
        .withColumn("year", year(events.ts))
        .withColumn("month", month(events.ts))
        .select(songplay_columns)
    )

    print("Writing songplay table")
    songplays_table.write \
        .partitionBy("year", "month") \
        .parquet(f"{output_data}/songplays/", mode="overwrite")

    print("Finished processing log data")
def etl_world_temperature(spark, input_dir, output_dir):
    """Clean the temperature data.

    Reads ``GlobalLandTemperaturesByCity.csv`` from ``input_dir``, keeps the
    United States rows dated 2003-01-01 or later, averages the temperature
    per (month, day, city), resolves each city's state from its coordinates,
    maps the state to its code via the i94addr dictionary and writes the
    result as parquet below ``output_dir``.

    :param spark: active SparkSession
    :param input_dir: directory holding the CSV and ``dictionary_data/``
    :param output_dir: directory receiving ``USCitiesTemperaturesByMonth.parquet``
    :return: None
    """
    # load data
    data_input_full_file_path = f'{input_dir}/GlobalLandTemperaturesByCity.csv'
    world_temperature_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(data_input_full_file_path)

    # just take temperature data after 2003-01-01 and only keep the US data
    world_temperature_spark_df = world_temperature_spark_df \
        .filter(F.col('dt') >= datetime(2003, 1, 1)) \
        .filter(F.col('Country') == 'United States')

    # parse month and day
    us_temperature_spark_df = world_temperature_spark_df \
        .withColumn('month', F.month(F.col('dt'))) \
        .withColumn('day', F.dayofmonth(F.col('dt'))) \
        .drop(F.col('dt'))

    # groupby columns and get the new avg temperature
    avg_us_temperature_spark_df = us_temperature_spark_df \
        .groupBy(['month', 'day', 'City', 'Country', 'Latitude', 'Longitude']) \
        .agg(F.mean('AverageTemperature')) \
        .withColumnRenamed('avg(AverageTemperature)', 'AverageTemperature') \
        .withColumn('month', F.col('month').cast('integer')) \
        .withColumn('day', F.col('day').cast('integer'))

    # Convert hemisphere-suffixed coordinates to signed doubles:
    # N / E stay positive, S / W become negative. The fallback branches
    # previously re-tested 'N' / 'W', which can never match there, so every
    # southern latitude and eastern longitude came out as NULL.
    # A value carrying neither letter still yields NULL.
    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('Latitude',
                    F.when(F.col('Latitude').rlike('N'),
                           F.regexp_replace('Latitude', 'N', '').cast('double'))
                    .when(F.col('Latitude').rlike('S'),
                          -1 * F.regexp_replace('Latitude', 'S', '').cast('double'))) \
        .withColumn('Longitude',
                    F.when(F.col('Longitude').rlike('W'),
                           -1 * F.regexp_replace('Longitude', 'W', '').cast('double'))
                    .when(F.col('Longitude').rlike('E'),
                          F.regexp_replace('Longitude', 'E', '').cast('double')))

    # define a udf function to get state based on lat and lon by using reverse_geocoder library
    # https://github.com/thampiman/reverse-geocoder
    def _helper_get_state_(lat, lon):
        coor = (lat, lon)
        result = rg.search(coor)
        return result[0].get('admin1')

    _helper_get_state_udf = F.udf(_helper_get_state_, StringType())

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('state', _helper_get_state_udf(F.col('Latitude'), F.col('Longitude')))

    # load i94addr dictionary - map the i94addr values
    i94addr_dictionary_input_full_file_path = f'{input_dir}/dictionary_data/i94addr_dictionary.csv'
    i94addr_dictionary_spark_df = spark \
        .read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(i94addr_dictionary_input_full_file_path)

    # the dictionary `value` is init-capped so it can be compared with the
    # `state` name returned by the geocoder
    i94addr_dictionary_spark_df = i94addr_dictionary_spark_df \
        .withColumn('init_cap_value', F.initcap(F.col('value')))

    # left join: rows without a dictionary match keep a NULL state_code
    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .join(i94addr_dictionary_spark_df,
              avg_us_temperature_spark_df.state == i94addr_dictionary_spark_df.init_cap_value,
              'left') \
        .drop('init_cap_value') \
        .drop('value') \
        .withColumnRenamed('key', 'state_code')

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumnRenamed("Country", "country") \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("Latitude", "latitude") \
        .withColumnRenamed("Longitude", "longitude") \
        .withColumnRenamed("AverageTemperature", "avg_temperature")

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('city_state_code',
                    F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df.select(
        'month', 'day', 'city', 'state', 'state_code', 'city_state_code',
        'avg_temperature').distinct()

    # output clean data
    data_output_full_file_path = f'{output_dir}/USCitiesTemperaturesByMonth.parquet'
    avg_us_temperature_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def prepare_dwh_data():
    """Build the de-duplicated hotel fact mapping and dump it partitioned by date.

    Reads the hotel fact table and its booking, supplier, traveler-profile
    and location dimensions from the parquet paths found in ``config``,
    joins them, derives the ``emulation_flag`` and the three match keys
    (``full_mk``, ``prop_mk``, ``pnr_mk``), keeps the top-ranked row per
    ``full_mk`` and hands the result to ``dump_partitioned_dataframe``.
    """

    def _load(config_key):
        # every input is a parquet dataset whose path is stored in the config
        return spark.read.parquet(config.get(config_set, config_key))

    df_dwh_fact_hotel = (
        _load('dwh.fact.hotel.path')
        .filter('start_date >= "2016-01-01"')
        .select('fact_hotel_id', 'start_date', 'end_date', 'effective_date',
                'dim_booking_id', 'dim_supplier_id', 'dim_traveler_profile_id',
                F.col('issuing_country_id').alias('dim_location_id'))
    )

    df_dwh_dim_booking = (
        _load('dwh.dim.booking.path')
        .filter('booking_locator is not null')
        .select('dim_booking_id', 'booking_locator')
    )

    df_dwh_dim_supplier = (
        _load('dwh.dim.supplier.path')
        .filter('discontinue_date="2000-01-01" or discontinue_date="2999-12-31"')
        .select('harp_key', 'dim_supplier_id')
    )

    df_dwh_dim_traveler_profile = (
        _load('dwh.dim.traveler.profile.path')
        .select('dim_traveler_profile_id', 'traveler_guid',
                F.col('country_code').alias('trav_country'))
    )

    df_dwh_dim_location = (
        _load('dwh.dim.location.path')
        .select('dim_location_id', F.col('country_code').alias('loc_country'))
    )

    regexp_pattern = '[^a-zA-Z0-9]+'

    def _match_key(*columns):
        # concatenate the columns and strip every non-alphanumeric character
        return F.regexp_replace(F.concat(*columns), regexp_pattern, '')

    # 'N' when traveler and location country agree, 'Y' otherwise
    emulation_flag = (
        F.when(F.col('trav_country') == F.col('loc_country'), 'N')
        .otherwise('Y')
        .alias('emulation_flag')
    )

    # only the location dimension is joined with how='left'
    df_joined = (
        df_dwh_fact_hotel
        .join(df_dwh_dim_booking, on='dim_booking_id')
        .join(df_dwh_dim_supplier, on='dim_supplier_id')
        .join(df_dwh_dim_traveler_profile, on='dim_traveler_profile_id')
        .join(df_dwh_dim_location, on='dim_location_id', how='left')
    )

    df_dwh = (
        df_joined
        .select('fact_hotel_id', 'start_date', 'end_date', 'effective_date',
                'booking_locator', 'harp_key', 'traveler_guid',
                'trav_country', 'loc_country', emulation_flag)
        .withColumn('concat_dwh_pnr',
                    F.concat('start_date', 'end_date', 'booking_locator'))
        .withColumn('full_mk', _match_key('concat_dwh_pnr', 'harp_key', 'traveler_guid'))
        .withColumn('prop_mk', _match_key('concat_dwh_pnr', 'harp_key'))
        .withColumn('pnr_mk', _match_key('concat_dwh_pnr'))
        .drop('concat_dwh_pnr')
    )

    # rank rows inside each full_mk: newest effective_date first, then
    # highest fact_hotel_id; only rank 1 survives
    newest_first = (
        Window
        .partitionBy(F.col('full_mk'))
        .orderBy(F.col('effective_date').desc(), F.col('fact_hotel_id').desc())
    )

    df_dwh_deduped = (
        df_dwh
        .withColumn('rk', F.rank().over(newest_first))
        .filter(F.col('rk') == 1)
        .drop('rk')
        .distinct()
        .withColumn('effective_date_year', F.year('effective_date'))
        .withColumn('effective_date_month', F.month('effective_date'))
        .withColumn('effective_date_day', F.dayofmonth('effective_date'))
    )

    # sbx_dst.sw_hotel_hub_dwh_mapping_new_1
    partition_columns = ['effective_date_year',
                         'effective_date_month',
                         'effective_date_day']
    dump_partitioned_dataframe(
        df_dwh_deduped,
        partition_columns,
        config.get(config_set, 'hotel.hub.dwh.mapping.new.1.path'),
    )
def process_log_data(spark, input_data, output_data):
    """
    Process log data from Sparkify data warehouse.

    Read in the log data and filter on actions for song plays only.
    Extract user data, filtering out empty user ids and duplicates
    to create a *users* table, and write to parquet files.

    Create new columns for timestamp and datetime.
    Extract the start time (timestamp) and create a *time* table with
    additional columns for hour, day, week, month, year, weekday, keeping
    one row per distinct combination.
    Write *time* table to parquet files partitioned by year and month.

    Load song data and join with log data, and extract fact-based data
    to create a *songplays* table. Add a songplay id column (a row number
    that restarts within every year/month partition) and write the
    *songplays* table to parquet files partitioned by year and month.

    Parameters:
        spark       : SparkSession object
        input_data  : filepath to log data directories on S3
        output_data : filepath to table directories on S3 for storing the
                      partitioned parquet files

    Returns:
        none
    """
    print("---[ process_log_data ]---")

    # get filepath to log data file
    # log_data = input_data + "log-data/*/*/*.json"    # with S3 bucket
    log_data = input_data + "log-data/*.json"          # local workspace

    # read log data file
    df_log_data = spark.read.json(log_data)

    # filter by actions for song plays
    df_log_data = df_log_data.where("page = 'NextSong'")

    # extract columns for users table
    users_table = df_log_data \
        .filter('userId != ""') \
        .select(col('userId').alias('user_id'),
                col('firstName').alias('first_name'),
                col('lastName').alias('last_name'),
                col('gender'),
                col('level')) \
        .dropna(how="any", subset=["user_id"]) \
        .dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    def _format_epoch_ms(ts_ms):
        # `ts` is divided by 1000 before conversion; the result is rendered
        # as a 'YYYY-MM-DD HH:MM:SS' string
        return datetime.datetime.fromtimestamp(int(ts_ms / 1000)) \
            .strftime('%Y-%m-%d %H:%M:%S')

    # create timestamp column from original timestamp column
    get_timestamp = udf(_format_epoch_ms)
    df_log_data = df_log_data.withColumn(
        "timestamp", to_timestamp(get_timestamp(df_log_data.ts)))

    # create datetime column from original timestamp column
    # (same formatting as above, kept as a plain string)
    get_datetime = udf(_format_epoch_ms)
    df_log_data = df_log_data.withColumn(
        "datetime", get_datetime(df_log_data.ts))

    # extract columns to create time table
    # - 'E' is the day-of-week pattern; the former 'F' is the
    #   day-of-week-in-month pattern and does not identify the weekday.
    # - dropDuplicates() avoids writing one row per log event.
    time_table = df_log_data \
        .select(col('timestamp').alias('start_time'),
                hour('datetime').alias('hour'),
                dayofmonth('datetime').alias('day'),
                weekofyear('datetime').alias('week'),
                month('datetime').alias('month'),
                year('datetime').alias('year'),
                date_format('datetime', 'E').alias('weekday')) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write \
        .partitionBy("year", "month") \
        .parquet(output_data + "time")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    # (left outer join: log rows without a matching song keep NULL
    # song_id / artist_id)
    songplays_table = df_log_data \
        .join(song_df,
              (df_log_data.song == song_df.title) &
              (df_log_data.artist == song_df.artist_name),
              'left_outer') \
        .select(col("timestamp").alias("start_time"),
                col("userId").alias("user_id"),
                df_log_data.level,
                song_df.song_id,
                song_df.artist_id,
                col("sessionId").alias("session_id"),
                df_log_data.location,
                # exact column casing, so the lookup does not rely on
                # case-insensitive resolution
                col("userAgent").alias("user_agent"),
                year("datetime").alias("year"),
                month("datetime").alias("month"))

    # EXTRA step: add songplay_id column to the songplays table
    songplay_window = Window \
        .partitionBy("year", "month") \
        .orderBy(col("start_time").desc(), col("user_id").desc())
    songplays_table = songplays_table \
        .select('start_time', 'user_id', 'level', 'song_id',
                'artist_id', 'session_id', 'location', 'user_agent',
                'year', 'month',
                F.row_number().over(songplay_window).alias("songplay_id"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month") \
        .parquet(output_data + "songplays")
day_to_host_pair_tuple_df = logs_df.<FILL IN> day_group_hosts = day_to_host_pair_tuple_df.<FILL IN> day_host_count_df = day_group_hosts.<FILL IN> daily_hosts_df = <FILL IN> print 'Unique hosts per day:' daily_hosts_df.show(30, False) # COMMAND ---------- # ANSWER from pyspark.sql.functions import dayofmonth day_to_host_pair_tuple_df = logs_df.select("host", dayofmonth("time").alias('day')) day_group_hosts = day_to_host_pair_tuple_df.distinct() day_host_count_df = day_group_hosts.groupBy("day").count().orderBy("day") daily_hosts_df = day_host_count_df.cache() print 'Unique hosts per day:' daily_hosts_df.show(30, False) # COMMAND ---------- # TEST Number of unique daily hosts (4c) daily_hosts_list = (daily_hosts_df .map(lambda r: (r[0], r[1]))
# 4b
# Number of unique hosts
unique_host_count = logs_df.select(col('host')).distinct().count()
# call-form print with a single argument behaves the same on Python 2 and 3
print('Unique hosts: {0}'.format(unique_host_count))

# 4c
# Unique daily hosts
from pyspark.sql.functions import dayofmonth

day_to_host_pair_df = logs_df.select(logs_df.host, dayofmonth(logs_df.time).alias('day')).cache()
# one row per distinct (host, day) pair, then count the hosts of each day
day_group_hosts_df = day_to_host_pair_df.distinct()
daily_hosts_df = day_group_hosts_df.groupBy('day').count().sort('day', ascending=True).cache()

print('Unique hosts per day:')
daily_hosts_df.show(30, False)

# 4d
# Prepare arrays for plotting
# A single take() keeps both lists aligned and avoids DataFrame.map, which
# was removed in Spark 2.0.
_daily_rows = daily_hosts_df.take(30)
days_with_hosts = [row[0] for row in _daily_rows]
hosts = [row[1] for row in _daily_rows]
# for <FILL IN>:
#     <FILL IN>