def run_pipeline(name: str, data: str, save: str) -> None: spark = SparkSession.builder.appName(name).getOrCreate() # Dataset Creation # # read bike ride history csv's df = spark.read.csv(f'{data}/rides/*', header=True) df = df.select(['Duration', 'Start date', 'Start station number', 'Member type']) df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType())) print(f'The rides dataset has [{df.count()}] rows!') # read station information csv stations = spark.read.csv(f'{data}/stations/*', header=True) print(f'The stations dataset has {stations.count()} rows!') stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat') stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long') stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType())) stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number']) # remove rides longer than 1.5 hours one_and_a_half_hours = 60 * 60 * 1.5 df = df.filter(df['Duration'] <= one_and_a_half_hours) # remove rides shorter than 3 minutes three_minutes = 60 * 3 df = df.filter(df['Duration'] >= three_minutes) # remove unknown 'Member type's df = df.filter(df['Member type'] != 'Unknown') # remove non-existent stations df = df.filter(~(df['Start station number'] == 31008) & ~( df['Start station number'] == 32051) & ~(df['Start station number'] == 32034)) # make target feature df = df.withColumn('label', F.log1p(df.Duration)) # join on 'Start station number' print('Merging rides and stations dataframes!') df = df.join(stations, on='Start station number') df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType())) df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType())) print(f'Complete rides and stations dataset has {df.count()} rows!') # Feature Transformations # print('Doing Feature Transformations!') # convert to datetime type df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss')) df = df.withColumn('day_of_week', F.dayofweek('Start date')) df = df.withColumn('week_of_year', F.weekofyear('Start date')) df = df.withColumn('month', F.month('Start date')) df = df.withColumn('minute', F.minute('Start date')) df = df.withColumn('hour', F.hour('Start date')) # make time features cyclical pi = 3.141592653589793 df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7)) df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53)) df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12)) df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60)) df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24)) df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7)) df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53)) df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12)) df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60)) df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24)) df = df.withColumn('hour_and_day_of_week', df['hour'].cast(StringType()) + '_' + df['day_of_week'].cast(StringType())) df = df.withColumn('member_type_and_day_of_week', df['Member type'] + '_' + df['day_of_week'].cast(StringType())) # drop unused columns drop_columns = [ 'Start date', 'Start station number', 'Duration', 'day_of_week', 'week_of_year', 'month', 'minute', 'hour' ] df = df.drop(*drop_columns) # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show() # Model and Pipeline # # split training and test train, test = df.randomSplit([.7, .3]) # encode categorical column 'Member type' member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx') member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc') # create vector of features named 'features' vector = VectorAssembler( inputCols=[ 'start_station_lat', 'start_station_long', 'sin_day_of_week', 'cos_day_of_week', 'sin_week_of_year', 'cos_week_of_year', 'sin_month', 'cos_month', 'sin_minute', 'cos_minute', 'sin_hour', 'cos_hour', 'member_enc' ], outputCol='features' ) # scale features scaler = StandardScaler( inputCol='features', outputCol='scaled_features' ) # define model model = GeneralizedLinearRegression( featuresCol='scaled_features' ) # create pipeline and fill in stages pipeline = Pipeline( stages=[ member_indexer, member_encoder, vector, scaler, model ] ) # evaluation method evaluation = RegressionEvaluator() # best parameter search grid = ParamGridBuilder() # grid = grid.addGrid(model.maxDepth, [5, 7]) # grid = grid.addGrid(model.numTrees, [200, 500]) grid = grid.addGrid(model.maxIter, [40, 50]) grid = grid.addGrid(model.family, ['gaussian', 'gamma']) grid = grid.addGrid(model.regParam, [0.0, 0.1]) grid = grid.build() # run cross validation cv = CrossValidator( estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluation, numFolds=7 ) print('Doing Cross Validation!') cv_models = cv.fit(train) print(f'CV results: {cv_models.avgMetrics} (RMSE)') best_model = cv_models.bestModel best_params = extract_best_params(best_model.stages[-1].extractParamMap()) print(f'Best params:\n{best_params}') results = cv_models.transform(test) print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)') print('Re-fitting pipeline on entire dataset!') cv_models = cv.fit(df) print('Saving to pipeline into S3!') entire_dataset_best_model = cv_models.bestModel entire_dataset_best_model.save(f'{save}/{name}') print('Done!') return
def process_log_data(spark, input_data, output_data): """ This function uses SparkSession to read raw songs data from S3, transform it and save it back to S3 in the form of three dimension tables: users, time and songplays. The last two are partitioned by year and month. """ # set filepath to folder containing all log data files log_data = input_data + "log_data/*/*/*.json" # read log data file and filter it by song plays only print("Loading log data from " + input_data + "...") log_df = spark.read.json(log_data) print("Loading complete!") log_df = log_df.filter(log_df.page == 'NextSong') # extract columns for users table users_table = log_df.select( ['userId', 'firstName', 'lastName', 'gender', 'level']).dropDuplicates() print("Writing users data into " + output_data) # write users table to partitioned parquet files to optimise and save space users_table.write.format('parquet').mode('overwrite').save(output_data + 'users') print("Writing complete!") # create timestamp column from original unix timestamp column get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), TimestampType()) log_df = log_df.withColumn('timestamp', get_timestamp('ts')) # create datetime column from original unix timestamp column get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), DateType()) log_df = log_df.withColumn('datetime', get_datetime('ts')) # extract several time measurements from timestamp to create several columns time_table = log_df.select(['ts', 'timestamp']).withColumn( 'year', year('timestamp')).withColumn('month', month('timestamp')).withColumn( 'week', weekofyear('timestamp')).withColumn( 'weekday', dayofweek('timestamp')).withColumn( 'day', dayofmonth('timestamp')).withColumn( 'hour', hour('timestamp')).dropDuplicates() print("Writing time data into " + output_data) # write time table to parquet files partitioned by year and month time_table.write.format('parquet').partitionBy( ['year', 'month']).mode('overwrite').save(output_data + 'time') print("Writing complete!") # read songs data from S3 once again song_data = input_data + "song-data/*/*/*/*.json" print("Loading songs data from " + input_data + " ...") songs_df = spark.read.json(song_data) # extract columns from both song and log datasets via a SQL query to create the songplays table songs_df.createOrReplaceTempView('songs') log_df.createOrReplaceTempView('log') songplays_table = spark.sql(""" SELECT l.timestamp as start_time, year(l.timestamp) as year, month(l.timestamp) as month, l.userId as user_id, l.level as level, s.song_id, s.artist_id, l.sessionId as session_id, s.artist_location as location, l.userAgent as user_agent FROM log l JOIN songs s ON s.title = l.song AND l.artist = s.artist_name """).dropDuplicates() print("Writing songplay data into " + output_data) # write songplays table to partitioned parquet files to optimise and save space songplays_table.write.partitionBy([ 'year', 'month' ]).format('parquet').mode('overwrite').save(output_data + 'songplays') print("Writing complete!")
month(datetime) AS month, year(datetime) AS year, dayofweek(datetime) AS weekday FROM staging_events ORDER BY start_time """) time_table.printSchema() # + time_table = df2.select( col('datetime').alias('start_time'), hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), dayofweek('datetime').alias('weekday') ) time_table = time_table.drop_duplicates(subset=['start_time']) time_table.printSchema() # + # time_table.select('hour').limit(1).show(1) # + # s_s3_path = "{}/time.parquet".format(output_data) # time_table.write.mode('overwrite').partitionBy( # 'year', 'month').parquet(s_s3_path)
def process_log_data(spark, input_data, output_data): ''' Description: process_song_data function loads data from a s3 bucket from input_data, process all the data and stores the results in another s3 bucket located in output_data Parameters: spark : Spark Session input_data : s3 Address where song_data json files are stored (in this specific scenario, a public s3 bucket) output_data : S3 bucket were dimensional tables in parquet format will be stored after processed ''' # get filepath to log data file log_data = input_data + 'data/log-data/*' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table users_table = df.selectExpr('userId AS user_id', 'firstName AS first_name', 'lastName AS last_name', 'gender', 'level').drop_duplicates() # write users table to parquet files users_table.write.parquet(output_data + 'users/') # create timestamp column from original timestamp column df = df.withColumn( 'start_time', to_timestamp( from_unixtime((col("ts") / 1000), 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")) # extract columns to create time table time_table = df.select('start_time', hour('start_time').alias('hour'), dayofmonth('start_time').alias('day'), weekofyear('start_time').alias('week'), month('start_time').alias('month'), year('start_time').alias('year'), dayofweek('start_time').alias('weekday')) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + 'time/') # read in song data to use for songplays table (reading the parquet files stored before) song_df = spark.read.parquet( output_data + 'songs/*/*/*' ) # Partitioned by year and artist_id, only two levels to read # extract columns from joined song and log datasets to create songplays table # Final Columns - songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent # Creating an id for songplays_table -> songplays_table = df.withColumn('songplay_id', monotonically_increasing_id()) \ .join(song_df, song_df.title == df.song) \ .select('songplay_id', 'start_time', col('userId').alias('user_id'), 'level', 'song_id', col('artist').alias('artist_id'), col('sessionID').alias('session_id'), 'location', col('userAgent').alias('user_agent'), month('start_time').alias('month'), year('start_time').alias('year')) # write songplays table to parquet files songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = os.path.join(input_data, 'log_data/*/*/*.json') # read log data file log_data_raw = spark.read.json(log_data) # filter by actions for song plays log_data_stage = log_data_raw.filter(log_data_raw.page == 'NextSong') # extract columns for users table users_table = log_data_stage.select( col('userId').alias('user_id'), col('firstName').alias('firt_name'), col('lastName').alias('last_name'), col('gender'), col('level')).distinct() users_table.show(5, truncate=False) # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000)) log_data_stage = log_data_stage.withColumn( 'timestamp', get_timestamp(log_data_stage.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime( '%Y-%m-%d %H:%M:%S')) log_data_stage = log_data_stage.withColumn('date', get_datetime(log_data_stage.ts)) # extract columns to create time table time_table = log_data_stage.select( col('date').alias('start_time'), hour('date').alias('hour'), dayofmonth('date').alias('day'), weekofyear('date').alias('week'), month('date').alias('month'), year('date').alias('year'), date_format('date', 'EEEE').alias('weekday')).distinct().orderBy('start_time') time_table.show(5, truncate=False) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time'), 'overwrite') # read in song data to use for songplays table song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') songs_stage = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table songplays_table = log_data_stage.join( songs_stage, (log_data_stage.artist == songs_stage.artist_name) & (log_data_stage.song == songs_stage.title)) songplays_table = songplays_table.withColumn('songplay_id', monotonically_increasing_id()) songplays_table = songplays_table.select( col('songplay_id'), col('date').alias('start_time'), year('date').alias('year'), month('date').alias('month'), col('userId').alias('user_id'), col('level'), col('song_id'), col('artist_id'), col('sessionId').alias('session_id'), col('location'), col('userAgent').alias('user_agent')).orderBy('user_id', 'session_id') songplays_table.show(5, truncate=False) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays'), 'overwrite')
def process_log_data(spark, input_data, output_data): """ Transform raw log data from S3 into analytics tables on S3 This function reads in log data in JSON format from S3; defines the schema of songplays, users, and time analytics tables; processes the raw data into those tables; and then writes the tables into partitioned parquet files on S3. Args: spark: a Spark session input_data: an S3 bucket to read log data in from output_data: an S3 bucket to write analytics tables to """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # read log data file log_data_schema = StructType([ StructField("artist", StringType(), True), StructField("auth", StringType(), False), StructField("firstName", StringType(), True), StructField("gender", StringType(), True), StructField("itemInSession", IntegerType(), False), StructField("lastName", StringType(), True), StructField("length", DoubleType(), True), StructField("level", StringType(), False), StructField("location", StringType(), True), StructField("method", StringType(), False), StructField("page", StringType(), False), StructField("registration", DoubleType(), True), StructField("sessionId", IntegerType(), False), StructField("song", StringType(), True), StructField("status", IntegerType(), False), StructField("ts", DoubleType(), False), StructField("userAgent", StringType(), True), StructField("userId", StringType(), True) ]) df = spark.read.json(log_data, schema=log_data_schema) # filter by actions for song plays df = df.filter(col("page") == "NextSong") # extract columns for users table users_table = (df.withColumn( "max_ts_user", max("ts").over(Window.partitionBy("userID"))).filter( (col("ts") == col("max_ts_user")) & (col("userID") != "") & (col("userID").isNotNull())).select( col("userID").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level")) # write users table to parquet files users_table.write.parquet(output_data + "users_table.parquet", mode="overwrite") # create datetime column from original timestamp column get_datetime = udf( lambda x: datetime.fromtimestamp(x / 1000).replace(microsecond=0), TimestampType()) df = df.withColumn("start_time", get_datetime("ts")) # extract columns to create time table time_table = (df.withColumn("hour", hour("start_time")).withColumn( "day", dayofmonth("start_time")).withColumn( "week", weekofyear("start_time")).withColumn( "month", month("start_time")).withColumn( "year", year("start_time")).withColumn( "weekday", dayofweek("start_time")).select( "start_time", "hour", "day", "week", "month", "year", "weekday").distinct()) # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + "time_table.parquet", mode="overwrite", partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = spark.read.parquet(output_data + "songs_table.parquet") # extract columns from joined song and log datasets to create songplays table artists_table = spark.read.parquet(output_data + "artists_table.parquet") songs = (song_df.join(artists_table, "artist_id", "full").select("song_id", "title", "artist_id", "name", "duration")) songplays_table = df.join(songs, [ df.song == songs.title, df.artist == songs.name, df.length == songs.duration ], "left") songplays_table = (songplays_table.join( time_table, "start_time", "left").select("start_time", col("userId").alias("user_id"), "level", "song_id", "artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent"), "year", "month").withColumn("songplay_id", monotonically_increasing_id())) # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(output_data + "songplays_table.parquet", mode="overwrite", partitionBy=["year", "month"])
display() # COMMAND ---------- # MAGIC %md ## Feature Engineering # MAGIC In the remaining part of the Notebook #1, we will demonstrate how to generate new features for this kind of use case. It is definitely not meant to be a comprehensive list. # COMMAND ---------- # MAGIC %md In the following cell, we created some time features, calculated the total number of warning_type1 (type2) occured for a macine on a particular day. We also identified some data quality issue that some event counts had negative values. # COMMAND ---------- # Extract some time features from "date" column df = df.withColumn('month', month(df['date'])) df = df.withColumn('weekofyear', weekofyear(df['date'])) df = df.withColumn('dayofmonth', dayofmonth(df['date'])) # warning related raw features warning_type1_features = list(s for s in df.columns if "warning_1_" in s) warning_type2_features = list(s for s in df.columns if "warning_2_" in s) warning_all = warning_type1_features + warning_type2_features # total count of all type1 warnings each day each device df = df.withColumn('warn_type1_total', sum(df[col_n] for col_n in warning_type1_features)) # total count of all type2 warnings each day each device df = df.withColumn('warn_type2_total', sum(df[col_n] for col_n in warning_type2_features))
def process_log_data(spark, input_data, output_data): """import log dataset extract columns create users, time, and songplays table Parameters: spark: name of spark session input_data: location of the source data s3 bucket output_data: location of the destination data s3 bucket Returns: writes users table in parquet to output_data location + users writes time_table in parquet to output_data location + time writes songplays table in parquest to output_data location + songplays """ """Setting up the JSON table structure for the log dataset""" log_dataset_schema = R([ Fld("artist", Str()), Fld("auth", Str()), Fld("firstName", Str()), Fld("gender", Str()), Fld("iteminSession", Lng()), Fld("lastName", Str()), Fld("length", Dbl()), Fld("level", Str()), Fld("location", Str()), Fld("method", Str()), Fld("page", Str()), Fld("registration", Dbl()), Fld("sessionId", Lng()), Fld("song", Str()), Fld("status", Lng()), Fld("ts", Lng()), Fld("userAgent", Str()), Fld("userId", Str()), ]) # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # read log data file df = spark.read.json(log_data, schema=log_dataset_schema) # filter by actions for song plays df = df.filter(col("page") == 'NextSong') # extract columns for users table users_table = df.select( col("userId").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level") # drop duplicate rows in users table users_table = users_table.dropDuplicates() # write users table to parquet files users_table.write.mode('append').parquet(output_data + "users") # create timestamp column from original timestamp column dfTimestamp = df.withColumn("start_time", to_timestamp(col("ts") / 1000)) dfTimestamp = dfTimestamp \ .withColumn("hour", hour("start_time")) \ .withColumn("day", dayofmonth("start_time")) \ .withColumn("week", weekofyear("start_time")) \ .withColumn("month", month("start_time")) \ .withColumn("year", year("start_time")) \ .withColumn("weekday", date_format("start_time", 'E')) # extract columns to create time table time_table = dfTimestamp.select(col("start_time"), \ col("hour"), \ col("day"), \ col("week"), \ col("month"), \ col("year"), \ col("weekday")) # drop duplicate rows in time table time_table = time_table.dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.mode('append').partitionBy( "year", "month").parquet(output_data + "time") # read in song data to use for songplays table song_data = input_data + "song_data/*/*/*/*.json" song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table songplays_table = song_df.join(df, song_df.artist_name==df.artist). \ withColumn("songplay_id", monotonically_increasing_id()). \ withColumn('start_time', to_timestamp(col("ts") /1000 )). \ select("songplay_id", "start_time", col("userId").alias("user_id"), "level", "song_id", "artist_id", col("sessionId").alias("session_id"), col("artist_location").alias("location"), "userAgent", month(col("start_time")).alias("month"), year(col("start_time")).alias("year")) # drop duplicate rows in songplays table songplays_table = songplays_table.dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.mode('append').partitionBy( 'year', 'month').parquet(output_data + "songplays")
def process_log_data(spark, input_data, output_data): ''' Function to read log data from s3 bucket,performs transformations and loads to target tables ''' # get filepath to log data file log_data = os.path.join(input_data,"log_data/*/*/*.json") # read log data file df_log = spark.read.json(log_data) # filter by actions for song plays and also filter the records where userId is null df_log = df_log.filter(col("page")=='NextSong').filter(df_log.userId.isNotNull()) # extract columns for users table users_table = df_log.select(col("userId").alias("user_id"),col("firstName").alias("first_name"),\ col("lastName").alias("last_name"), "gender", "level").dropDuplicates() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df_log = df_log.withColumn("timestamp",get_timestamp(col("ts"))) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df_log = df_log.withColumn("datetime", get_datetime(col("ts"))) # extract columns to create time table time_table = df_log.select( col('datetime').alias('start_time'), hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), date_format('datetime', 'F').alias('weekday') ) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year','month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite') # read in song data to use for songplays table song_data = os.path.join(input_data, "song_data/*/*/*/*.json") song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table df_songplay = song_df.join(df_log, (song_df.artist_name == df_log.artist) & (song_df.title == df_log.song) & (song_df.duration == df_log.length)) songplays_table = df_songplay.select( monotonically_increasing_id().alias('songplay_id'), col('datetime').alias('start_time'), col('level').alias('level'), col('userId').alias('user_id'), col('song_id').alias('song_id'), col('artist_id').alias('artist_id'), col('sessionId').alias('session_id'), col('location').alias('location'), col('userAgent').alias('user_agent'), col('year').alias('year'), month('datetime').alias('month') ).dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite')
def process_log_data(spark, input_data, output_data): """ Process log data from json files and create users, time, and songplays tables in parquet. """ # get filepath to log data file log_data = input_data # read log data file df = spark.read.json("{}log_data/*/*/*.json".format(log_data)) # filter by actions for song plays df = df.filter(df['page'] == "NextSong") # extract columns for users table users_table = df.withColumn("last_stamp", max_(col('ts')).over(Window.partitionBy("userId"))) \ .filter(col('ts') == col('last_stamp')) \ .select('userId', 'firstName', 'lastName', 'gender', 'level') # write users table to parquet files users_table.write.parquet(os.path.join( output_data, 'users'), mode='overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp( x/1000.0), TimestampType()) df = df.withColumn("timestamp", get_timestamp(col('ts'))) # create datetime column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp(x/1000.0), DateType()) df = df.withColumn("datetime", get_datetime(col('ts'))) # extract columns to create time table time_table = df.withColumn('hour', hour(df.timestamp)) \ .withColumn('day', dayofmonth(df.timestamp)) \ .withColumn('week', weekofyear(df.timestamp)) \ .withColumn('month', month(df.timestamp)) \ .withColumn('year', year(df.timestamp)) \ .withColumn('weekday', date_format('timestamp', 'u')) \ .select('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday').distinct() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time_tbl'), mode='overwrite') # read in song data to use for songplays table song_df = spark.read.parquet(os.path.join(output_data, 'songs')) artist_df = spark.read.parquet(os.path.join(output_data, 'artists')) # extract columns from joined song and log data`sets to create songplays table songplays_table = df.join(song_df, (df.song == song_df.title) & (df.length == song_df.duration), 'left_outer') \ .join(artist_df, (song_df.artist_id == artist_df.artist_id) & (df.artist == artist_df.artist_name), 'left_outer') \ .select( df.timestamp.alias("start_time"), df.userId.alias("user_id"), df.level, song_df.song_id, song_df.artist_id, df.sessionId.alias( "session_id"), df.location, df.userAgent.alias("user_agent") ).withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.join(time_table, (songplays_table.start_time == time_table.timestamp)) \ .select(songplays_table["*"], time_table.year, time_table.month) \ .write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays'), mode='overwrite')
def process_log_data(spark, input_data, output_data): """ :param spark: spark session to process data :param input_data: s3 bucket where contains log data files :param output_data: file path where result of this function will save files """ # get filepath to log data file log_data = f"{input_data}{default_log_subpath}" # read log data file df = spark.read.json(log_data, mode='PERMISSIVE').drop_duplicates() # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = (df.select("userId", "firstName", "lastName", "gender", "level").drop_duplicates()) # write users table to parquet files users_table.write.parquet(os.path.join(f"{output_data}/users/"), mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp("ts")) # create datetime column from original timestamp column get_date = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), DateType()) df = df.withColumn("start_date", get_date("ts")) # create year and month columns to use as partition key in songplays_table df = (df.withColumn("datetime_year", year("start_time")).withColumn("datetime_month", month("start_time"))) # extract columns to create time table time_table = (df.withColumn("hour", hour("start_time")).withColumn( "day", dayofmonth("start_time")).withColumn( "week", weekofyear("start_time")).withColumnRenamed( "datetime_year", "year").withColumnRenamed("datetime_month", "month").select( "start_time", "hour", "day", "week", "month", "year").drop_duplicates()) # write time table to parquet files partitioned by year and month time_table.write.parquet(os.path.join(f"{output_data}/time_table/"), mode='overwrite', partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = (spark.read.format("parquet").option( "basePath", os.path.join(output_data, "songs/")).load( os.path.join(f"{output_data}/songs{default_log_subpath}/"))) # extract columns from joined song and log datasets to create songplays table songplays_table = (df.join( song_df, df.song == song_df.title, how='inner').select(monotonically_increasing_id().alias("songplay_id"), df.start_time, df.userId, df.level, song_df.song_id, song_df.artist_id, df.sessionId, df.location, df.userAgent, df.datetime_year, df.datetime_month)) # write songplays table to parquet files partitioned by year and month (songplays_table.withColumnRenamed( "datetime_year", "year").withColumnRenamed("datetime_month", "month").withColumnRenamed( "userId", "user_id").withColumnRenamed( "sessionId", "session_id").withColumnRenamed( "userAgent", "user_agent").drop_duplicates().write.parquet( os.path.join(f"{output_data}/songplays/"), mode="overwrite", partitionBy=["year", "month"])) return df, users_table, time_table, song_df, songplays_table
def process_log_data(spark, song_input_data, log_input_data, output_data): """ Reads JSON files located at log data directory. Filters log data to only contain song plays (page = NextSong) Extracts relevent columns for "users", removes duplicates. Writes user data. Computes timestamp and datetime data from original timestamp, and adds to dataframe. Extracts datetime column, and decomposes into date heirarchy to create new "time" table. Writes time data, partitioned by year and month. Reads JSON files located at song data directory. Joins log data frame to songs data frame, on song title and artist name match. Forms somngplays table, creating aliases for column names, and adding unique primary key to each record. Writes songplay data, partitioned by year and month. """ # get filepath to log data file log_data = log_input_data # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'] users_table = users_table.dropDuplicates(['userId', 'level']) # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn('timestamp', get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000))) df = df.withColumn('datetime', get_datetime(df.ts)) # extract columns to create time table time_table = df.select('datetime') \ .withColumn('start_time', df.datetime) \ .withColumn('hour', hour('datetime')) \ .withColumn('day', dayofmonth('datetime')) \ .withColumn('week', weekofyear('datetime')) \ .withColumn('month', month('datetime')) \ .withColumn('year', year('datetime')) \ .withColumn('weekday', dayofweek('datetime')) time_table = time_table.dropDuplicates(['datetime']) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time/time.parquet'), 'overwrite') # read in song data to use for songplays table song_df = spark.read.json(song_input_data) # extract columns from joined song and log datasets to create songplays table songplays_df = df.join(song_df, (df.artist == song_df.artist_name) & (df.song == song_df.title)) songplays_df = songplays_df.withColumn('month', month('datetime')) songplays_df = songplays_df.withColumn('songplay_id', monotonically_increasing_id()) songplays_table = songplays_df.select( col('songplay_id').alias('songplay_id'), col('datetime').alias('start_time'), col('userId').alias('user_id'), col('level').alias('level'), col('song_id').alias('song_id'), col('artist_id').alias('artist_id'), col('sessionId').alias('session_id'), col('location').alias('location'), col('userAgent').alias('user_agent'), col('year').alias('year'), col('month').alias('month')) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite') print("Songplays table written.")
def process_log_data(spark, input_data, output_data): """This function will get the data from Udacity s3 bucket available for this project. Extract data from the log_data path, select the columns that the project requires and create the output tables in parquet files for the artist and song table.""" # get filepath to log data file log_data = input_data + "log-data" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(col("page")=='NextSong').filter(df.userId.isNotNull()) # extract columns for users table users_table = df.select(col("userId").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level").dropDuplicates() # write users table to parquet files print("""##### [STARTING] Writing table to the parquet files: USERS ##### """) users_table.write.mode("overwrite").parquet(output_data+"users") print("""##### [FINISHED] Table USERS already loaded ##### """) # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn("timestamp",get_timestamp(col("ts"))) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(col("ts"))) # extract columns to create time table time_table = df.select( 'timestamp', hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), date_format('datetime', 'F').alias('weekday') ) # write time table to parquet files partitioned by year and month print("""##### [STARTING] Writing table to the parquet files: TIME ##### """) time_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"time") print("""##### [FINISHED] Table TIME already loaded ##### """) # read in song data to use for songplays table song_data = input_data + "song_data/*/*/*/*.json" song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table # Creating a string variable with the timestamp format tsFormatVar = "yyyy/MM/dd HH:MM:ss z" '''In this part, the songplays table are made with a join between 2 dataframes, and after that the columns select with the transformation (if the transformation applies)''' songplays_table = song_df.join(df,(song_df.artist_name==df.artist) & (song_df.title==df.song)).withColumn("songplay_id",monotonically_increasing_id()).withColumn('start_time', to_timestamp(date_format((col("ts") /1000).cast(dataType=TimestampType()), tsFormatVar),tsFormatVar)).select("songplay_id","start_time",col("userId").alias("user_id"),"level","song_id","artist_id",col("sessionId").alias("session_id"),col("artist_location").alias("location"),"userAgent",month(col("start_time")).alias("month"),year(col("start_time")).alias("year")) # write songplays table to parquet files partitioned by year and month print("""##### [STARTING] Writing table to the parquet files: SONGPLAYS ##### """) songplays_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"songplays") print("""##### [FINISHED] Table SONGPLAYS already loaded ##### """)
def process_log_data(spark, input_data, output_data): """ Description: Process the event log file and extract data for table time, users and songplays from it. :param spark: a spark session instance :param input_data: input file path :param output_data: output file path """ # get filepath to log data file log_data = os.path.join(input_data, "log-data/") # read log data file df = spark.read.json( log_data, mode='PERMISSIVE', columnNameOfCorruptRecord='corrupt_record').drop_duplicates() # filter by actions for song plays df = df.filter(df.page == "NextSong") # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level").drop_duplicates() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, "users/"), mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) df = df.withColumn("start_time", get_timestamp("ts")) # extract columns to create time table time_table = df.withColumn("hour",hour("start_time"))\ .withColumn("day",dayofmonth("start_time"))\ .withColumn("week",weekofyear("start_time"))\ .withColumn("month",month("start_time"))\ .withColumn("year",year("start_time"))\ .withColumn("weekday",dayofweek("start_time"))\ .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.parquet(os.path.join(output_data, "time_table/"), mode='overwrite', partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = spark.read\ .format("parquet")\ .option("basePath", os.path.join(output_data, "songs/"))\ .load(os.path.join(output_data, "songs/*/*/")) # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\ .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent")) songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\ .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") # write songplays table to parquet files partitioned by year and month songplays_table.drop_duplicates().write.parquet( os.path.join(output_data, "songplays/"), mode="overwrite", partitionBy=["year", "month"])
def process_log_data(spark, input_data, output_data): """ Process log data in s3 bucket by extracting users table, time table and songplays table. Normalize and transform data to be store as parquet files in s3 bucket :param spark: a spark session :param input_data: data from s3 bucket location :param output_data: data to s3 bucket location :return: None """ # get filepath to log data file log_data = input_data + "data/log_data/*.json" # setting schema types for log data fields log_data_schema = StructType([ StructField("artist", StringType(), True), StructField("auth", StringType(), True), StructField("firstName", StringType(), True), StructField("gender", StringType(), True), StructField("itemInSession", LongType(), True), StructField("lastName", StringType(), True), StructField("length", DoubleType(), True), StructField("level", StringType(), True), StructField("location", StringType(), True), StructField("method", StringType(), True), StructField("page", StringType(), True), StructField("registration", DoubleType(), True), StructField("sessionId", LongType(), True), StructField("song", StringType(), True), StructField("status", LongType(), True), StructField("ts", LongType(), True), StructField("userAgent", StringType(), True), StructField("userId", StringType(), True), ]) # read log data file log_df = spark.read.json(log_data, schema=log_data_schema) # columns to be used firstName = 'firstName' gender = 'gender' lastName = 'lastName' level = 'level' userId = 'userId' start_time = 'start_time' hour = 'hour' day = 'day' week = 'week' month = 'month' year = 'year' weekday = 'weekday' # filter by actions for song plays log_df = log_df.filter(log_df.page == 'NextSong') # extract columns for users table users_table = log_df.select(firstName, lastName, gender, level, userId) # write users table to parquet files users_table.write.parquet(f"{output_data}users_table.parquet") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000), TimestampType()) log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: F.to_date(x), TimestampType()) log_df = log_df.withColumn("start_time", get_datetime(log_df.ts)) # extract columns to create time table log_df = log_df.withColumn("hour", F.hour("timestamp")) log_df = log_df.withColumn("day", F.dayofweek("timestamp")) log_df = log_df.withColumn("week", F.weekofyear("timestamp")) log_df = log_df.withColumn("month", F.month("timestamp")) log_df = log_df.withColumn("year", F.year("timestamp")) log_df = log_df.withColumn("weekday", F.dayofweek("timestamp")) time_table = log_df.select(start_time, hour, day, week, month, year, weekday) # write time table to parquet files partitioned by year and month time_table.write.partitionBy( "year", "month").parquet(f"{output_data}time_table.parquet") # read in song data to use for songplays table song_data = input_data + "data/song_data/song_data/*/*/*/*.json" song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table time_table.createOrReplaceTempView("time_table") log_df.createOrReplaceTempView("log_df_table") song_df.createOrReplaceTempView("song_df_table") songplays_table = spark.sql(""" SELECT DISTINCT lt.start_time, lt.userId, lt.level, lt.sessionId, lt.location, lt.userAgent, st.song_id, st.artist_id, tt.year, tt.month FROM log_df_table lt INNER JOIN song_df_table st ON st.artist_name = lt.artist INNER JOIN time_table tt ON tt.start_time = lt.start_time """) # create songplays_id columns which increases monotonically songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy( "year", "month").parquet(f"{output_data}songplays_table.parquet")
def process_log_data(spark, input_data, output_data): """Process log_data Process log_data from the input path(local or S3) Create dimension tables for users, time and songplays fact table Args: spark: SparkSession input_data: Input Path (local or S3) output_data: Output path (local or S3) Returns: None """ print("Loading log_data") # get filepath to log data file log_data = os.path.join(input_data, "log_data/*/*/*.json") # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where(col("page") == "NextSong") print("Loaded %d log actions " % df.count()) # extract columns for users table users_table = df.select("userId", "firstName", "lastName", "gender", "level","ts") \ .orderBy("ts",ascending=False) \ .dropDuplicates(subset=["userId"]) \ .drop('ts') print("Writing users data") # write users table to parquet files users_table.write.mode("overwrite").parquet( os.path.join(output_data, "users/")) # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(int(int(x) / 1000)), TimestampType()) # build start_time column and year and month for partitioning df = df.withColumn("start_time", get_timestamp(df.ts)) \ .withColumn("month", month(col("start_time"))) \ .withColumn("year", year(col("start_time"))) # extract columns to create time table time_table = df.select("start_time", "month", "year").dropDuplicates() \ .withColumn("hour", hour(col("start_time"))) \ .withColumn("day", dayofmonth(col("start_time"))) \ .withColumn("week", weekofyear(col("start_time"))) \ .withColumn("weekday", dayofweek(col("start_time"))) print("Writing time data") # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite") \ .partitionBy("year", "month") \ .parquet(os.path.join(output_data, 'time/')) # read in song data to use for songplays table song_df = spark.read.parquet(os.path.join(output_data, 'songs/')).withColumnRenamed( "year", "song_year") print("Joining logs with songs data") # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, (df.artist == song_df.artist_name) & (df.song == song_df.title) & (df.length == song_df.duration), how='inner') print("Loaded %d songplays " % songplays_table.count()) songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) print("Writing songplays data") # write songplays table to parquet files partitioned by year and month songplays_table.selectExpr( [ "songplay_id", "start_time", "userId as user_id", "level", "song_id", "artist_id", "sessionId as session_id", "location", "userAgent as user_agent", "year", "month" ]) \ .write.mode("overwrite") \ .partitionBy("year", "month") \ .parquet(output_data + "songplays/")
def process_log_data(spark, input_data, output_data): """ Description: This function loads log_data from S3 and processes it by extracting the songs and artist tables and then again loaded back to S3. Also output from previous function is used in by spark.read.json command Parameters: spark : Spark Session input_data : location of log_data json files with the events data output_data : S3 bucket were dimensional tables in parquet format will be stored """ # get filepath to log data file log_data = os.path.join(input_data,"log_data/*/*/*.json") # read log data file df = spark.read.json(log_data) # filter by actions for song plays songplays_table = df['ts', 'userId', 'level','sessionId', 'location', 'userAgent'] # extract columns for users table users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'] users_table = users_table.dropDuplicates(['userId']) # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') print("users.parquet completed") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x)/1000))) df = df.withColumn('timestamp', get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(df.ts)) # extract columns to create time table time_table = df.select( col('datetime').alias('start_time'), hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year') ) time_table = time_table.dropDuplicates(['start_time']) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'time.parquet'), 'overwrite') print("time.parquet completed") # read in song data to use for songplays table song_data = os.path.join(input_data, "song-data/A/A/A/*.json") song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table df = df.join(song_df, song_df.title == df.song) songplays_table = df.select( col('ts').alias('ts'), col('userId').alias('user_id'), col('level').alias('level'), col('song_id').alias('song_id'), col('artist_id').alias('artist_id'), col('ssessionId').alias('session_id'), col('location').alias('location'), col('userAgent').alias('user_agent'), col('year').alias('year'), month('datetime').alias('month') ) songplays_table = songplays_table.selectExpr("ts as start_time") songplays_table.select(monotonically_increasing_id().alias('songplay_id')).collect() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays.parquet'), 'overwrite') print("songplays.parquet completed") print("process_log_data completed")
def process_log_data(spark, input_data, output_data): """ Process the log dataset and create user table, time table and songsplat table :param spark: SparkSession :param input_data: path/to/files to process :param output_data: path/to/files to write the results Datasets """ # get filepath to log data file print('Reading Log Data') log_data = input_data + 'log_data/*.json' # read log data file df = spark.read.json(log_data).dropDuplicates() # filter by actions for song plays df = df.filter(df.page == 'NextSong').cache() # rename fields fields = [("artist", "artist"), ("auth", "auth"), ("firstName", "first_name"), ("gender", "gender"), ("itemInSession", "itemInSession"), ("lastName", "last_name"), ("length", "length"), ("level", "level"), ("location", "location"), ("method", "method"), ("page", "page"), ("registration", "registration"), ("sessionId", "session_id"), ("song", "song"), ("status", "status"), ("ts", "ts"), ("userAgent", "user_agent"), ("userId", "user_id") ] exprs = [ "{} as {}".format(field[0],field[1]) for field in fields] df = df.selectExpr(*exprs) # extract columns for users table print('Creating Users Table') users_columns = ['user_id', 'first_name', 'last_name', 'gender', 'level'] users_table = df.selectExpr(users_columns).distinct() # write users table to parquet files print('Writing to artists table') user_data_path = os.path.join(output_data, "users/") users_table.write.parquet(user_data_path, mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x/1000).strftime('%Y-%m-%d %H:%M:%S')) df = df.withColumn('timestamp', get_timestamp('ts')) #df = df.withColumn('start_time', get_datetime('ts')) # extract columns to create time table print('Creating Time Table') time_table = df.select( col('timestamp').alias('start_time'), hour('timestamp').alias('hour'), dayofmonth('timestamp').alias('day'), weekofyear('timestamp').alias('week'), month('timestamp').alias('month'), year('timestamp').alias('year'), date_format('timestamp', 'u').alias('weekday')).orderBy("start_time").drop_duplicates() # write time table to parquet files partitioned by year and month print('Writing to time table') time_data_path = os.path.join(output_data, "time/") time_table.write.partitionBy("year", "month").parquet(time_data_path, mode="overwrite") # read in song data to use for songplays table print('Reading Songs Table') #song_df = spark.read.parquet(output_data + 'songs/*/*/*.parquet') song_df = spark.read.json(os.path.join(input_data, 'song_data/*/*/*/*.json')).selectExpr("song_id", "title", "artist_id", "artist_name", "year", "duration").drop_duplicates() ''' print('Creating Songs Play Table') songs_logs=df.join(song_df, (df.song == song_df.title)) songplays_table = songs_logs.join(time_table, songs_logs.timestamp == time_table.start_time)\ .drop(songs_logs.year)\ .drop(songs_logs.start_time)\ .withColumn("songplay_id", monotonically_increasing_id()) ''' songplays_table = df.join(song_df, (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration) & (year(df.timestamp) == song_df.year), 'left_outer').select(df.timestamp.alias("start_time"), df.user_id, df.level, song_df.song_id, song_df.artist_id, df.session_id, df.location, df.user_agent, year(df.timestamp).alias('year'), month(df.timestamp).alias('month')).orderBy("start_time", "user_id").withColumn("songplay_id", F.monotonically_increasing_id()) # extract columns from joined song and log datasets to create songplays table songplays_table_col = ['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'] songplays_table = songplays_table.select(songplays_table_col).repartition("year", "month") # write songplays table to parquet files partitioned by year and month print('Writing to songs plays table') songplay_data_path = os.path.join(output_data, "songplay/") songplays_table.write.partitionBy("year", "month").parquet(songplay_data_path, mode="overwrite") print('Done')
def process_log_data(spark, input_data, output_data): ''' Function to read 'log-data' from S3 or local directory, extract 'users', 'time' and 'songplays' tables and load them as parquet files in S3. Each table has its own directory on S3. Parameters: - spark: Spark session object. - input_data: Path to base input data, can be local or S3. - output_data: Path to base output data at S3. Outputs: None ''' # get filepath to log data file log_data = input_data + 'log-data' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df[df['page'] == 'NextSong'] # extract columns for users table users_table = df.select(df.userId.alias('user_id'), \ df.firstName.alias('first_name'), \ df.lastName.alias('last_name'), \ df.gender.alias('gender'), \ df.level.alias('level')).dropDuplicates() # write users table to parquet files users_table.write.mode('append').parquet(output_data + '/users/') # create timestamp column from original timestamp column get_timestamp = udf(lambda d: datetime.fromtimestamp(d / 1000.0), TimestampType()) df = df.withColumn('timestamp', get_timestamp('ts')) # create datetime column from original timestamp column get_datetime = udf(lambda d: datetime.fromtimestamp(d / 1000.0), DateType()) df = df.withColumn('date', get_datetime('ts')) # extract columns to create time table time_table = df.select(df.ts.alias('start_time'), \ hour(df.timestamp).alias('hour'), \ dayofmonth(df.timestamp).alias('day'), \ weekofyear(df.timestamp).alias('week'), \ month(df.timestamp).alias('month'), \ year(df.timestamp).alias('year'), \ dayofweek(df.timestamp).alias('weekday')).dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy( ['year', 'month']).mode('append').parquet(output_data + '/time/') # read in song data to use for songplays table song_df = spark.read.parquet(output_data + '/songs/') # extract columns from joined song and log datasets to create songplays table # songplays_table: songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent songplays_table_join = df.join(song_df, (df.song == song_df.title) & \ (df.length == song_df.duration)).dropDuplicates() songplays_table = songplays_table_join.select(songplays_table_join.ts.alias('start_time'), \ songplays_table_join.userId.alias('user_id'), \ songplays_table_join.level.alias('level'), \ songplays_table_join.song_id.alias('song_id'), \ songplays_table_join.artist_id.alias('artist_id'), \ songplays_table_join.sessionId.alias('session_id'), \ songplays_table_join.location.alias('location'), \ songplays_table_join.userAgent.alias('user_agent'), \ year(songplays_table_join.timestamp).alias('year'), \ month(songplays_table_join.timestamp).alias('month')).dropDuplicates() songplays_table.withColumn('songplay_id', monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy( ['year', 'month']).mode('append').parquet(output_data + '/songplays/')
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = os.path.join(input_data, 'log_data/*/*/*.json') # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where("page = 'NextSong'") # extract columns for users table users_table = df.selectExpr( "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level").filter("user_id <> ''").orderBy("user_id").drop_duplicates() # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users')) # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime( '%Y-%m-%d %H:%M:%S')) df = df.withColumn('timestamp', get_timestamp('ts')) # create datetime column from original timestamp column get_datetime = udf( lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d')) df = df.withColumn('datetime', get_datetime('ts')) # extract columns to create time table time_table = df.select( col('timestamp').alias('start_time'), hour('timestamp').alias('hour'), dayofmonth('timestamp').alias('day'), weekofyear('timestamp').alias('week'), month('timestamp').alias('month'), year('timestamp').alias('year'), date_format( 'timestamp', 'u').alias('weekday')).orderBy("start_time").drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time')) # read in song data to use for songplays table song_df = spark.read.json( os.path.join(input_data, 'song_data/*/*/*/*.json')).selectExpr( "song_id", "title", "artist_id", "artist_name", "year", "duration").drop_duplicates() # extract columns from joined song and log datasets to create songplays table songplays_table = df.join( song_df, (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration) & (year(df.timestamp) == song_df.year), 'left_outer').select(df.timestamp.alias("start_time"), df.userId.alias("user_id"), df.level, song_df.song_id, song_df.artist_id, df.sessionId.alias("session_id"), df.location, df.userAgent.alias("user_agent"), year(df.timestamp).alias('year'), month(df.timestamp).alias('month')).orderBy( "start_time", "user_id").withColumn( "songplay_id", F.monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays'))
def process_log_data(spark, input_data, output_data): """ Function: Extract data from log_data files for user and time tables. From both log_data and song_data files get data for songplays table. Data written into parquet files and load into S3 bucket Prameter list spark: session, spark session has been created. input_data: string of path, a path point to S3 bucket. output_data: string of path, a path point to destination in S3. """ # get filepath to log data file log_data = input_data + 'log_data/*.json' # read log data file df = spark.read.json(log_data) # filter by actions for song plays df_actions = df.filter(df.page == 'NextSong')\ .select('ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'length', 'location', 'userAgent') # extract columns for users table users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates() users_table.createOrReplaceTempView('users') # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'), 'overwrite') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df_actions = df_actions.withColumn('timestamp', get_timestamp(df_actions.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x)/1000))) df_actions = df_actions.withColumn('datetime', get_datetime(df_actions.ts)) # extract columns to create time table time_table = df_actions.select('datetime')\ .withColumn('start_time', df_actions.datetime)\ .withColumn('hour', hour('datetime'))\ .withColumn('day', dayofmonth('datetime'))\ .withColumn('week', weekofyear('datetime'))\ .withColumn('month', month('datetime'))\ .withColumn('year', year('datetime'))\ .withColumn('weekday', dayofweek('datetime'))\ .dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month')\ .parquet(os.path.join(output_data, 'time/time.parquet'), 'overwrite') # read in song data to use for songplays table df_songs = spark.read.json(input_data + 'song_data/*/*/*/*.json') # extract columns from joined song and log datasets to create songplays table df_song_actions = df_actions.join(df_songs, (df_actions['artist'] == df_songs['artist_name']) & (df_actions['song'] == df_songs['title']) & (df_actions['length'] == df_songs['duration']),'inner') songplays_table = df_song_actions.select( col('datetime').alias('start_time'), col('userId').alias('user_id'), col('level').alias('level'), col('song_id').alias('song_id'), col('artist_id').alias('artist_id'), col('sessionId').alias('session_id'), col('location').alias('location'), col('userAgent').alias('user_agent'), year('datetime').alias('year'), month('datetime').alias('month') ).withColumn('songplay_id', monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.createOrReplaceTempView('songplays') time_table = time_table.alias('timetable') songplays_table.write.partitionBy( 'year', 'month' ).parquet(os.path.join(output_data, 'songplays/songplays.parquet'), 'overwrite')
def process_log_data(spark, input_data, output_data): """Obtains log data from json files and processes them using spark, writing data to users_table, time_table, and songplays_table parquet files Keyword arguments spark - spark connection input_data - path to read data from output_data - path to write parquet files to """ # get filepath to log data file log_data = input_data + "/log_data/2018/11/*\.json" # read log data file print("Reading log_data in spark dataframe") df = spark.read.json(log_data) df.printSchema() print(df.count()) # filter by actions for song plays print("Filter on NextSong") df = df.filter(df["page"] == "NextSong") print(df.count()) # extract columns for users table print("extract columns to create users table") users_table = df.select( ["userId", "firstName", "lastName", "gender", "level"]).dropDuplicates() users_table.printSchema() print(users_table.count()) # write users table to parquet files users_table_path = output_data + "/users_table" delete_table_if_exists(users_table_path) print("writing users_table") users_table.write.parquet(users_table_path) # create timestamp column from original timestamp column get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000).strftime( '%Y-%m-%d %H:%M:%S')) df = df.withColumn("timestamp", get_timestamp(df.ts)) timestamp_df = df.select(["timestamp"]).dropDuplicates() timestamp_df.printSchema() print(timestamp_df.count()) # extract columns to create time table time_table = timestamp_df.select( date_format("timestamp", "yyyy-MM-dd HH:mm:ss").alias("start_time"), hour("timestamp").alias("hour"), dayofmonth("timestamp").alias("day"), weekofyear("timestamp").alias("week"), month("timestamp").alias("month"), year("timestamp").alias("year"), dayofweek("timestamp").alias("weekday")) time_table.printSchema() print(time_table.count()) # write time table to parquet files partitioned by year and month time_table_path = output_data + "/time_table" delete_table_if_exists(time_table_path) print("writing time_table") time_table.write.partitionBy("year", "month").parquet(time_table_path) # read in song data to use for songplays table song_table = spark.read.parquet(output_data + "/songs_table") artists_table = spark.read.parquet(output_data + "/artists_table") song_df = song_table.join( artists_table, song_table.artist_id == artists_table.artist_id).drop( song_table.artist_id) print("the join artist and song table") song_df.printSchema() print(song_df.count()) # extract columns from joined song and log datasets to create songplays table df_join_song_df = df.join( song_df, (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration)).dropDuplicates() df_join_song_df = df_join_song_df.withColumn("songplay_id", monotonically_increasing_id()) df_join_song_df.printSchema() print(df_join_song_df.count()) songplays_table = df_join_song_df.select( col("songplay_id"), date_format("timestamp", "yyyy-MM-dd HH:mm:ss").alias("start_time"), col("userId"), col("level"), col("song_id"), col("artist_id"), col("sessionId"), col("artist_location"), col("userAgent")) songplays_table.printSchema() # write songplays table to parquet files partitioned by year and month songplays_table_path = output_data + "/songplays_table" delete_table_if_exists(songplays_table_path) print("writing songplays_table") songplays_table.withColumn("year", year( songplays_table.start_time)).withColumn( "month", month(songplays_table.start_time)).write.partitionBy( "year", "month").parquet(songplays_table_path)
def process_log_data(spark, input_data, output_data): """ This method is in charge of performing the ETL and creation of the tables related to the log_data using parquet, these tables are: users, time and songplays parameters: spark= represent the spark session. input_data = represent the source root path of the data that will be processed output_data = represent the destination root path where the parquet tables will be created. """ # get filepath to log data file log_data = input_data + "/log_data/*" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = filtered_log_data = df.where("page='NextSong'") # extract columns for users table users_columns = [ "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level" ] users_table = df.selectExpr(users_columns).orderBy(desc("ts")).distinct() # write users table to parquet files users_table.write.parquet(output_data + "users.parquet", mode="ignore") # create timestamp column from original timestamp column get_timestamp = udf(lambda epoch_in_millis: int(epoch_in_millis / 1000), LongType()) df = df.withColumn("timestamp", get_timestamp("ts")) # create datetime column from original timestamp column get_datetime = udf( lambda epoch_in_seconds: datetime.fromtimestamp(epoch_in_seconds), TimestampType()) df = df.withColumn("datetime", get_datetime("timestamp")) # extract columns to create time table time_columns = [ col("datetime").alias("start_time"), hour("datetime").alias("hour"), dayofmonth("datetime").alias("day"), weekofyear("datetime").alias("week"), month("datetime").alias("month"), year("datetime").alias("year") ] time_table = df.select(time_columns).distinct() # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + "time.parquet", mode="ignore", partitionBy=["year", "month"]) # read in song data to use for songplays table song_df = spark.read.json(input_data + "/song_data/*/*/*") # extract columns from joined song and log datasets to create songplays table song_plays_columns = [ monotonically_increasing_id().alias('songplay_id'), get_datetime(get_timestamp("ts")).alias("start_time"), col("userId").alias("user_id"), "level", "song_id", "artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent") ] songplays_table = df.join(song_df, df.song == song_df.title, "inner").select(song_plays_columns) # write songplays table to parquet files partitioned by artist_id songplays_table.write.parquet(output_data + "songplays.parquet", mode="ignore", partitionBy="artist_id")
def process_log_data(spark, input_data, output_data): """This function will ingest the data from the log-data path. Also will create the parquets files for the users table and the time table. In the case of the songplays table, it will be joined with the data from the song_path.""" # get filepath to log data file log_data = input_data + "log-data" # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(col("page") == 'NextSong').filter(df.userId.isNotNull()) # extract columns for users table users_table = df.select( col("userId").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level").dropDuplicates() # write users table to parquet files users_table.write.mode("overwrite").parquet(output_data + "users") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn("timestamp", get_timestamp(col("ts"))) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(col("ts"))) # extract columns to create time table time_table = df.select('timestamp', hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), date_format('datetime', 'F').alias('weekday')) # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "time") # read in song data to use for songplays table song_data = input_data + "song_data/*/*/*/*.json" song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table tsFormat = "yyyy/MM/dd HH:MM:ss z" songplays_table = song_df.join( df, song_df.artist_name == df.artist).withColumn( "songplay_id", monotonically_increasing_id()).withColumn( 'start_time', to_timestamp( date_format( (col("ts") / 1000).cast(dataType=TimestampType()), tsFormat), tsFormat)).select("songplay_id", "start_time", col("userId").alias("user_id"), "level", "song_id", "artist_id", col("sessionId").alias("session_id"), col("artist_location").alias("location"), "userAgent", month(col("start_time")).alias("month"), year(col("start_time")).alias("year")) # write songplays table to parquet files partitioned by year and month songplays_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "songplays")
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = os.path.join(input_data, 'log_data/*/*/*.json') # read log data file df = spark.read.json(log_data) # filter by actions for song plays songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location', 'userAgent'] # extract columns for users table users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'].dropDuplicates(['userId']) # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') print("users.parquet completed") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn('timestamp', get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(df.ts)) # extract columns to create time table time_table = df.select( col('datetime').alias('start_time'), hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year')) time_table = time_table.dropDuplicates(['start_time']) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time.parquet'), 'overwrite') print("time.parquet completed") # read in song data to use for songplays table song_data = os.path.join(input_data, "song-data/A/A/A/*.json") song_df = spark.read.json(song_data) # extract columns from joined song and log datasets to create songplays table df = df.join(song_df, song_df.title == df.song) songplays_table = df.select( col('ts').alias('ts'), col('userId').alias('user_id'), col('level').alias('level'), col('song_id').alias('song_id'), col('artist_id').alias('artist_id'), col('ssessionId').alias('session_id'), col('location').alias('location'), col('userAgent').alias('user_agent'), col('year').alias('year'), month('datetime').alias('month')) songplays_table = songplays_table.selectExpr("ts as start_time") songplays_table.select( monotonically_increasing_id().alias('songplay_id')).collect() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays.parquet'), 'overwrite') print("songplays.parquet completed") print("process_log_data completed")
def main(): # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session # job = Job(glueContext) # job.init(args['JOB_NAME'], args) # tao timebegin bang ngay hien tai + 1 nga # time_begin = datetime.date.today() + datetime.timedelta(days=1) #20190908 time_begin = datetime.date(2019, 1, 1) # tao timeend bang ngay hien tai + 1 thang - 1 ngay # time_end = time_begin + relativedelta(months=1) - datetime.timedelta(days=1) time_end = datetime.date(2020, 2, 28) # tao dataframe tu time_begin va time_end data = [(time_begin, time_end)] df = spark.createDataFrame(data, ["minDate", "maxDate"]) # convert kieu dl va ten field df = df.select(df.minDate.cast(DateType()).alias("minDate"), df.maxDate.cast(DateType()).alias("maxDate")) # chay vong lap lay tat ca cac ngay giua mindate va maxdate df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \ .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \ .select("*", f.posexplode("repeat").alias("date", "val")) \ .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \ .select('date') # convert date thanh cac option ngay_thang_nam df = df.withColumn('id', date_format(df.date, "yyyyMMdd")) \ .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \ .withColumn('ngay_trong_tuan', from_unixtime(unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \ .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \ .withColumn('thang', f.month(df.date)) \ .withColumn('quy', f.quarter(df.date)) \ .withColumn('nam', f.year(df.date)) df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1) data_time = DynamicFrame.fromDF(df, glueContext, 'data_time') # convert data data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')]) # chon cac truong va kieu du lieu day vao db applymapping1 = ApplyMapping.apply(frame=data_time, mappings=[("id", "string", "id", "bigint"), ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'), ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"), ("tuan_trong_thang", "int", "tuan_trong_thang", "int"), ("tuan_trong_nam", "int", "tuan_trong_nam", "int"), ("thang", "int", "thang", "int"), ("quy", "int", "quy", "int"), ("nam", "int", "nam", "int"), ("date", "date", "ngay", "timestamp")]) resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3") # ghi dl vao db datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields3, catalog_connection="glue_redshift", connection_options={"dbtable": "student.time_dim", "database": "student_native_report"}, redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/fdfdf", transformation_ctx="datasink4")
def process_log_data(spark, input_data, output_data): """ This function extracts json file for log_data, transform, and save as parquet files to a filepath that has been provided as an arugment. For the purpose of joining tables across the files(song_data and log_data), it extracts the loaded songs tbale which is processed in the process_song_data function. INPUTS: * spark - the spark session to run this function on * input_data - the filepath to the log_data which is resided in S3 bucket * output_data - the filepath to the transformed data which gets stored in another S3 bucket """ # get filepath to log data file log_data = os.path.join(input_data, 'log_data/*/*/*.json') # read log data file log_df = spark.read.json(log_data) # filter by actions for song plays log_df = log_df.filter(log_df.page == "NextSong") # extract columns for users table users_table = log_df.select( ['userId', 'firstName', 'lastName', 'gender', 'level']) # write users table to parquet files users_table.write.mode("overwrite").parquet(output_data + "users") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: x / 1000, TimestampType()) log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType()) log_df = log_df.withColumn("start_time", get_datetime(log_df.timestamp)) # extract columns to create time table time_table = log_df.withColumn("hour", hour("start_time")) \ .withColumn("day", dayofmonth("start_time")) \ .withColumn("week", weekofyear("start_time")) \ .withColumn("month", month("start_time")) \ .withColumn("year", year("start_time")) \ .withColumn("weekday", dayofweek("start_time"))\ .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates() # write time table to parquet files partitioned by year and month time_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "time") # read in song data to use for songplays table song_df = os.path.join(output_data, 'songs/') song_data = spark.read.parquet(song_df) # extract columns from joined song and log datasets to create songplays table joined_table = log_df.join(song_data, log_df.song == song_data.title, how='inner') songplays_table = joined_table.select("start_time",col("userId").alias("user_id"),"level","song_id","artist_id",col("sessionId").alias("session_id"),"location",col("userAgent").alias("user_agent"))\ .withColumn("songplay_id", monotonically_increasing_id()) songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\ .select("songplay_id", songplays_table.start_time,"user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month") # write songplays table to parquet files partitioned by year and month songplays_table.write.mode("overwrite").partitionBy( "year", "month").parquet(output_data + "songplays")
def process_log_data(spark, input_data, output_data): """Function to read raw data, about logs and songs, in json format from S3 putting the data in the right format (indicated schema). Then, from this first table, the dimensions tables users_table and time_table as well as the fact table songplays_table are created and saved into the specified local folder. Args: spark: SparkSession to handle data using Spark; input_data: general path where the data resides locally; output_data: folder's path where the new tables will be stored. """ # get filepath to log data file log_data = os.path.join(input_data, 'log_data') # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.where(df.page == 'NextSong') # extract columns for users table columns_users = [ 'userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level' ] # create users_table and drop duplicated rows users_table = df.selectExpr(columns_users).dropDuplicates() # write users table to parquet files users_table.write.csv(os.path.join(output_data, 'users_table'), mode='overwrite', header=True) # create timestamp column from original timestamp column df = df.withColumn('start_time', F.to_timestamp(df.ts / 1000)) # extract columns to create time table columns_time = [ 'start_time', F.hour('start_time').alias('hour'), F.dayofmonth('start_time').alias('day'), F.weekofyear('start_time').alias('week'), F.month('start_time').alias('month'), F.year('start_time').alias('year'), F.dayofweek('start_time').alias('weekday') ] # create time table and drop duplicated rows time_table = df.select(columns_time).dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.csv(os.path.join(output_data, 'time_table'), mode='overwrite', header=True) # read in song data to use for songplays table song_data = os.path.join(input_data, 'song_data') schema = StructType([ StructField('num_songs', IntegerType()), StructField('artist_id', StringType()), StructField('artist_latitude', FloatType()), StructField('artist_longitude', FloatType()), StructField('artist_location', StringType()), StructField('artist_name', StringType()), StructField('song_id', StringType()), StructField('title', StringType()), StructField('duration', DoubleType()), StructField('year', IntegerType()) ]) song_df = spark.read.option("recursiveFileLookup", "true").json(song_data, schema=schema) # extract columns from joined song and log datasets to create songplays table columns_songplay = [ 'start_time', 'userId as user_id', 'level', 'song_id', 'artist_id', 'sessionId as session_id', 'location', 'userAgent as user_agent' ] condition = [ df.song == song_df.title, df.length == song_df.duration, df.artist == song_df.artist_name ] songplays_table = df.join(song_df, on=condition, how='left_outer').selectExpr(columns_songplay) # create songplay_id column songplays_table = songplays_table.withColumn( 'songplay_id', F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) # write songplays table to parquet files partitioned by year and month songplays_table.write.csv(os.path.join(output_data, 'songplays_table'), mode='overwrite', header=True)
def process_log_data(spark, input_data, output_data): """ Process the log data from S3 storage and create the analytical tables, users table, time tables, and songsplay table. This function read the data in json files from the S3 storage, transforme the data into tha analytcal tables (users, time, and songplays), and write it into partitioned parquet files on S3. Args: spark: the spark session input_data: the S3 bucket to read data from output_data: the S3 bucket to write analytics tables to """ # get filepath to log data file log_data = input_data + "log_data/*/*/*.json" # define the log data schema log_data_schema = R([ Fld("artist", Str(), True), Fld("auth", Str(), False), Fld("firstName", Str(), True), Fld("gender", Str(), True), Fld("itemInSession", Int(), False), Fld("lastName", Str(), True), Fld("length", Dbl(), True), Fld("level", Str(), False), Fld("location", Str(), True), Fld("method", Str(), False), Fld("page", Str(), False), Fld("registration", Dbl(), True), Fld("sessionId", Int(), False), Fld("song", Str(), True), Fld("status", Int(), False), Fld("ts", Dbl(), False), Fld("userAgent", Str(), True), Fld("userId", Str(), True) ]) # read log data file df = spark.read.json(log_data, schema=log_data_schema) # filter by actions for song plays df = df.filter(col("page") == "NextSong") # extract columns for users table users_table = (df.filter((col("userID") != "") & (col("userID").isNotNull())).select( col('userId').alias('user_id'), col('firstName').alias('first_name'), col('lastName').alias('last_name'), "gender", "level").distinct()) # write users table to parquet files users_table.write.parquet(output_data + "users_table.parquet", mode="overwrite") # create datetime column from original timestamp column df = df.withColumn('start_time', to_timestamp(df['ts'] / 1000)) # extract columns to create time table time_table = (df.withColumn("hour", hour("start_time")).withColumn( "day", dayofmonth("start_time")).withColumn( "week", weekofyear("start_time")).withColumn( "month", month("start_time")).withColumn( "year", year("start_time")).withColumn( "weekday", dayofweek("start_time")).select( "start_time", "hour", "day", "week", "month", "year", "weekday").distinct()) # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + "time_table.parquet", mode="overwrite", partitionBy=["year", "month"]) # read in song and artist data to use for songplays table song_df = spark.read.parquet(output_data + "songs_table.parquet") artist_df = spark.read.parquet(output_data + "artists_table.parquet") song_df = (song_df.join(artist_df, "artist_id", "full").select("song_id", "title", "artist_id", "name", "duration")) # join the song data with log data and save it in songplays_table variable songplays_table = df.join(song_df, [ df.song == song_df.title, df.artist == song_df.name, df.length == song_df.duration ], "left") # extract columns from joined song and log datasets to create songplays table songplays_table = (songplays_table.withColumn( "songplay_id", monotonically_increasing_id()).select( "songplay_id", "start_time", col("userId").alias("user_id"), "level", "song_id", "artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent"), ).withColumn("year", year("start_time")).withColumn("month", month("start_time"))) # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(output_data + "songplays_table.parquet", mode="overwrite", partitionBy=["year", "month"])
def process_log_data(spark, input_data, output_data): """ reads JSON file from S3, transforms it, and outputs them back to S3 as partitioned parquet files (as fact and dimensional tables) :param spark: an existing SparkSession :param input_data: S3 bucket directory (e.g., "s3a://udacity-dend/") :param output_data: output directory (e.g., 'output/') """ # get filepath to log data file log_data = input_data + 'log_data/2018/11/*.json' # TEST # TEST # log_data = os.path.join(input_data, 'log_data/*/*/*.json') # FINAL # FINAL # read log data file df = spark.read.json(log_data) # filter by actions for song plays df = df.filter(df['page'] == 'NextSong') # extract columns for users table # dim table: users users_table = df['userId', 'firstName', 'lastName', 'gender', 'level'] users_table = users_table.dropDuplicates(['userId']) # or, shall I do style from here -- https://spark.apache.org/docs/latest/sql-getting-started.html # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users')) # create timestamp column from original timestamp column get_timestamp = udf(lambda ms: datetime.fromtimestamp(ms / 1000.0). strftime('%Y-%m-%d %H:%M:%S')) df = df.withColumn('start_time', get_timestamp(df.ts)) # create datetime column from original timestamp column # Note from mentor: You can ignore the get_datetime part as the timestamp creation is enough # extract columns to create time table # dim table: time # imported functions: year, month, dayofmonth, hour, weekofyear, date_format time_table = df.select(col('start_time'), hour(df.start_time).alias('hour'), dayofmonth(df.start_time).alias('dayofmonth'), month(df.start_time).alias('month'), year(df.start_time).alias('year'), weekofyear(df.start_time).alias('weekofyear') \ # date_format(df.start_time).alias('date_format') -- column not needed, right? ).dropDuplicates() # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time')) # read in song data to use for songplays table song_df = spark.read.parquet(input_data + 'song_data/A/A/B/*.json') # TEST # TEST # song_df = spark.read.parquet(input_data + 'song_data/*/*/*/*.json') # FINAL # FINAL # extract columns from joined song and log datasets to create songplays table # fact table: songplays # LEFT: df, aka log datasets # RIGHT: song_df, aka song datasets, coming from process_log_data() joint_df = df.join(song_df, (df.artist == song_df.artist_name) & (df.length == song_df.duration) & (df.song == song_df.title), 'left_outer' \ ).dropDuplicates() # extract columns from joint_df songplays_table = joint_df.select(col('start_time'), col('userId').alias('user_id'), df.level, song_df.song_id, song_df.artist_id, col('sessionId').alias('session_id'), df.location, col('userAgent').alias('user_agent'), year('start_time').alias('year'), month('start_time').alias('month') \ ).withColumn('songplay_id', monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays'))