def parse_dates(df, format):
    """
    Parses date into year, month, day

    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date', f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
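# A minimal usage sketch for parse_dates (not from the original source): it assumes a
# DataFrame with a string 'transaction_date' column and that pyspark.sql.functions is
# imported as `f`, as in the function above.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([("2021-03-15 10:30:00",)], ["transaction_date"])
parse_dates(sample, "yyyy-MM-dd HH:mm:ss").show()
# expected columns: parsed_date, year, month, day, unix_ts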
def shared_test_partition_preserving(self, func, preserve, create=None):
    from pyspark.sql.functions import month
    from tests.test_data import FORECAST_DATA

    flintContext = self.flintContext

    def create_dataframe():
        return flintContext.read.pandas(make_pdf(FORECAST_DATA, ["time", "id", "forecast"]))

    if create is None:
        create = create_dataframe

    df_lazy = create()

    df_eager = create()
    df_eager.timeSeriesRDD

    df = create()
    df_joined = df.leftJoin(df, right_alias="right")

    df = create()
    df_cached = df.cache()
    df_cached.count()

    df_cached_joined = df_cached.leftJoin(df_cached, right_alias="right")

    partition_preserving_input_transforms = [
        lambda df: df,
        lambda df: df.withColumn("f2", df.forecast * 2),
        lambda df: df.select("time", "id", "forecast"),
        lambda df: df.filter(month(df.time) == 1)
    ]

    order_preserving_input_transforms = [
        lambda df: df.orderBy("time")
    ]

    input_dfs = [df_lazy, df_eager, df_joined, df_cached, df_cached_joined]

    for transform in partition_preserving_input_transforms:
        for input_df in input_dfs:
            self.assert_partition_preserving(transform(input_df), func, preserve)

    for transform in order_preserving_input_transforms:
        for input_df in input_dfs:
            self.assert_order_preserving(transform(input_df), func, preserve)

    df_cached.unpersist()
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the files in the filepath (log_data) to get the
    user, time and songplays info and uses it to populate the users and time dim tables
    and the songplays fact table.

    Arguments:
        input_data: the path where the input json files are present.
        output_data: path where the output parquet files are written to

    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*.json"

    # read log data file
    log_df = spark.read.json(log_data).drop_duplicates()

    # filter by actions for song plays
    log_df = log_df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = log_df.withColumn("rn", row_number().over(
            Window.partitionBy("userId").orderBy(col("ts").desc()))) \
        .where(col("rn") == 1) \
        .select(col('userId'),
                col('firstName').alias('first_name'),
                col('lastName').alias('last_name'),
                col('gender'),
                col('level'))

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + 'analytics/users')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ms: datetime.utcfromtimestamp(ms / 1000), TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = log_df.select('start_time').dropDuplicates().select(
        'start_time',
        hour('start_time').alias('hour'),
        dayofmonth('start_time').alias('day'),
        weekofyear('start_time').alias('week'),
        month('start_time').alias('month'),
        year('start_time').alias('year'),
        dayofweek('start_time').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        'year', 'month').parquet(output_data + 'analytics/time')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json").drop_duplicates()

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = log_df.join(song_df, log_df.artist == song_df.artist_name).drop_duplicates() \
        .select(monotonically_increasing_id().alias('songplay_id'),
                'start_time',
                col('userId').alias('user_Id'),
                'level',
                'song_id', 'artist_id',
                col('sessionId').alias('session_id'),
                'location',
                col('userAgent').alias('user_agent'))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time) \
        .select('songplay_id',
                songplays_table.start_time,
                'user_Id', 'level', 'song_id',
                'artist_id', 'session_id',
                'location', 'user_agent', 'year',
                'month').drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy(
        'year', 'month').parquet(output_data + 'analytics/songplays')
def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # use for only one file
    # filename = 'chicago_taxi_trips_2016_01.csv'
    # use for reading all files
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))

    # df.printSchema()

    # Take a look at the top rows
    # df.limit(5).toPandas()

    # Check initial number of records
    # df.count()

    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp)) \
        .withColumn('month', month(df.trip_start_timestamp)) \
        .withColumn('day', dayofmonth(df.trip_start_timestamp)) \
        .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')

    df_no_nulls = df_features.dropna()

    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')

    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')

    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])

    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)

    # df_encoded.printSchema()

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # save the pickup StringIndexer and model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key, pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # save the dropoff StringIndexer and model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key, dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # save the one-hot encoder and model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # make final dataframe and store back to S3
    df_final = df_encoded.select('year', 'month', 'day',
                                 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']
    output_path = os.path.join(bucket, key)

    df_final.write.partitionBy('year', 'month', 'day') \
        .parquet(output_path, mode='overwrite')
# add new column (withColumn needs a value expression; the literal here is a placeholder)
new_column = df.withColumn("Continent", functions.lit("Europe"))

# rename a column
rename = df.withColumnRenamed("first", "first_name")

# string manipulations ----->> don't forget: from pyspark.sql import functions
df.select(functions.upper(df.country)).show()
df.select(functions.split('email', '@'))

# concatenate strings
df.select(functions.concat_ws(':', 'country', 'first')).collect()

# extract a particular year, date, time from a column
df.select(functions.year('created_at')).show()
df.select(functions.month('created_at')).show()

# filter data
df.filter(col('email').contains('@gmail.com')).show()
df.filter(col('country') == 'Switzerland').show()
df.filter(col('country').isin('Switzerland')).show()
df.filter(col('first').like('T%')).show()
df.filter(col('id').between(1, 10)).show()

# some dataframe api
df.select('country').sort('country').show()
def month(self) -> "ks.Series":
    """
    The month of the timestamp as January = 1, December = 12.
    """
    return column_op(lambda c: F.month(c).cast(LongType()))(
        self._data).alias(self._data.name)
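# A hedged usage sketch for the accessor above, assuming the databricks.koalas package,
# where it is exposed as Series.dt.month (the example data is illustrative only).
import pandas as pd
import databricks.koalas as ks

s = ks.from_pandas(pd.Series(pd.to_datetime(["2021-01-15", "2021-12-31"])))
s.dt.month  # -> 1 and 12, as a LongType-backed ks.Series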
def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    df = df.withColumn("Date", df.Date.cast(T.DateType()))
    return (df.withColumn("Year", F.year(df.Date))
            .withColumn("Month", F.month(df.Date))
            .withColumn("Week", F.weekofyear(df.Date))
            .withColumn("Day", F.dayofmonth(df.Date)))
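# A small usage sketch for expand_date (illustrative, not from the original source):
# it expects a DataFrame with a 'Date' column castable to DateType and assumes the
# usual F/T aliases for pyspark.sql.functions and pyspark.sql.types.
df = spark.createDataFrame([("2015-07-31",), ("2015-08-01",)], ["Date"])
expand_date(df).show()
# adds Year, Month, Week and Day columns derived from Date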
def process_log_data(spark, input_data, output_data):
    """
    Load log data from the S3 bucket, extract columns for the users, time and songplays
    tables and write them to parquet files saved in S3

    :param spark: spark session object
    :param input_data: Path to S3 bucket with log data
    :param output_data: output S3 bucket where parquet files are saved
    :return:
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, LOG_DATA_FILES)

    # read log data file
    actions = spark.read.json(log_data)
    print("Number of rows in action data: %s" % actions.count())

    # filter by actions for song plays
    actions = actions.filter(actions.page == "NextSong")
    print("Filtered rows in action data: %s" % actions.count())

    # extract columns for users table
    users_table = actions.select('userId', 'firstName', 'lastName',
                                 'gender', 'level').dropDuplicates()

    print("users_table writing to parquet")
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: str(int(int(ts) / 1000)))
    actions = actions.withColumn('timestamp', get_timestamp(actions.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: str(datetime.fromtimestamp(int(ts) / 1000)))
    actions = actions.withColumn('datetime', get_datetime(actions.ts))

    # extract columns to create time table
    time_table = actions.select('datetime').withColumn(
        'start_time', actions.datetime).withColumn(
        'hour', hour('datetime')).withColumn(
        'day', dayofmonth('datetime')).withColumn(
        'week', weekofyear('datetime')).withColumn(
        'month', month('datetime')).withColumn(
        'year', year('datetime')).withColumn(
        'weekday', dayofweek('datetime')).dropDuplicates()

    print("time_table writing to parquet")
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    print("reading files for songs df")
    # read in song data to use for songplays table
    songs = spark.read.json(os.path.join(input_data, SONG_DATA_FILES))

    # extract columns from joined song and log datasets to create songplays table
    joined_actions = actions.join(songs, songs.title == actions.song)

    songplays_table = joined_actions['datetime', 'userId', 'level', 'song_id',
                                     'artist_id', 'sessionId', 'location', 'userAgent']
    songplays_table.select(
        monotonically_increasing_id().alias('songplay_id')).collect()

    print("songplays_table writing to parquet")
    # write songplays table to parquet files
    songplays_table.write.parquet(os.path.join(output_data, 'songplays'), 'overwrite')
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the log data from S3 and extracts the following tables:
        1. users table
        2. time table
        3. songplays table

    Parameters:
        spark       : Spark Session from function (create_spark_session)
        input_data  : S3 location of log data files. Files are in json format
        output_data : S3 bucket where dimensional tables are stored in parquet format
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file (the JSON schema is inferred automatically)
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    ### Users Table
    # extract columns for users table
    users_table = df.selectExpr("userId as user_id",
                                "firstName as first_name",
                                "lastName as last_name",
                                "gender",
                                "level")

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    ### Time Table
    # create timestamp column from original timestamp column
    get_timestamp = udf(date_convert, TimestampType())
    df = df.withColumn("start_time", get_timestamp(col("ts")))

    df = df.withColumn("hour", hour("start_time"))
    df = df.withColumn("day", dayofmonth("start_time"))
    df = df.withColumn("month", month("start_time"))
    df = df.withColumn("year", year("start_time"))
    df = df.withColumn("week", weekofyear("start_time"))
    df = df.withColumn("weekday", dayofweek("start_time"))

    # extract columns to create time table
    time_table = df.select(col("start_time"), col("hour"), col("day"), col("week"),
                           col("month"), col("year"), col("weekday")).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + 'time/')

    ### SONGPLAYS
    # read in song data to use for songplays table
    songs_df = spark.read.parquet(output_data + 'songs/*/*/*')
    artists_df = spark.read.parquet(output_data + 'artists/*')

    songs_logs_df = df.join(songs_df, (df.song == songs_df.title))
    artists_songs_logs_df = songs_logs_df.join(artists_df,
                                               (songs_logs_df.artist == artists_df.name))

    # extract columns from joined song and log datasets to create songplays table
    # (drop the log-side duplicates so the year/month/start_time references below are unambiguous)
    songplays_df = artists_songs_logs_df.join(
        time_table,
        artists_songs_logs_df.start_time == time_table.start_time,
        'left'
    ).drop(artists_songs_logs_df.start_time) \
     .drop(artists_songs_logs_df.year) \
     .drop(artists_songs_logs_df.month)

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_df.select(
        col('start_time').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        col('year').alias('year'),
        col('month').alias('month'),
    ).repartition("year", "month")

    songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data):
    """
    Description: Loads log_data from S3 bucket, processes it by extracting the songplays fact
                 table along with user, time and song dimension tables, and then loads it back to S3

    Parameters:
        spark: cursor object (SparkSession)
        input_path: path to the S3 bucket containing log_data
        output_path: path to S3 bucket where the dimensional tables will be stored in parquet format

    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    # log_data = input_data + "log-data-unzipped/*.json" for using data locally

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users_table', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_date = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn('date', get_date(df.ts))

    # extract columns to create time table
    time_table = df.select('start_time').withColumn('year', year(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('weekday', date_format(col('start_time'), 'E')) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('hour', hour(col('start_time'))) \
        .dropDuplicates()
    # dayofweek vs date_format ref:
    # https://stackoverflow.com/questions/25006607/how-to-get-day-of-week-in-sparksql

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + 'time_table', mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"  # "song_data/A/A/A/*.json" for sample data
    # song_data = input_data + "song-data-unzipped/song_data/*/*/*/*.json" for using data locally
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    song_df.createOrReplaceTempView('song_df')
    df.createOrReplaceTempView('log_df')
    time_table.createOrReplaceTempView('time_table')

    songplays_table = spark.sql("""SELECT DISTINCT t.start_time,
                                                   t.year as year,
                                                   t.month as month,
                                                   l.userId,
                                                   l.level,
                                                   s.song_id,
                                                   s.artist_id,
                                                   l.sessionid,
                                                   s.artist_location,
                                                   l.useragent
                                   FROM song_df s
                                   JOIN log_df l
                                       ON s.artist_name = l.artist
                                       AND s.title = l.song
                                       AND s.duration = l.length
                                   JOIN time_table t
                                       ON t.start_time = l.start_time
                                """).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + 'songplays_table', mode='overwrite',
                                  partitionBy=['year', 'month'])
StructField("hashTags", ArrayType(StringType()), True), StructField("lang", StringType(), True), StructField("text", StringType(), True), StructField("createdAt", LongType(), True) ]) (spark.readStream.table("tweets.`bronze`").withColumn( "json", from_json(col("tweet"), schema)).filter( col("json.id").isNotNull()).withColumn( "hashtag", explode("json.hashTags")).withColumn( "hashtag", lower(col("hashtag"))).withColumn( "createdAt", (col("json.createdAt").cast(LongType()) / 1000).cast(TimestampType())).withColumn( "year", year(col("createdAt"))).withColumn( "month", month(col("createdAt"))).withColumn( "day", dayofmonth(col("createdAt"))).select( "json.id", "json.user", "hashtag", "json.lang", "json.text", "createdAt", "year", "month", "day").writeStream.format("delta").option( "checkpointLocation", silverCheckpointPath). outputMode("append").queryName(silverStreamName).table("tweets.`silver`")) # COMMAND ---------- # %sql select * from tweets.`silver` order by createdAt desc limit 10; # COMMAND ----------
def process_log_data(spark, input_data, output_data):
    """Process log data with spark and store output.

    Keyword arguments:
    spark -- spark session object
    input_data -- filepath to input data files
    output_data -- filepath to store output data files
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.select('*').where(df['page'] == 'NextSong')

    # extract columns for users table
    users_table = df.select(df['userId'].alias('user_id'),
                            df['firstName'].alias('first_name'),
                            df['lastName'].alias('last_name'),
                            df['gender'],
                            df['level']).distinct()

    # write users table to parquet files
    users_table.write.parquet(path=output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000, IntegerType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: from_unixtime(x), TimestampType())
    df = df.withColumn('datetime', from_unixtime('start_time'))

    # extract columns to create time table
    time_table = df.select('start_time',
                           hour('datetime').alias('hour'),
                           dayofmonth('datetime').alias('day'),
                           weekofyear('datetime').alias('week'),
                           month('datetime').alias('month'),
                           year('datetime').alias('year'),
                           date_format('datetime', 'u').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(path=output_data + 'time/',
                             partitionBy=('year', 'month'))

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df['song'] == song_df['title']) \
        .select(monotonically_increasing_id().alias('songplay_id'),
                'start_time',
                year('datetime').alias('year'),
                month('datetime').alias('month'),
                df['userId'].alias('user_id'),
                'level',
                'song_id',
                'artist_id',
                df['sessionId'].alias('session_id'),
                'location',
                df['userAgent'].alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(path=output_data + 'songplays/',
                                  partitionBy=('year', 'month'))
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dates').getOrCreate()
df = spark.read.csv('appl_stock.csv', header=True, inferSchema=True)
print(df.select(['Date', 'Open']).show())

# Working with dates
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year,
                                   weekofyear, format_number, date_format)

df.select(dayofmonth(df['Date'])).show()
df.select(hour(df['Date'])).show()
df.select(month(df['Date'])).show()

# Average closing price per year
df.select(year(df['Date'])).show()
newdf = df.withColumn("Year", year(df['Date']))
result = newdf.groupBy("Year").mean().select(["Year", "avg(Close)"])
new = result.withColumnRenamed("avg(Close)", "Average Closing Price")
new.select(
    ['Year', format_number('Average Closing Price', 2).alias("Avg Close")]).show()
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the log_data from S3, processes it using Spark and
    then loads the resulting tables back onto S3 in parquet format.

    Parameters:
        @input: spark - Spark Session
        @input: input_data - location of log_data files
        @input: output_data - S3 location where output files are stored
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('firstName', 'first_name') \
        .withColumnRenamed('lastName', 'last_name')
    users_table = users_table.dropDuplicates(['user_id'])

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("start_time", get_datetime(df.ts))

    # extract columns to create time table
    df = df.withColumn('hour', hour('timestamp'))
    df = df.withColumn('day', dayofmonth('timestamp'))
    df = df.withColumn('week', weekofyear('timestamp'))
    df = df.withColumn('month', month('timestamp'))
    df = df.withColumn('year', year('timestamp'))
    df = df.withColumn('weekday', dayofweek('timestamp'))

    time_table = df.select('start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday')
    time_table = time_table.dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + "time/", 'overwrite')

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.artist == song_df.artist_name, how='left') \
        .select('start_time', 'userId', 'level', 'song_id', 'artist_id',
                'sessionId', 'location', 'userAgent') \
        .withColumn('songplay_id', monotonically_increasing_id()) \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('sessionId', 'session_id') \
        .withColumnRenamed('userAgent', 'user_agent') \
        .withColumn('year', year('start_time')) \
        .withColumn('month', month('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + 'songplays/', 'overwrite')
## collecting date for partitioning
date = data.select(f.max('nav_date')).collect()[0][0]
year = date.year
month = date.month
if month < 10:
    month = '0' + str(month)
part_date = str(year) + str(month)

## extracting weekday column using date
data = data.withColumn('week_day', f.date_format('nav_date', 'E'))

sqlctxt.sql("set hive.exec.dynamic.partition.mode=nonstrict")

funds = data.filter(f.month('nav_date') == 4).select('fund_id').distinct()
funds.registerTempTable("fund_id")
#sqlctxt.sql("insert overwrite table h011gtcsandbox.xnd_pricing_fund_id PARTITION (YYYYMM = " + str(part_date) + ") select * from fund_id")

funds = funds.select(f.collect_set('fund_id')).collect()[0][0]
funds = [str(funds) for funds in funds]

data_without_funds = data.where(~data.fund_id.isin(funds))
funds_miss = data.filter((data.fund_id == '2DEC'))
data_with_funds = data.where(data.fund_id.isin(funds))
data = data_with_funds.unionAll(funds_miss)

## function to convert weekday from string to numerical EX: Monday as 0 and so on
def week_day(x):
    # body assumed from the comment above: date_format(..., 'E') yields 'Mon', 'Tue', ...
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    return days.index(x)
# Transform
transformed_temp_df = cleaned_temp_df \
    .select("dt", "AverageTemperature", "AverageTemperatureUncertainty",
            "City", "Country", "Latitude", "Longitude") \
    .withColumn("dt", udf_parse_datetime("dt")) \
    .withColumnRenamed("AverageTemperature", "avg_temp") \
    .withColumnRenamed("AverageTemperatureUncertainty", "avg_temp_uncertainty") \
    .withColumn("city_code", udf_map_country("country")) \
    .withColumnRenamed("City", "city") \
    .withColumnRenamed("Country", "country") \
    .withColumnRenamed("Latitude", "latitude") \
    .withColumnRenamed("Longitude", "longitude") \
    .withColumnRenamed("dt", "date_time") \
    .withColumn('month', month('date_time')) \
    .withColumn('year', year('date_time'))

transformed_temp_df = transformed_temp_df.filter(transformed_temp_df.city_code != 'null')

# Write
transformed_temp_df.write \
    .partitionBy("city_code", "year", "month") \
    .mode("append") \
    .parquet("{}/transformed/temperature/".format(s3_bucket_name))
def process_i94_data(spark, input_data, output_data, local):
    """
    - Reads i94 fact data from filepath
    - Converts datetime formats to date
    - Reads and joins mappings for countries, visa_categories and us_states
    - Cleans i94addr and age columns
    - Creates duration measure
    - Creates date dimension table
    - Writes data to output location in parquet file format:
        - i94, visa_categories, us_states, dates

    :params spark: spark session
    :params input_data: list of filepaths, file order must match execution steps
    :params output_data: filepath
    """
    ### read i94 data
    df_spark = get_i94_data(spark, input_data[0])

    if local:
        df_spark = df_spark.limit(300)

    ### Datetime conversions
    # register udfs
    udf_date_from_sas = udf(lambda x: convert_sas_date(x), DateType())
    udf_date_from_str = udf(lambda x: convert_str_to_date(x), DateType())

    # add date columns
    df_spark = df_spark \
        .withColumn("arrival_date", udf_date_from_sas("arrdate")) \
        .withColumn("departure_date", udf_date_from_sas("depdate")) \
        .withColumn("dtadfile_date", udf_date_from_str("dtadfile"))

    ### i94cit/res number to iso-code mapping
    # read country data
    df_con = get_country_mapping(spark, input_data[1])

    # join i94cit
    joinExpr = [df_spark.i94cit == df_con.i94_code]
    df_spark = \
        df_spark.join(df_con.select("i94_code", "iso_code"), joinExpr, "left_outer") \
        .withColumn("cit_country_id", coalesce("iso_code", lit(99))).drop("i94_code", "iso_code")

    # join i94res
    joinExpr = [df_spark.i94res == df_con.i94_code]
    df_spark = \
        df_spark.join(df_con.select("i94_code", "iso_code"), joinExpr, "left_outer") \
        .withColumn("res_country_id", coalesce("iso_code", lit(99))).drop("i94_code", "iso_code")

    ### Visatype to visa_id mapping
    # read visa
    df_visa = get_visa_mapping(spark, input_data[2])

    joinExpr = [df_spark.visatype == df_visa.visa]
    df_spark = \
        df_spark.join(df_visa.select("visa", "visa_id").dropna(), joinExpr, "left_outer") \
        .withColumn("visa_id", coalesce("visa_id", lit(1))).drop("visa")

    ### Clean i94addr - US-States
    df_states = get_us_states_mapping(spark, input_data[3])

    joinExpr = [df_spark.i94addr == df_states.state_id]
    df_spark = \
        df_spark.join(df_states.select("state_id").dropna(), joinExpr, "left_outer") \
        .withColumn("state_id_clean", coalesce("state_id", lit(99))).drop("state_id")

    ### Clean i94mode - replace nulls with 9 not reported
    df_spark = df_spark.fillna({"i94mode": 9})

    ### Clean i94visa - travel purpose - replace nulls with 9 not reported
    df_spark = df_spark.fillna({"i94visa": 9})

    ### Clean gender
    udf_clean_gender = udf(lambda x: clean_gender(x), StringType())
    df_spark = df_spark.withColumn("gender_clean", udf_clean_gender("gender"))

    ### Clean Age and register udf
    clean_age = udf(lambda x: clean_negative_age(x), IntegerType())
    df_spark = df_spark.withColumn("age", clean_age("i94bir"))

    ### Create time dimension
    df_dates = df_spark.select(col("arrival_date").alias("date")).dropDuplicates().dropna() \
        .withColumn("year", year("date")) \
        .withColumn("month", month("date")) \
        .withColumn("day", dayofmonth("date")) \
        .withColumn("week", weekofyear("date"))

    # Calculate duration in days: departure - arrival
    df_spark = df_spark.withColumn("duration", datediff("departure_date", "arrival_date"))

    ### Select final fields for fact table
    df_spark = df_spark \
        .withColumn("i94_id", monotonically_increasing_id()) \
        .select(
            "i94_id",
            "cit_country_id",
            "res_country_id",
            col("state_id_clean").alias("state_id"),
            col("i94mode").alias("mode_id"),
            col("i94visa").alias("purpose_id"),
            "visa_id",
            "arrival_date",
            col("cicid").alias("cic_id"),
            col("gender_clean").alias("gender"),
            "count",
            "duration",
            "age",
            col("i94yr").alias("year"),
            col("i94mon").alias("month")
        )

    ### Write out
    df_spark.write.parquet(output_data + "i94.parquet", mode="append",
                           partitionBy=['year', 'month'])
    df_visa.write.parquet(output_data + "visa_categories.parquet", mode="overwrite")
    df_states.write.parquet(output_data + "us_states.parquet", mode="overwrite")
    df_dates.write.parquet(output_data + "dates.parquet", mode="append",
                           partitionBy=['year', 'month'])

    if local:
        df_spark.write.csv(output_data + "i94.csv", header=True, mode="overwrite", sep=";")

        df_visa = df_visa.repartition(1)
        df_visa.write.csv(output_data + "visa_categories.csv", header=True, mode="overwrite", sep=";")

        df_states = df_states.repartition(1)
        df_states.write.parquet(output_data + "us_states.parquet", mode="overwrite")
        df_states.write.csv(output_data + "us_states.csv", header=True, mode="overwrite", sep=";")

        df_dates = df_dates.repartition(1)
        df_dates.write.csv(output_data + "dates.csv", header=True, mode="overwrite", sep=";")
def process_log_data(spark, input_data, output_data):
    """Process log data, create the fact table and save it to S3."""
    log_data = input_data + "log_data/*/*/*"
    log_data = "s3a://data-cap/log_data/*/*/*"
    df = spark.read.format("json").load(log_data)
    df = df.where(df.page == "NextSong")

    def get_ts(x):
        """udf to convert ts to datetime"""
        return datetime.fromtimestamp(x / 1000)

    get_time_stamp = udf(get_ts, TimestampType())
    df = df.withColumn('start_time', get_time_stamp('ts'))
    df = df.withColumn("songplay_id", F.monotonically_increasing_id())
    df.createOrReplaceTempView("log_data")
    df.show()

    # create users table and write to s3
    users_table = df.select(
        F.col("userid").alias("user_id"),
        F.col("firstName").alias("first_name"),
        F.col("lastName").alias("last_name"),
        F.col("gender").alias("gender"),
        F.col("level").alias("level")
    ).distinct()
    users_table.write.parquet(output_data + "users_table", mode='overwrite')
    users_table.show()

    # Create time_table
    time_table = df.select("start_time",
                           F.hour("start_time").alias('hour'),
                           F.dayofmonth("start_time").alias('day'),
                           F.weekofyear("start_time").alias('week'),
                           F.month("start_time").alias('month'),
                           F.year("start_time").alias('year'),
                           F.date_format("start_time", "u").alias('weekday')
                           ).distinct()

    # Write time_table to s3
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table", mode='overwrite')

    # read songs data from s3
    song_df = spark.read.parquet(output_data + "songs_table")
    song_df.createOrReplaceTempView("songs_table")
    song_df.show()

    # read artists data from s3
    artist_df = spark.read.parquet(output_data + "artists_table")
    artist_df.createOrReplaceTempView("artists_table")
    artist_df.show()

    # Create a time_table view for exploratory queries
    time_table.createOrReplaceTempView("time_table")
    time_table.show()

    # Create the fact table by joining logs, songs and artist tables
    songplays_table = spark.sql("""
        SELECT log.start_time,
               log.userid,
               log.level,
               art.artist_id,
               song.song_id,
               log.sessionid,
               log.location,
               log.useragent
        FROM log_data log
        JOIN artists_table art ON (log.artist = art.artist_name)
        JOIN songs_table song ON (song.artist_id = art.artist_id)""")

    # songplays_table.write.partitionBy("userid").parquet(output_data + "songplays_table", mode='overwrite')
    songplays_table.show()
    print(f"number of records in songplays_table {songplays_table.count()}")
def process_log_data(spark, input_data, output_data):
    """Imports the log data. Generates the user table, time table, and songplay table
    and saves them to parquet files."""
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # define log_data_schema
    log_data_schema = StructType([
        StructField('artist', StringType(), True),
        StructField('auth', StringType(), True),
        StructField('firstName', StringType(), True),
        StructField('gender', StringType(), True),
        StructField('itemInSession', IntegerType(), True),
        StructField('lastName', StringType(), True),
        StructField('length', FloatType(), True),
        StructField('level', StringType(), True),
        StructField('location', StringType(), True),
        StructField('method', StringType(), True),
        StructField('page', StringType(), True),
        StructField('registration', FloatType(), True),
        StructField('sessionId', IntegerType(), True),
        StructField('song', StringType(), True),
        StructField('status', StringType(), True),
        StructField('ts', LongType(), True),
        StructField('userAgent', StringType(), True),
        StructField('user_id', IntegerType(), True)
    ])

    # read log data file
    df = spark.read.json(log_data, log_data_schema)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # create a spark sql view of the log data
    df.createOrReplaceTempView("df_log_data")

    # extract columns for users table
    users_table = spark.sql("SELECT DISTINCT user_id, firstName, lastName, \
                             gender, level FROM df_log_data")

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(
        os.path.join(output_data, "users_table"))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))

    # set the df_log_data view to the newly modified version
    df.createOrReplaceTempView("df_log_data")

    # extract columns to create time table using pyspark sql functions
    time_table = spark.sql("SELECT DISTINCT start_time FROM df_log_data")
    time_table = time_table.withColumn("hour", hour("start_time")) \
        .withColumn("day", dayofmonth("start_time")) \
        .withColumn("week", weekofyear("start_time")) \
        .withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time")) \
        .withColumn("weekday", date_format('start_time', 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month") \
        .mode('overwrite').parquet(os.path.join(output_data, "timetable"))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, "songtable"))
    song_df.createOrReplaceTempView("songs_table")

    artist_df = spark.read.parquet(os.path.join(output_data, "artists_table"))
    artist_df.createOrReplaceTempView("artists_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql(
        "SELECT ts, user_id, level, songs_table.song_id, \
                songs_table.artist_id, sessionid, df_log_data.location, userAgent \
         FROM df_log_data \
         JOIN songs_table \
             ON df_log_data.song = songs_table.title \
         JOIN artists_table \
             ON df_log_data.artist = artists_table.name")

    songplays_table = songplays_table.withColumn("start_time", get_timestamp('ts'))
    songplays_table = songplays_table.withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time"))

    # drop ts column
    songplays_table = songplays_table.drop("ts")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode('overwrite') \
        .parquet(os.path.join(output_data, "songplays_table"))
def select_range_time(df, day_ini, day_fin):
    # Keep rows whose created_at date falls between day_ini and day_fin (inclusive).
    # Comparing year/month/day components independently breaks for ranges that span
    # month or year boundaries, so compare the full date instead
    # (to_date comes from pyspark.sql.functions).
    df_ret = df.filter(to_date("created_at") >= day_ini) \
               .filter(to_date("created_at") <= day_fin)
    return df_ret
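# Hypothetical usage of select_range_time, assuming df carries a 'created_at'
# timestamp column; the boundaries span a month, which is exactly the case that
# requires a full-date comparison.
from datetime import date

recent = select_range_time(df, date(2021, 1, 15), date(2021, 3, 10))
recent.count()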
def process_log_data(spark, input_data, output_data):
    """
    In this function we load the log_data files and create the songplays, users and time tables.

    Input: SparkSession,
           input_data filepath for log data,
           output_data filepath for the output tables
    Output: We produce parquet files for the songplays, users and time tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']

    # drop duplicates
    users_table = users_table.drop_duplicates(subset=['userId'])

    # write users table to parquet files
    users_table.write.partitionBy('userId').parquet(
        os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users_table partitioned!")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: tstodatetime(x))
    df = df.withColumn('daytime', get_timestamp(col("ts")))

    # extract columns to create time table
    time_table = df.select(
        col("ts").alias('start_time'),
        year('daytime').alias('year'),
        month('daytime').alias('month'),
        dayofmonth('daytime').alias('day'),
        hour('daytime').alias('hour'),
        weekofyear('daytime').alias('weekofyear'))
    # We are going to partition later in the code!

    # read in song data to use for songplays table
    sqlContext = SQLContext(spark)
    songs_table = sqlContext.read.parquet('data/outputs/song_data/songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location', 'userAgent', 'song']

    # add artist id and song id by joining with songs_table
    songplays_table = songplays_table.alias('s').join(
            songs_table.alias('e'), col('e.title') == col('s.song')) \
        .select(col('s.ts').alias('start_time'),
                col('s.userId'),
                col('s.level'),
                col('s.sessionId'),
                col('s.location'),
                col('s.userAgent'),
                col('s.song'),
                col('e.artist_id').alias('artist_id'),
                col('e.song_id').alias('song_id'))

    # add month and year for partitioning later based on those
    time_table_short = time_table['start_time', 'month', 'year']
    songplays_table = songplays_table.alias('s').join(
            time_table_short.alias('t'), col('t.start_time') == col('s.start_time')) \
        .select(col('s.start_time'),
                col('s.userId'),
                col('s.level'),
                col('s.sessionId'),
                col('s.location'),
                col('s.userAgent'),
                col('s.song'),
                col('s.artist_id'),
                col('s.song_id'),
                col('t.year'),
                col('t.month'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'times.parquet'), 'overwrite')
    print("time_table partitioned!")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
    print("songplays_table partitioned!")
def process_log_data(spark, input_data, output_data):
    """
    Process the Log dataset of files in JSON format and create the users, time and songplays
    dimension tables in Spark space. The tables are then written in parquet format to the
    output_data location.

    :param spark: a sparkSession object
    :param input_data: The URI or local location of input datasets
    :param output_data: the URI of S3 bucket or local location for the output files
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter("page = 'NextSong'")

    # extract columns for users table
    users = df.select(["ts", "userId", "firstName", "lastName", "gender", "level"]) \
        .withColumnRenamed("userId", "user_id") \
        .withColumnRenamed("firstName", "first_name") \
        .withColumnRenamed("lastName", "last_name")

    users.createOrReplaceTempView("users")
    users_table = spark.sql(
        """
        SELECT DISTINCT user_id, first_name, last_name, gender, level
        FROM users
        WHERE user_id IS NOT NULL
        """
    )

    # write users table to parquet files
    users_table.write.parquet(output_data + "users", mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: date.fromtimestamp(x / 1000.0), DateType())
    df = df.withColumn("date", get_datetime("ts"))

    # extract columns to create time table
    time_table = df.select("start_time",
                           hour("date").alias("hour"),
                           dayofmonth("date").alias("day"),
                           weekofyear("date").alias("week"),
                           month("date").alias("month"),
                           year("date").alias("year"),
                           dayofweek("date").alias("weekday")
                           ).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time",
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data and register it so the songplays query below can join against it
    # (path pattern assumed to match the song dataset layout used elsewhere in this project)
    songs = spark.read.json(input_data + "song_data/*/*/*/*.json")
    songs.createOrReplaceTempView("staging_songs")

    # stage the filtered log events together with month and year
    song_df = df.select("artist", "song", "length", "page", "start_time",
                        "userId", "level", "sessionId", "location", "userAgent",
                        month("date").alias("month"),
                        year("date").alias("year"),
                        )

    # extract columns from joined song and log datasets to create songplays table
    song_df.createOrReplaceTempView("staging_events")
    songplays_table = spark.sql(
        """
        SELECT
            row_number() OVER (PARTITION BY start_time ORDER BY start_time) as songplay_id,
            e.start_time,
            e.userId AS user_id,
            e.level AS level,
            s.song_id AS song_id,
            s.artist_id AS artist_id,
            e.sessionId AS session_id,
            e.location AS location,
            e.userAgent AS user_agent,
            e.year,
            e.month
        FROM staging_events e
        LEFT JOIN staging_songs s
            ON e.song = s.title
            AND e.artist = s.artist_name
            AND e.length = s.duration
        """
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays",
                                  mode='overwrite',
                                  partitionBy=["year", "month"])
# There's an API named agg(*exprs) that takes a list of column names and expressions for the
# type of aggregation you'd like to compute.
# You can leverage the built-in functions mentioned above as part of the expressions for each column.
# Provide the min, count, and avg and groupBy the location column. Display the results.
agg_df = df.groupBy("location").agg(F.min("id"), F.count("id"), F.avg("date_diff"))

display(agg_df)

# COMMAND ----------

# DBTITLE 1,I'd like to write out the DataFrames to Parquet, but would like to partition on a particular column.
# You can use the following APIs to accomplish this.
# Ensure the code does not create a large number of partition columns with the datasets,
# otherwise the overhead of the metadata can cause significant slowdowns.
# If there is a SQL table backed by this directory, you will need to call
# `REFRESH TABLE <table-name>` to update the metadata prior to the query.
df = df.withColumn('end_month', F.month('end_date'))
df = df.withColumn('end_year', F.year('end_date'))
df.write.partitionBy("end_year", "end_month").parquet("/tmp/sample_table")

display(dbutils.fs.ls("/tmp/sample_table"))

# COMMAND ----------

# DBTITLE 1,How do I properly handle cases where I want to filter out NULL data?
null_item_schema = StructType([
    StructField("col1", StringType(), True),
    StructField("col2", IntegerType(), True)
])

null_df = spark.createDataFrame([("test", 1), (None, 2)], null_item_schema)

display(null_df.filter("col1 IS NOT NULL"))

# COMMAND ----------
def process_log_data(spark, input_data, output_data):
    """
    Function to load source data and process the data.
    In the below function we are processing the 'log_data' and creating our Fact table:
    songplays, and also the dimension tables: time and users, in parquet format.

    :param spark: SparkSession object
    :param input_data: Source data (log_data)
    :param output_data: Data destination
    :return: None
    """
    # load data into dataframe
    log_data = input_data + "log_data/*/*"
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users_table
    users_table = (df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct())

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + '/users')

    # create timestamp column from original timestamp column
    df = df.withColumn(
        "ts_timestamp",
        F.to_timestamp(
            F.from_unixtime((col("ts") / 1000), 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    def get_weekday(date):
        import datetime
        import calendar

        date = date.strftime("%m-%d-%Y")  # , %H:%M:%S
        month, day, year = (int(x) for x in date.split('-'))
        weekday = datetime.date(year, month, day)
        return calendar.day_name[weekday.weekday()]

    udf_week_day = udf(get_weekday, T.StringType())

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour(col("ts_timestamp")))
                  .withColumn("day", dayofmonth(col("ts_timestamp")))
                  .withColumn("week", weekofyear(col("ts_timestamp")))
                  .withColumn("month", month(col("ts_timestamp")))
                  .withColumn("year", year(col("ts_timestamp")))
                  .withColumn("weekday", udf_week_day(col("ts_timestamp")))
                  .select(
                      col("ts_timestamp").alias("start_time"),
                      col("hour"), col("day"), col("week"),
                      col("month"), col("year"), col("weekday")))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "/time")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(os.path.join(output_data, "songs/*/*/*"))

    songs_logs = df.join(songs_df, (df.song == songs_df.title))

    # extract columns from joined song and log datasets
    # to create songplays table
    artists_df = spark.read.parquet(os.path.join(output_data, "artists"))

    artists_songs_logs = songs_logs.alias('a').join(
        artists_df.alias('t'),
        (songs_logs.artist == artists_df.name) |
        (songs_logs.location == artists_df.location),
        'left')

    songplays = artists_songs_logs.join(
        time_table,
        artists_songs_logs.ts_timestamp == time_table.start_time,
        'left')

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays.select(
        col('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('a.location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month'),
    ).distinct().repartition("year", "month")

    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + '/songplays')
def process_log_data(spark, input_data, output_data):
    """
    Processes a log file. Writes time, users and songplay tables to S3.

    Arguments:
        input_data -- input S3 directory with `song` and `log` files
        output_data -- output S3 directory
    """
    print("Read log data")
    # read log data file
    df_log_data = spark.read.json(input_data + "log-data/*/*/*.json")

    # filter by actions for song plays
    df_log_data = df_log_data[df_log_data['page'] == 'NextSong']

    # extract columns for users table
    users_table = df_log_data[[
        'userId', 'firstName', 'lastName', 'gender', 'level'
    ]].drop_duplicates()

    print("Write...")
    # write users table to parquet files
    users_table.write.save(path=output_data + 'users_table',
                           format='parquet',
                           mode='overwrite')

    df_log_data = df_log_data.withColumn('timestamp', F.from_unixtime(df_log_data['ts'] / 1000)) \
        .withColumn('hour', F.hour(F.col('timestamp'))) \
        .withColumn('day', F.dayofmonth(F.col('timestamp'))) \
        .withColumn('month', F.month(F.col('timestamp'))) \
        .withColumn('year', F.year(F.col('timestamp'))) \
        .withColumn('weekofyear', F.weekofyear(F.col('timestamp'))) \
        .withColumn('dayofweek', F.dayofweek(F.col('timestamp')))

    # extract columns to create time table
    time_table = df_log_data[[
        'timestamp',
        'hour',
        'day',
        'month',
        'year',
        'weekofyear',
        'dayofweek',
    ]].drop_duplicates()

    print("Write...")
    # write time table to parquet files partitioned by year and month
    time_table.write.save(path=output_data + 'time_table',
                          format='parquet',
                          mode='overwrite',
                          partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    df_song = spark.read.json(input_data + "song_data/*/*/*/*.json",
                              schema=build_song_schema())

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log_data.join(
        df_song,
        on=(df_song['title'] == df_log_data['song']) &
           (df_song['artist_name'] == df_log_data['artist']) &
           (df_song['duration'] == df_log_data['length'])
    )

    print("Write...")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.save(path=output_data + 'songplays_table',
                               format='parquet',
                               mode='overwrite',
                               partitionBy=['year', 'month'])
def process_log_data(spark, input_data, output_data):
    """
    Extract raw log data, transform into tables for users, songplays, and time,
    and save into parquet files.

    Arguments:
        spark: The SparkSession object
        input_data: Path to input data where log_data is placed
        output_data: Output path where songplays.parquet, users.parquet and time.parquet will be saved
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name',
                                'lastName as last_name', 'gender', 'level') \
        .dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/users.parquet', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000)
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    # (the udf above returns a string, so cast it back to float before converting)
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(float(x))))
    df = df.withColumn('datetime', get_datetime(df.timestamp))

    # extract columns to create time table
    time_table = df.withColumn('hour', hour('datetime')) \
        .withColumn('day', dayofmonth('datetime')) \
        .withColumn('week', weekofyear('datetime')) \
        .withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .withColumn('weekday', date_format('datetime', 'E')) \
        .select(['ts', 'hour', 'day', 'week', 'month', 'year', 'weekday']) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
        .parquet(output_data + 'time/time.parquet', mode='overwrite')

    # read in song data to use for songplays table
    spark.read.json(os.path.join(input_data, 'song_data')) \
        .createOrReplaceTempView('songs')

    # extract columns from joined song and log datasets to create songplays table
    df.withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .createOrReplaceTempView('log_data')

    songplays_table = spark.sql("""
        SELECT
            l.ts as start_time,
            l.userId as user_id,
            l.level,
            s.song_id,
            s.artist_id,
            l.sessionId as session_id,
            l.location,
            l.userAgent as user_agent,
            l.year,
            l.month
        FROM log_data l
        LEFT JOIN songs s ON s.artist_name = l.artist AND s.title = l.song
    """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month') \
        .parquet(output_data + 'songplays/songplays.parquet', mode='overwrite')
def process_log_data(spark, input_data, output_data):
    """
    Reads log data from S3 and writes results back to S3 as parquet files.

    :param spark: a sparkSession object
    :param input_data: S3 bucket/directory for log data json files
    :param output_data: S3 bucket where result parquet files are written
    :returns: Nothing
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file and filter by actions for songplays
    df = spark.read.json(log_data)
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # write users table to parquet files
    window = Window \
        .partitionBy('userId') \
        .orderBy(df['ts'].desc())

    users_table = df.withColumn('order_users', F.rank().over(window)) \
        .filter('order_users=1') \
        .select('userId', 'firstName', 'lastName', 'gender', 'level')

    users_table \
        .write \
        .format('parquet') \
        .save(output_data + "users_table.parquet")

    # create timestamp column from original timestamp column
    # extract columns to create time table
    # write time table to parquet files partitioned by year and month
    time_table = df.select('ts').distinct()
    time_table = time_table.withColumn('start_time', F.from_unixtime(time_table['ts'] / 1000)) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .withColumn('day_of_month', F.dayofmonth('start_time')) \
        .withColumn('day_of_week', F.dayofweek('start_time')) \
        .withColumn('week', F.weekofyear('start_time')) \
        .withColumn('hour', F.hour('start_time'))

    time_table \
        .write \
        .partitionBy('year', 'month') \
        .format("parquet") \
        .save(output_data + "time_table.parquet")

    # read in song data to use for songplays table
    song_data = input_data + 'song_data/*/*/*/*.json'
    songs_df = spark.read.json(song_data)

    # join song and log datasets to create songplays table and extract columns
    # write songplays table to parquet files partitioned by year and month
    joinExpression = [
        df.song == songs_df.title,
        df.length == songs_df.duration,
        df.artist == songs_df.artist_name
    ]
    joinType = "inner"

    songplays_table = df.join(songs_df, joinExpression, joinType) \
        .select('ts', 'userId', 'level', 'song_id', 'artist_id',
                'sessionId', 'location', 'userAgent')

    # derive year and month from the songplays table's own ts column
    songplays_table = songplays_table \
        .withColumn('year', F.year(F.from_unixtime(songplays_table['ts'] / 1000))) \
        .withColumn('month', F.month(F.from_unixtime(songplays_table['ts'] / 1000))) \
        .withColumn('songplay_id', F.monotonically_increasing_id())

    songplays_table \
        .write \
        .partitionBy('year', 'month') \
        .format('parquet') \
        .save(output_data + 'songplays_table.parquet')
os.getenv("HOME") + "/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar") conf.set("spark.executor.extrajavaoptions", "-Xmx15000m") conf.set("spark.executor.memory", "15g") conf.set("spark.driver.memory", "15g") conf.set("spark.storage.memoryFraction", "0") spark = SparkSession.builder \ .config(conf=conf) \ .master("local") \ .appName("SAIDI Calculator") \ .getOrCreate() config = open('config.yaml') config = yaml.load(config) #connect to the database pw_df = spark.read.jdbc( "jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch", "pw_dedupe", properties={ "user": config['user'], "password": config['password'], "driver": "org.postgresql.Driver" }) #read the data that we care about pw_df = pw_df.select(pw_df['core_id'], pw_df['time'], pw_df['product_id']) pw_df = pw_df.filter("product_id = 7008 OR product_id= 7009") pw_df.groupBy(month("time")).agg(countDistinct('core_id')).show()
def test_month(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(f.month(f.col('a'))))
def process_log_data(spark, input_data, output_data):
    """
    Load and extract log data from the source data and save the resulting tables back to S3

    param spark       : the Spark Session
    param input_data  : the source location of log_data
    param output_data : the destination where the results are saved
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data).dropDuplicates()

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').dropDuplicates()

    # created log view
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    df.createOrReplaceTempView("log_data_table")

    # extract columns for users table
    users_table = spark.sql("""
        SELECT DISTINCT userT.userId AS user_id,
                        userT.firstName AS first_name,
                        userT.lastName AS last_name,
                        userT.gender AS gender,
                        userT.level AS level
        FROM log_data_table userT
        WHERE userT.userId IS NOT NULL
    """)

    # write users table to parquet files
    #users_table.write.mode('overwrite').parquet(output_data + 'users_table/')
    users_table.write.parquet(os.path.join(output_data, "users_table"), mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())

    df = df.withColumn("hour", hour(get_timestamp(df.ts))) \
        .withColumn("day", dayofmonth(get_timestamp(df.ts))) \
        .withColumn("week", weekofyear(get_timestamp(df.ts))) \
        .withColumn("month", month(get_timestamp(df.ts))) \
        .withColumn("year", year(get_timestamp(df.ts))) \
        .withColumn("weekday", dayofweek(get_timestamp(df.ts)))

    # extract columns to create time table
    time_table = df.select(["ts", "hour", "day", "week", "month", "year", "weekday"]) \
        .withColumnRenamed("ts", "start_time")

    # write time table to parquet files partitioned by year and month
    #time_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'time_table/')
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "time_table"), mode="overwrite")

    # created song view
    song_data_view = input_data + "song_data/*/*/*/*.json"
    Song_df = spark.read.json(song_data_view).dropDuplicates()
    Song_df.createOrReplaceTempView("song_data_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT logT.songplay_id AS songplay_id,
               to_timestamp(logT.ts/1000) AS start_time,
               month(to_timestamp(logT.ts/1000)) AS month,
               year(to_timestamp(logT.ts/1000)) AS year,
               logT.userId AS user_id,
               logT.level AS level,
               songT.song_id AS song_id,
               songT.artist_id AS artist_id,
               logT.sessionId AS session_id,
               logT.location AS location,
               logT.userAgent AS user_agent
        FROM log_data_table logT
        INNER JOIN song_data_table songT
            ON logT.artist = songT.artist_name
            AND logT.song = songT.title
            AND logT.length = songT.duration
    """)

    # write songplays table to parquet files partitioned by year and month
    #songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "songplays_table"), mode="overwrite")
# In[30]:

# 2. Add a date column
orders_2 = orders_1.withColumn('Date', convertToDate(orders_1['OrderDate']))

# In[31]:

orders_2.show(2)

# In[32]:

# 3. Add month and year using the built-in functions
# (the earlier UDF versions are kept here, commented out, for reference)
#orders_3 = orders_2.withColumn('Month',getMonth(orders_2['Date'])).withColumn('Year',getYear(orders_2['Date']))
#orders_3 = orders_2.withColumn('Month',getM(orders_2['Date'])).withColumn('Year',getY(orders_2['Date']))
orders_3 = orders_2.withColumn('Month', F.month(orders_2['Date'])).withColumn('Year', F.year(orders_2['Date']))

# In[33]:

orders_3.show(5)

# In[34]:

# 4. How many orders by month/year?
import time
start_time = time.time()
orders_3.groupBy("Year", "Month").sum('Total').show()
print("%s Elapsed : %f" % (datetime.today(), time.time() - start_time))
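# In[35]:

# Hypothetical comparison cell (not in the original notebook): time the
# built-in F.month/F.year against the UDF versions, assuming getM/getY are the
# UDF helpers defined earlier in this notebook. Built-ins are evaluated inside
# the JVM and usually come out ahead of Python UDFs.
variants = [
    ("builtin", orders_2.withColumn('Month', F.month(orders_2['Date']))
                        .withColumn('Year', F.year(orders_2['Date']))),
    ("udf", orders_2.withColumn('Month', getM(orders_2['Date']))
                    .withColumn('Year', getY(orders_2['Date']))),
]
for label, frame in variants:
    start_time = time.time()
    frame.groupBy("Year", "Month").count().collect()
    print("%s elapsed : %f" % (label, time.time() - start_time))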
collectibles_df.createOrReplaceTempView("collectibles")
collectibles_df = spark.sql(
    "SELECT ROW_NUMBER() OVER(ORDER BY Collectible) as Id, * FROM collectibles"
)

# Generate dim_Time feed from the union of all source timestamps
combined_timestamp = source_glasses.select("timestamp") \
    .union(source_report.select("timestamp")) \
    .union(source_smartphone.select("timestamp")) \
    .union(source_smartwatch.select("timestamp"))

time_df = combined_timestamp.select("timestamp") \
    .where(col("timestamp").isNotNull()) \
    .distinct() \
    .orderBy("timestamp")

time_df = time_df.withColumn("Year", year(time_df["timestamp"])) \
    .withColumn("Month", month(time_df["timestamp"])) \
    .withColumn("Day", dayofmonth(time_df["timestamp"])) \
    .withColumn("Hour", hour(time_df["timestamp"])) \
    .withColumn("Minute", minute(time_df["timestamp"])) \
    .withColumn("Second", second(time_df["timestamp"]))

# prepare glasses activities: one row set per accelerometer axis
glasses_activities_acc_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_X").alias("Collectible"), "timestamp", "ACC_X"])

glasses_activities_acc_y = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_Y").alias("Collectible"), "timestamp", "ACC_Y"])

glasses_activities_acc_z = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_Z").alias("Collectible"), "timestamp", "ACC_Z"])
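# Hypothetical follow-up sketch (not in the original script): if dim_Time also
# needs a surrogate key, the same ROW_NUMBER pattern used for the collectibles
# dimension above applies directly.
time_df.createOrReplaceTempView("dim_time")
time_df = spark.sql(
    "SELECT ROW_NUMBER() OVER(ORDER BY timestamp) as Id, * FROM dim_time"
)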
    ) \
    .withColumn('eyesopen',
        F.from_json(
            F.get_json_object('data', '$.facedetails[*].eyesopen'),
            StructType().add('confidence', DoubleType()).add('value', BooleanType())
        )
    ) \
    .withColumn('mouthopen',
        F.from_json(
            F.get_json_object('data', '$.facedetails[*].mouthopen'),
            StructType().add('confidence', DoubleType()).add('value', BooleanType())
        )
    ) \
    .drop('ts') \
    .withColumnRenamed('n_ts', 'ts') \
    .withColumn('year', F.year('ts')) \
    .withColumn('month', F.month('ts'))

## Sometimes we need to distribute the data based on a specific column; higher cardinality is better.
## To see the number of Spark partitions being used: df.rdd.getNumPartitions()
df = df.repartition('ts')

## Finally write the data back out to S3 in partitioned Parquet format.
## maxRecordsPerFile is recommended over the old method of using coalesce().
df \
    .withColumn('smiling', df.smile.value) \
    .write \
    .option('maxRecordsPerFile', 1000) \
    .partitionBy('year', 'month', 'smiling') \
    .mode('overwrite') \
    .parquet('s3://bucket/prefix')
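## Read-back sketch (hypothetical, using the same path as the write above):
## columns written with partitionBy can be pruned at read time, so a filter on
## year/month only touches the matching S3 prefixes. The 2020/6 values are
## placeholder examples, not real data.
faces = spark.read.parquet('s3://bucket/prefix')
faces.filter((faces.year == 2020) & (faces.month == 6)).count()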
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    #log_data = os.path.join(input_data, 'log_data/2018/11/2018-11-30-events.json')
    log_data = os.path.join(input_data, "log-data/*/*/*.json")

    # read log data file
    print(log_data)
    df = spark.read.json(log_data)
    df.printSchema()

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level')

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table.parquet')

    # create timestamp column from the original epoch-millisecond column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp(x / 1000.0), T.TimestampType())
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    df = df.withColumn('datetime', from_unixtime(F.col('ts') / 1000))

    # extract columns to create time table
    time_table = df.select('datetime',
                           hour('datetime').alias('hour'),
                           dayofmonth('datetime').alias('day'),
                           weekofyear('datetime').alias('week'),
                           month('datetime').alias('month'),
                           year('datetime').alias('year'),
                           date_format('timestamp', 'u').alias('weekday'),
                           'ts')

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time_table.parquet')

    # read back the song, artist, and time tables written earlier
    song_table = spark.read.parquet(output_data + 'songs_table.parquet')
    artists_table = spark.read.parquet(output_data + 'artists_table.parquet')
    time_table = spark.read.parquet(output_data + 'time_table.parquet')

    # join songs and artists on artist_id
    song_table = song_table.withColumnRenamed("artist_id", "artistId")
    condition = [song_table.artistId == artists_table.artist_id]
    songs_artists_table = song_table.join(artists_table, condition)
    songs_artists_table.show(2)

    # join the combined song/artist data back to the log events
    condition = [
        songs_artists_table.duration == df.length,
        songs_artists_table.title == df.song,
        songs_artists_table.artist_name == df.artist
    ]
    song_log_data = songs_artists_table.join(df, condition)
    song_log_data.printSchema()
    #condition = [song_log_data.ts == time_table.start_time]
    #song_long_time_data = song_log_data.join(time_table, condition)

    # extract columns from joined song and log datasets to create songplays table
    song_log_data = song_log_data.withColumn('datetime', from_unixtime(F.col('ts') / 1000))
    songplays_table = song_log_data.select(
        monotonically_increasing_id().alias('songplay_id'),
        'datetime',
        'userId',
        'level',
        'song_id',
        'artist_id',
        'sessionId',
        'location',
        'userAgent',
        month('datetime').alias('month'),
        year('datetime').alias('year'))
    songplays_table.printSchema()
    songplays_table.show(2)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays_table.parquet')
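# Side note (a sketch, not part of the pipeline above): on Spark 2.x,
# date_format(..., 'u') returns the ISO day of week as a string ('1' = Monday),
# while the built-in dayofweek() returns an integer with 1 = Sunday, so the two
# encodings are offset by one day. 2018-11-05 below is a Monday.
from pyspark.sql.functions import dayofweek
spark.range(1) \
    .select(F.lit('2018-11-05').cast('timestamp').alias('t')) \
    .select(date_format('t', 'u').alias('u_weekday'),
            dayofweek('t').alias('dow')) \
    .show()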