def parse_dates(df, format):
    """
    Parses the date into year, month, and day columns.

    :param df: input dataframe
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date', f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
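# A minimal usage sketch for parse_dates (hypothetical sample data; assumes `f` is
# an alias for pyspark.sql.functions and an active SparkSession named `spark`):
sample_df = spark.createDataFrame(
    [("2021-03-15 10:30:00",)], ["transaction_date"])
parse_dates(sample_df, "yyyy-MM-dd HH:mm:ss").show()
# expected columns: parsed_date, year=2021, month=3, day=15, unix_ts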
def process_log_data(spark, input_data, output_data):
    """
    Processes all event logs of the Sparkify app.

    :param spark: active SparkSession
    :param input_data: path to the input data
    :param output_data: path to the output data
    :return: None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table (keep ts so the latest record per user can be selected)
    users_table = (df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level'),
        col('ts')).distinct())
    users_table = users_table.orderBy(
        "ts", ascending=False).dropDuplicates(subset=["user_id"]).drop('ts')

    # write users table to parquet files
    #users_table.write.parquet(output_data + "users.parquet", mode="overwrite")
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    df = df.withColumn(
        "ts_timestamp",
        F.to_timestamp(
            F.from_unixtime((col("ts") / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    def get_weekday(date):
        """
        Returns the weekday name for a given date.

        :param date: datetime value
        :return weekday: weekday name, e.g. 'Monday'
        """
        import datetime
        import calendar

        date = date.strftime("%m-%d-%Y")  # , %H:%M:%S
        month, day, year = (int(x) for x in date.split('-'))
        weekday = datetime.date(year, month, day)
        return calendar.day_name[weekday.weekday()]

    udf_week_day = udf(get_weekday, T.StringType())

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour(col("ts_timestamp"))).withColumn(
        "day", dayofmonth(col("ts_timestamp"))).withColumn(
            "week", weekofyear(col("ts_timestamp"))).withColumn(
                "month", month(col("ts_timestamp"))).withColumn(
                    "year", year(col("ts_timestamp"))).withColumn(
                        "weekday", udf_week_day(col("ts_timestamp"))).select(
                            col("ts_timestamp").alias("start_time"),
                            col("hour"), col("day"), col("week"), col("month"),
                            col("year"), col("weekday")).distinct())
    time_table = time_table.drop_duplicates(subset=['start_time'])

    # write time table to parquet files partitioned by year and month
    #time_table.write.parquet(output_data + "time.parquet", mode="overwrite")
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (df.withColumn(
        "songplay_id", F.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                "songplay_id",
                col("ts_timestamp").alias("start_time"),
                col("userId").alias("user_id"), "level", "song_id",
                "artist_id",
                col("sessionId").alias("session_id"), "location",
                col("userAgent").alias("user_agent")).distinct())

    # write songplays table to parquet files
    #songplays_table.write.parquet(output_data + "songplays.parquet", mode="overwrite")
    songplays_table.write.parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
from pyspark.sql import functions as F from pyspark.sql import types as T from pyspark.sql import Window, Row # File paths source_user_path = "s3://polakowo-yelp2/yelp_dataset/user.json" target_users_path = "s3://polakowo-yelp2/staging_data/users" user_df = spark.read.json(source_user_path) # Drop fields which will be outsourced and cast timestamp field users_df = user_df.drop("elite", "friends")\ .withColumn("yelping_since", F.to_timestamp("yelping_since")) users_df.write.parquet(target_users_path, mode="overwrite")
StructField("tip_amount", DoubleType(), True), \ StructField("tolls_amount", DoubleType(), True), \ StructField("improvement_surcharge", DoubleType(), True), \ StructField("total_amount", DoubleType(), True), \ StructField("congestion_surcharge", DoubleType(), True) ]) df = sc.read.format("csv").options(header='True').schema(schema).load( "../../dan606/nyctaxi/trip data/yellow_tripdata_2019-08.csv") # WARNING "WARN ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException" CAN BE IGNORED df.printSchema() # handle dates AND time df = df.withColumn( 'pickup_time', fun.to_timestamp('tpep_pickup_datetime', "yyyy-MM-dd HH:mm:ss")) df = df.withColumn('pickup_hour', fun.hour("pickup_time")) ## ML: classification with Decision Trees # Predicting the 'payment_type' value from other features of the Taxi data # https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf pred_col = ["trip_distance", "pickup_hour", "passenger_count"] resp_var = 'RateCodeID' # trip categories dffeat = df.na.drop() vector_assembler = VectorAssembler( inputCols=pred_col, outputCol='features') #Create pipeline and pass it to stages pipeline = Pipeline(stages=[vector_assembler]) df_transformed = pipeline.fit(dffeat).transform(dffeat) df_input = df_transformed.select(resp_var, 'features').withColumnRenamed(
print("Streaming DF: " + str(streamingDF.isStreaming)) print("Static DF: " + str(staticDF.isStreaming)) # COMMAND ---------- #2. Create a processing statement ("action") #real-time time difference between arrival_time and creation_time aggregated by user and device. # change format of "Arrival_Time", "Creation_Time" to timestamp from pyspark.sql.functions import col, to_timestamp, datediff streamingDF = streamingDF\ .withColumn("Arrival_Time", col("Arrival_Time").cast("long"))\ .withColumn("Creation_Time", col("Creation_Time").cast("long")) streamingDF = streamingDF\ .withColumn("Arrival_Time2",to_timestamp(streamingDF['Arrival_Time']))\ .withColumn("Creation_Time2",to_timestamp(streamingDF['Creation_Time']))\ .withColumn("time_diff", datediff(col("Arrival_Time2"), col("Creation_Time2")) ) timeDiff = streamingDF.groupBy('User', 'Device').sum('time_diff') # COMMAND ---------- timeDiff.show() # COMMAND ---------- #Set shuffle partitions to a small value to avoid creating too many shuffle partitions spark.conf.set("spark.sql.shuffle.partitions", 5) # COMMAND ----------
spark = SparkSession.builder.config( "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0").getOrCreate() df = spark.read.csv("s3a://udacity-dend/pagila/payment/payment.csv") print(df.printSchema()) print(df.show(5)) df = spark.read.csv("s3a://udacity-dend/pagila/payment/payment.csv", sep=';', inferSchema=True, header=True) print(df.printSchema()) print(df.show(5)) df_payment = df.withColumn("payment_date", F.to_timestamp('payment_date')) print(df_payment.printSchema()) print(df_payment.show(5)) df_payment = df_payment.withColumn("month", F.month('payment_date')) print(df_payment.show(5)) df_payment.createOrReplaceTempView("payment") print( spark.sql( "select month, sum(amount) as revenue from payment group by month order by revenue desc" ).show()) #fix schema paymentSchema = StructType([ StructField("payment_id", IntegerType()),
'content_size', F.expr('cast(content_size as int)')) dfClean.limit(5).toPandas() #UDF DATETIME @F.udf(StringType()) def dateTimeUDF(row): pattern_date = '[0-9]{2}/[A-Z][a-z]{2}/[0-9]{4}' pattern_time = '\:[0-9]{2}\:[0-9]{2}\:[0-9]{2}' match_date = re.search(pattern_date, row).group(0) match_time = re.search(pattern_time, row).group(0) date = match_date.replace('Jul', '07') time = match_time[1:] return date + ' ' + time #FORMAT DATE dfClean_Date = dfClean.withColumn('date_time', dateTimeUDF('date_time')) dfClean_Date.limit(5).show() dfClean_Date = dfClean_Date.withColumn( 'date_time', F.to_timestamp('date_time', 'dd/MM/yyyy HH:mm:ss')) dfClean_Date.limit(5).show() dfClean_Date.printSchema()
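# A hedged, UDF-free alternative to the regex-based dateTimeUDF above: if the raw
# date_time field follows the common Apache log layout '01/Jul/1995:00:00:01'
# (an assumption about this dataset), to_timestamp can parse the English month
# abbreviation directly with the MMM pattern:
dfClean_Date_alt = dfClean.withColumn(
    'date_time_ts', F.to_timestamp('date_time', 'dd/MMM/yyyy:HH:mm:ss'))
dfClean_Date_alt.limit(5).show()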
def run_spark_job(spark): configure_logging(spark) # TODO Create Spark Configuration # Create Spark configurations with max offset of 200 per trigger # set up correct bootstrap server and port # Ref: https://spark.apache.org/docs/2.2.0/structured-streaming-kafka-integration.html df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9092") \ .option("subscribe", "udacity.sf.police.crime.v2") \ .option("startingOffsets", "earliest") \ .option("maxOffsetsPerTrigger", 200) \ .option("stopGracefullyOnShutdown", "true") \ .load() # Show schema for the incoming resources for checks df.printSchema() # TODO extract the correct column from the kafka input resources # Take only value and convert it to String kafka_df = df.selectExpr("CAST(value AS STRING)") # kafka_df.writeStream.format("console").outputMode("append").start() service_table = kafka_df \ .select(psf.from_json(psf.col('value'), schema).alias("DF")) \ .select("DF.*") # service_table.writeStream.format("console").outputMode("append").start() # I have seen there ara some rows with null values in both, original_crime_type # and disposition, so I will filter them out. service_table_non_nulls = service_table.na.drop( subset=["original_crime_type_name", "disposition"]) # TODO select original_crime_type_name and disposition # I'm using pst.to_timestamp to convert the string timestamp into a timestamp object so we can use it later # to do watermarking and windowed aggregations. distinct_table = service_table_non_nulls.select( "original_crime_type_name", "disposition", psf.to_timestamp("call_date_time").alias( "call_date_time_ts")).distinct() # distinct_table.writeStream.format("console").outputMode("append").start() # count the number of original crime type # Nice blog about watermarking # https://databricks.com/blog/2017/05/08/event-time-aggregation-watermarking-apache-sparks-structured-streaming.html # Discarding events that arrive more than 10 minutes late. I don't want to set a huge watermark to avoid having # memory issues agg_df = distinct_table \ .select("original_crime_type_name", "disposition", "call_date_time_ts") \ .withWatermark("call_date_time_ts", "10 minutes") \ .groupBy("original_crime_type_name", psf.window("call_date_time_ts", "10 minutes", "5 minutes"), "disposition" # Including this field so I can run the aggregation later. ) \ .count() # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation # TODO write output stream query = agg_df \ .writeStream \ .queryName("Original Crime Type Count Aggregation")\ .trigger(processingTime="30 seconds") \ .format('console') \ .option("truncate", "false") \ .start() # TODO attach a ProgressReporter #query.awaitTermination() # TODO get the right radio code json path radio_code_json_filepath = "./radio_code.json" radio_code_df = spark.read. \ option("multiline", "true"). 
\ json(radio_code_json_filepath, radio_code_schema) # clean up your data so that the column names match on radio_code_df and agg_df # we will want to join on the disposition code # TODO rename disposition_code column to disposition radio_code_df = radio_code_df.withColumnRenamed("disposition_code", "disposition") radio_code_df.printSchema() # TODO join on disposition column # Nice blog on joins: https://luminousmen.com/post/introduction-to-pyspark-join-types # In this case, if we use an inner join unless the disposition is on the radio dataframe we # wont see any results, so I will be using a left join, because I want to see the data on the agg even # if there is no a matching disposition join_query = agg_df.join(radio_code_df, on='disposition', how='left') query_join = join_query \ .writeStream \ .queryName("Join with radio codes")\ .trigger(processingTime="30 seconds") \ .format('console') \ .option("truncate", "false") \ .start() query_join.awaitTermination()
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.

    Args:
        spark: the spark session
        input_data: path to the input data
        output_data: path to the output data

    Returns:
        modeled data from logs and songs json files that are written to parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # create datetime column from original timestamp column
    df = df.withColumn('ts_datetime', f.to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
        .withColumn("day", dayofmonth(col("timestamp"))) \
        .withColumn("week", weekofyear(col("timestamp"))) \
        .withColumn("month", month(col("timestamp"))) \
        .withColumn("year", year(col("timestamp"))) \
        .withColumn("weekday", dayofweek(col("timestamp"))) \
        .select(
            col("timestamp").alias("start_time"),
            col("hour"),
            col("day"),
            col("week"),
            col("month"),
            col("year"),
            col("weekday")
        )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', F.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'), 'level', 'song_id',
                'artist_id',
                col('sessionId').alias('session_id'), 'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files
    songplays_table.write.parquet(output_data + "songplays.parquet",
                                  mode="overwrite")
def run_spark_application(): # Creates session and spark context sc = SparkContext(appName="Stocks") spark = SQLContext.getOrCreate(sc) amazonDataFrame = createDataFrame(spark, "amazon.csv") amazonInfo = selectInfoFromDataFrame(amazonDataFrame, "amazon") googDataFrame = createDataFrame(spark, "google.csv") googInfo = selectInfoFromDataFrame(googDataFrame, "google") facebookDataFrame = createDataFrame(spark, "facebook.csv") facebookInfo = selectInfoFromDataFrame(facebookDataFrame, "facebook") # Collect all Date and closing into one dataFrame dataTable = amazonInfo.join( googInfo, amazonInfo.amazonDate == googInfo.googleDate).select( "amazonDate", "closeAmazon", "closeGoogle") dataTable = dataTable.join( facebookInfo, dataTable.amazonDate == facebookInfo.facebookDate).select( dataTable["amazonDate"].alias("date"), "closeAmazon", "closeGoogle", "closeFacebook") # We want to format the data into the format such that first column is all date, second column is symbols and last # column is all about the closing price of that day amazFormatted = selectInfoAsNewNames(dataTable, "amazon") faceBookFormatted = selectInfoAsNewNames(dataTable, "facebook") googFormatted = selectInfoAsNewNames(dataTable, "google") # We union the columns together, then reorder them by dates formattedDataTable = amazFormatted.union(faceBookFormatted).union( googFormatted) formattedDataTable = formattedDataTable.orderBy( formattedDataTable.date.asc()) # We construct the final DataFrame # 1: We add timestamp and price as two new columns based on date and closing Price finalDf = formattedDataTable.withColumn( "timestamp", to_timestamp(formattedDataTable.date)).withColumn( "price", formattedDataTable["closingPrice"].cast("double")) # 2: After that we drop the original price and closingPrice finalDf = finalDf.drop("date", "closingPrice").sort("timestamp") finalDf.registerTempTable("preData") finalDf.show() # We gather the necessary data to create a time series RDD minDate = finalDf.selectExpr( "min(timestamp)").collect()[0]["min(timestamp)"] maxDate = finalDf.selectExpr("max(timestamp)").alias( "timestamp").collect()[0]["max(timestamp)"] frequency = DayFrequency(1, sc) dtIndex = datetimeindex.DateTimeIndex.uniform(start=minDate, end=maxDate, freq=frequency, sc=sc) tsRdd = timeseriesrdd.time_series_rdd_from_observations( dtIndex, finalDf, "timestamp", "symbol", "price") # Last step BRO, we perform the prediction df = tsRdd.map_series(train_transform_func) # Let's avoid the zone check in python here. it is way too annoying if we care about that finalDf.show() spark.stop()
def process_log_data(spark, input_data, output_data): """ Process log_data from input_data path and save users, time and songplays tables in paquet format in output_data path Parameters: spark: SparkSession object to process data input_data: path to input data output_data: path to output data """ # get filepath to log data file log_data = input_data + 'log_data/*' # read log data file log_df = spark.read.json(log_data) # filter by actions for song plays log_df = log_df.filter('page = "NextSong"') \ .withColumn('user_id', log_df['userId'].cast('integer')) \ .withColumn('session_id', log_df['sessionId'].cast('integer')) \ .withColumnRenamed('firstName', 'first_name') \ .withColumnRenamed('lastName', 'last_name') # extract columns for users table users_table = log_df[['user_id', 'first_name', 'last_name', 'gender', 'level']] # write users table to parquet files users_table.where(users_table.user_id.isNotNull()).distinct().write.mode('overwrite').parquet(output_data + 'users/') # create timestamp column from original timestamp column time_df = log_df[['ts']] # create datetime column from original timestamp column time_df = time_df.withColumn('ts', to_timestamp(col('ts')/1000)) # extract columns to create time table time_table = time_df.withColumnRenamed('ts', 'start_time') \ .withColumn('hour', hour(col('start_time'))) \ .withColumn('day', dayofmonth(col('start_time'))) \ .withColumn('week', weekofyear(col('start_time'))) \ .withColumn('month', month(col('start_time'))) \ .withColumn('year', year(col('start_time'))) \ .withColumn('weekday', date_format(col('start_time'), 'u').cast('integer')) # write time table to parquet files partitioned by year and month time_table.distinct().write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/') # read in song data to use for songplays table song_df = spark.read.json(input_data + 'song_data/*/*/*') # extract columns from joined song and log datasets to create songplays table songplays_table = log_df.join(song_df, [log_df.song == song_df.title, log_df.artist == song_df.artist_name]) \ .selectExpr('monotonically_increasing_id() as songplay_id', \ 'to_timestamp(ts/1000) as start_time', \ 'month(to_timestamp(ts/1000)) as month', \ 'year(to_timestamp(ts/1000)) as year', \ 'user_id as user_id', \ 'level as level', \ 'song_id as song_id', \ 'artist_id as artist_id', \ 'session_id as session_id', \ 'location as location', \ 'userAgent as user_agent') # write songplays table to parquet files partitioned by year and month songplays_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data+'songplays/')
import pyspark.sql.functions as F import pyspark.sql.types as T spark = SparkSession.builder.enableHiveSupport()\ .config("spark.sql.parquet.writeLegacyFormat",True)\ .getOrCreate() df = spark.read.csv( "hdfs://hive-namenode:8020/user/sqoop/order_detail/part-m-00000", header=False) rename = { '_c0': 'order_created_timestamp', '_c1': 'status', '_c2': 'price', '_c3': 'discount', '_c4': 'id', '_c5': 'driver_id', '_c6': 'user_id', '_c7': 'restaurant_id', } df = df.toDF(*[rename[c] for c in df.columns]) df = df.withColumn('order_created_timestamp', F.to_timestamp('order_created_timestamp')) df = df.withColumn('dt', F.date_format('order_created_timestamp', "yyyyMMdd")) df = df.withColumn('price', F.col('price').cast(T.IntegerType())) df = df.withColumn('discount', F.col('discount').cast(T.FloatType())) df.write.parquet( 'hdfs://hive-namenode:8020/user/spark/transformed_order_detail', partitionBy='dt', mode='overwrite')
def summary_df(df,fn): #,max_date): # drop null ad_click values df = df.na.drop(subset=["ad_click"]) # Remove non search sessions df = df[df['ad_click']>0] # sum ad_click sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\ .agg(F.sum("ad_click").alias("ad_click")) # read revenue_by_country rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv" rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True) rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\ .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\ .withColumn("country_code", F.upper(F.col("country_code"))) # add country field and revenue table - need transform to calculate transaction-level monetary value tbl = sum_search_clients_daily.join(rev_by_country, sum_search_clients_daily.country == rev_by_country.country_code,how='left_outer') spec = Window.partitionBy("client_id","country","submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc()) # NOTE partition includes country because client may change country over time no_country=( tbl .where(F.isnull(F.col("yyyymm"))) .withColumn("rev_per_search_float", F.lit(.005)) ) has_country=( tbl .na.drop(subset=["yyyymm"]) .where("yyyyMM_timestamp <= activity_date") .withColumn('rank', F.row_number().over(spec)) .where("rank = 1") .drop('rank') ) tbl2=(no_country.union(has_country)) # drop first purchase to calculate revenue spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc()) # earliest date has row #1 search_rev = (tbl2 .withColumn("rank", F.row_number().over(spec2)) .where("rank > 1") ).groupBy("client_id").agg(F.avg(F.col('rev_per_search_float')*F.col('ad_click')).alias("monetary_value")) # compute the final dataset for the BG/NBD model dataset = ( tbl2 .groupBy("client_id") .agg(F.datediff(F.max('activity_date'),F.min("activity_date")).alias("recency"), (F.countDistinct('activity_date')-1).alias("frequency"), (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),F.min("activity_date"))).alias("T"), F.sum("ad_click").alias("historical_searches"), F.sum(F.col('rev_per_search_float')*F.col('ad_click')).alias("historical_clv")) .join(search_rev, "client_id", how="left") .where("frequency >= 0 AND recency >= 0 AND T >= 0") .select("client_id", (F.crc32("client_id") % 100).alias("sample_id"), "frequency","recency","T","monetary_value","historical_searches","historical_clv") ).fillna(0, subset=['monetary_value']) # anonymize client_id dataset = dataset.withColumn('client_id',sha1(dataset.client_id)) # write dataset recency, freq, age, revenue table per client #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn) duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"]) duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
def drop_minutes(df, colName): fmt = "yyyy-MM-dd HH:00:00" return df.withColumn( colName, to_timestamp(date_format(col(colName), fmt), "yyyy-MM-dd HH:mm:ss"))
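# A hedged equivalent of drop_minutes using date_trunc, which zeroes out the
# minutes and seconds without the string round-trip (assumes Spark 2.3+):
from pyspark.sql.functions import col, date_trunc

def drop_minutes_trunc(df, colName):
    return df.withColumn(colName, date_trunc("hour", col(colName)))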
def main():
    """
    Runs ETL on the data from the original database and loads the data we need into the DWH.
    """
    spark = SparkSession \
        .builder \
        .appName('DBAnalysis') \
        .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar') \
        .getOrCreate()

    properties = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres:5432/postgres',
        'user': '******',
        'password': '******',
        'dbtable': ' spotify_details',
    }
    properties_dwh = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres_dwh:5432/postgres',
        'user': '******',
        'password': '******'
    }

    df2 = spark.read \
        .format('jdbc') \
        .option('driver', properties['driver']) \
        .option('url', properties['url']) \
        .option('user', properties['user']) \
        .option('password', properties['password']) \
        .option('dbtable', properties['dbtable']) \
        .load()

    count_tracks_distribution = df2.groupby('id_playlist')\
        .count()

    # Create the year_month column
    df3 = df2.withColumn(
        'year_month',
        date_format(to_timestamp(df2.timestamp, "yyyy-MM-dd'T'HH:mm:ssXXX"),
                    "yyyy-MM").alias('year_month'))

    # Group by playlist and count the occurrences
    df4 = df3.groupby('id_playlist', 'year_month').count()

    # To assign a month to each playlist, the month with the most song "additions" is chosen
    # For each playlist keep only the row with the highest per-month count
    w = Window.partitionBy('id_playlist')
    df5 = df4.withColumn('max', f.max('count').over(w))\
        .where(f.col('count') == f.col('max'))\
        .drop('max', 'count')

    month_distribution = df5\
        .where(f.col('year_month')>="2020-01")\
        .groupby('year_month')\
        .count()

    # The complete df carries the reference-month information assigned to each playlist
    spotify_complete = df2.join(df5, on=['id_playlist'], how='left')

    df_complete2 = spotify_complete.groupBy("id_playlist", 'name_playlist', 'year_month')\
        .agg(f.mean('danceability'),f.stddev_pop('danceability'),f.mean('energy'),f.stddev_pop('energy'),f.mean('valence'),f.stddev_pop('valence'))\
        .sort('year_month', ascending=True)
    newColumns = [
        "id_playlist", "name_playlist", "year_month", "avgdanceability",
        "stdddanceability", "avgenergy", "stddenergy", "avgvalence",
        "stddvalence"
    ]
    df_complete2 = df_complete2.toDF(*newColumns)

    df_complete3 = df_complete2.groupBy('year_month')\
        .agg(f.mean('avgdanceability'),f.mean('stdddanceability'),f.mean('avgenergy'),f.mean('stddenergy'),f.mean('avgvalence'),f.mean('stddvalence'))\
        .sort('year_month', ascending=True)
    newColumns = [
        "timestamp", "mean_danceability", "stdev_danceability", "mean_energy",
        "stdev_energy", "mean_valence", "stdev_valence"
    ]
    audiofeatures_stat = df_complete3.toDF(*newColumns)

    audiofeatures_stat.write.jdbc(url=properties_dwh['url'],
                                  table='audiofeatures_stat',
                                  mode='overwrite',
                                  properties=properties_dwh)
    spotify_complete.write.jdbc(url=properties_dwh['url'],
                                table='spotify_complete',
                                mode='overwrite',
                                properties=properties_dwh)
    month_distribution.write.jdbc(url=properties_dwh['url'],
                                  table='month_distribution',
                                  mode='overwrite',
                                  properties=properties_dwh)
    count_tracks_distribution.write.jdbc(url=properties_dwh['url'],
                                         table='count_tracks_distribution',
                                         mode='overwrite',
                                         properties=properties_dwh)
from pyspark.sql import Row import operator from pyspark.sql.window import Window conf = SparkConf().setAppName("Ex2").setMaster("local") sc = SparkContext(conf=conf) spark = SparkSession(sc) df = sc.textFile( '/host/HieldshiemMasters/Semester1/DistributedDataAnalytics/Exercises/Ex9_Solution/ml-10M100K/tags.dat' ).map(lambda x: x.split("::")) df = df.toDF(['UserID', 'MovieID', 'Tag', 'Timestamp']) df_Update = df.withColumn('time_datestring', func.from_unixtime('timestamp')) df_Update = df_Update.withColumn( 'time_date', to_timestamp(df_Update.time_datestring, 'yyyy-MM-dd HH:mm:ss')) #print(df_Update) #df_Update.show() #===== get all the time stamps for each user ======================== #test=df_Update.groupBy(['UserID']) new = df_Update.groupBy(['UserID']).agg(collect_list("time_date")) #test.show() #==========sort time stamps for each user=========================== #func=udf(lambda x:sorted(x.tolist())) def sorter(l): res = sorted(l) return [item for item in res]
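# A hedged alternative to the Python sorter UDF above: sort_array can order the
# collected timestamps without a UDF (assumes the same df_Update as above and that
# collect_list is already imported from pyspark.sql.functions):
from pyspark.sql.functions import sort_array

sorted_times = df_Update.groupBy('UserID').agg(
    sort_array(collect_list('time_date')).alias('sorted_time_dates'))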
def rename_cols(df, mapping): for old, new in mapping.items(): df = df.withColumnRenamed(old, new) return df # COMMAND ---------- match = spark.read.json('dbfs:/mnt/lol/landing/europe/match/') team = spark.read.json('dbfs:/mnt/lol/landing/europe/team/') participant = spark.read.json('dbfs:/mnt/lol/landing/europe/participants/') match = match\ .withColumn('gameCreation', F.col('gameCreation') / 1000)\ .withColumn('gameCreation', F.to_timestamp('gameCreation'))\ .withColumn('gameStartTimestamp', F.col('gameStartTimestamp') / 1000)\ .withColumn('gameStartTimestamp', F.to_timestamp('gameStartTimestamp')) # COMMAND ---------- sum_cols = { 'kills': 'kills', 'assists': 'assists', 'deaths': 'deaths', 'doubleKills': 'double_kills', 'tripleKills': 'triple_kills', 'quadraKills': 'quadra_kills', 'pentaKills': 'penta_kills', 'unkilled': 'unkilled', 'goldEarned': 'gold',
'l_traffic_dl_pktuuloss_tot_qci_8', 'l_e_rab_succest', 'l_thrp_time_cell_ul_highprecision', 'l_e_rab_initsuccest', 'l_e_rab_abnormrel_other_voip', 'l_traffic_dl_pktdelay_num' ] currentTime = datetime.now() lags = currentTime - timedelta(minutes=120) #Change this based on the time zone difference timdedifference = 150 Full_DF = spark.read.format("org.apache.spark.sql.cassandra")\ .options(table='huawei_4g', keyspace='common')\ .load()\ .select(*allCols)\ .withColumn('ossresulttime',func.to_timestamp("ossresulttime", "yyyy-MM-dd HH:mm"))\ .withColumn('ossdate',func.to_timestamp("ossdate", "yyyy-MM-dd"))\ .withColumn('osshour',col('osshour').cast('float'))\ .filter(col("ossresulttime") > datetime.now()+timedelta(minutes=timdedifference)) combined_DF = Full_DF.select(*impCols) maxTime = combined_DF.select(func.max('ossresulttime')).first() combined_DF = combined_DF.filter( combined_DF['ossresulttime'] >= maxTime[0] - timedelta(minutes=15)) combined_DF = combined_DF.na.fill({ 'l_traffic_ul_pktloss_loss_qci_5': 9.32716621103459e-06, 'l_e_rab_attest_qci_2': 5.48632216489652e-05, 'l_thrp_time_dl_qci_2':
print('# Structure loaded')
print('> Loading preprocessing models')

# Preprocessing
columns = dataset.columns[5:]

dataset = dataset.na.fill("N/A")

# Cast strings to the appropriate types
dataset = dataset.withColumn(
    "timestamp",
    F.from_unixtime(F.col("timestamp"), "yyyy-MM-dd'T'HH:mm:ssXXX"))
dataset = dataset.withColumn(
    "time", F.to_timestamp(F.col("Time"), 'MM/dd/yyyy hh:mm:ss a'))
dataset = dataset.withColumn(
    "time", F.date_format(F.col("Time"), "yyyy-MM-dd'T'HH:mm:ssXXX"))
dataset = dataset.withColumn('First', F.col('First').cast(BooleanType()))
dataset = dataset.withColumn('Sequencenum',
                             F.col('Sequencenum').cast(IntegerType()))

# Create the hour column
dataset = dataset.withColumn("hour", F.hour(F.col("time")))

# Normalize the hour column
dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0) * 6)
def process_log_data(spark, input_data, output_data): """ - Processes JSON log files stored in input location - Transforms dimension tables: users, time - Transforms fact tables: songplays - Saves output to parquet files Arguments: spark -- instatiated object for spark session input_data (str) -- path to folder containing log files to be processed output_data (str) -- output path for final parquet files """ print("Log processing : Started") # get filepath to log data file log_data = input_data + 'log_data/' # read log data file df = spark.read.option("recursiveFileLookup", "true").json(log_data) # filter by actions for song plays df = df.filter(col("page") == "NextSong") # extract columns for users table users_table = df.selectExpr("userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level").distinct() # write users table to parquet files users_table.write.parquet(output_data + "users.parquet", mode="overwrite") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime( "%Y-%m-%d %H:%M:%S")) df = df.withColumn("timestamp", to_timestamp(get_timestamp("ts"))) # create datetime column from original timestamp column get_datetime = udf( lambda x: datetime.fromtimestamp(x / 1000).strftime("%Y-%m-%d")) df = df.withColumn("date", to_date(get_datetime("ts"))) # extract columns to create time table df.createOrReplaceTempView("timetable") time_table = spark.sql(""" SELECT DISTINCT timestamp AS start_time, HOUR(timestamp) AS hour, DAY(timestamp) AS day, WEEKOFYEAR(timestamp) AS week, MONTH(timestamp) AS month, YEAR(timestamp) AS year, DAYOFWEEK(timestamp) AS weekday FROM timetable """) # write time table to parquet files partitioned by year and month time_table.write.partitionBy("year", "month").parquet(output_data + "time.parquet", mode="overwrite") # read in song data to use for songplays table song_df = spark.read.parquet(output_data + "songs.parquet") artist_df = spark.read.parquet(output_data + "artists.parquet").selectExpr( "artist_id as ref_artist", "name") song_df = song_df.join(artist_df, song_df.artist_id == artist_df.ref_artist) if song_df.count() > 0: # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df , (df.artist == song_df.name) & (df.song == song_df.title) , how='left')\ .selectExpr("concat_ws('_', userId, ts) as songplay_id", "timestamp as start_time", "userId as user_id", \ "level", "song_id", "artist_id", "sessionId as session_id", "location", "userAgent as user_agent" ) # write songplays table to parquet files partitioned by year and month songplays_table.withColumn("year", year("start_time")).withColumn("month", month("start_time"))\ .write.partitionBy("year", "month")\ .parquet(output_data + "songplays.parquet", mode="overwrite") print("Log processing : Ended")
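# A hedged, UDF-free variant of the timestamp/date derivation used above (assumes
# the same df with an epoch-millisecond `ts` column and that to_timestamp, to_date,
# from_unixtime and col are imported from pyspark.sql.functions):
df_alt = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000))) \
           .withColumn("date", to_date(col("timestamp")))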
StructField("machineType", StringType(), True), StructField("deviceId", StringType(), True), StructField("type", StringType(), True), StructField("status", StringType(), True), StructField("timestamp", StringType(), True), ])) dfParsedContents = dfAvroInput.withColumn("Body", from_json(col("Body").cast("string"), contentSchema)) dfExplodedParsedContents = dfParsedContents.withColumn("Body", explode("Body").alias("key")) display(dfExplodedParsedContents) # COMMAND ---------- # DBTITLE 1,Convert EnqueuedTime and unwrap body contents dfFormatedTime = dfExplodedParsedContents.withColumn("EnqueuedTimeUtc", to_timestamp("EnqueuedTimeUtc", 'M/d/yyyy h:mm:ss a')) dfUnwrappedContents = dfFormatedTime.withColumn("EventTimestamp", to_timestamp(col("Body").getItem("timestamp"))) dfUnwrappedContents = dfUnwrappedContents.withColumn("FactoryArea", col("Body").getItem("factoryArea")) dfUnwrappedContents = dfUnwrappedContents.withColumn("MachineType", col("Body").getItem("machineType")) dfUnwrappedContents = dfUnwrappedContents.withColumn("DeviceID", col("Body").getItem("deviceId")) dfUnwrappedContents = dfUnwrappedContents.withColumn("type", col("Body").getItem("type")) dfUnwrappedContents = dfUnwrappedContents.withColumn("Status", col("Body").getItem("status")) dfUnwrappedContents = dfUnwrappedContents.drop("Body") display(dfUnwrappedContents) # COMMAND ---------- # DBTITLE 1,Keep only event information dfAlarmEvents = dfUnwrappedContents.select("EventTimestamp", "FactoryArea", "MachineType", "DeviceID", "type", "Status")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" os.environ["SPARK_HOME"] = "/content/spark-2.3.2-bin-hadoop2.7" tweet_raw = (spark.read.format("csv").options(header="true").load("20200312_Coronavirus_Tweets_Subset.CSV")) display(tweet_raw) tweet_raw.show(10, False) #### CLEAN DATA #### # Initially filter tweets in English & create new filtered DataFrame tweet_filter = tweet_raw.select("*", F.when(tweet_raw.lang == 'en', 'TRUE').alias('eng_true')) tweet_filter = tweet_filter.filter("eng_true == 'TRUE'") # Fix Date Structure tweet_filter = tweet_filter.withColumn('created_at', regexp_replace('created_at', 'T', ' ')) tweet_filter = tweet_filter.withColumn('created_at', regexp_replace('created_at', 'Z', '')) # Convert to Timestamp tweet_filter = tweet_filter.withColumn('dt',to_timestamp(tweet_filter.created_at, 'yyyy-MM-dd HH:mm:ss')) # Drop Unused Columns tweet_filter = tweet_filter.drop('created_at','reply_to_status_id','reply_to_user_id','reply_to_screen_name','place_type','account_lang') # Define Columns for Integer Transformation cols = spark.createDataFrame([('status_id',1),('user_id',2),('favourites_count',3),('retweet_count',4),('followers_count',5),('friends_count',6)]) cols_col = cols.select("_1") tweet_filter.show(10, False) tweet_raw.printSchema() type(tweet_raw) # create a spark session spark = SparkSession.builder\ .master("local")\
# datediff('2016-01-01', '2017-01-01') # FROM dateTable from pyspark.sql.functions import to_date dateFormat = "yyyy-dd-MM" cleanDateDF = spark.range(1).select( to_date(lit("2017-12-11"), dateFormat).alias("date"), to_date(lit("2017-20-12"), dateFormat).alias("date2")) cleanDateDF.createOrReplaceTempView("dateTable2") # -- in SQL # SELECT to_date(date, 'yyyy-dd-MM'), to_date(date2, 'yyyy-dd-MM'), to_date(date) # FROM dateTable2 # in Python from pyspark.sql.functions import to_timestamp cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show() # -- in SQL # SELECT to_timestamp(date, 'yyyy-dd-MM'), to_timestamp(date2, 'yyyy-dd-MM') # FROM dateTable2 # SELECT cast(to_date("2017-01-01", "yyyy-dd-MM") as timestamp) # Spark includes a function to allow you to select the first non-null value from a set of columns by using # the coalesce function. from pyspark.sql.functions import coalesce df.select(coalesce(col("Description"), col("CustomerId"))).show() # deal with null # -- in SQL # SELECT # ifnull(null, 'return_value'),
def process_log_data(spark, input_data, output_data):
    """
    This function takes the log data from Udacity's S3 input location and processes it.
    This is done by extracting the user, time and songplay tables and then loading them
    back to the S3 bucket I've created in AWS.

    Parameters:
            spark       : Spark Session
            input_data  : The S3 bucket location of log_data, think 'input'
            output_data : The S3 bucket location for the output tables, think 'output'
    """
    #Using print statement to understand where in spark statement we are
    print("\n Taking in log data as variable from S3's input location....")

    # get full filepath to log data file
    #log_data = input_data + 'log_data/*/*/*.json'
    #utilizing exact folder set of data set to speed up execution in WorkSpace (please use commented out log_data variable above to run full etl with wildcards)
    log_data = input_data + 'log_data/2018/11/*.json'

    #Using print statement to understand where in spark statement we are
    print("\n Defining log Schema....")

    log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()),
                         SFld("firstName", Str()), SFld("gender", Str()),
                         SFld("itemInSession", Lng()), SFld("lastName", Str()),
                         SFld("length", Dbl()), SFld("level", Str()),
                         SFld("location", Str()), SFld("method", Str()),
                         SFld("page", Str()), SFld("registration", Dbl()),
                         SFld("sessionId", Lng()), SFld("song", Str()),
                         SFld("status", Str()), SFld("ts", Str()),
                         SFld("userAgent", Str()), SFld("userId", Str())])

    #Using print statement to understand where in spark statement we are
    print("\n Reading log data JSON files from S3's input location....")

    # read log data file
    df = spark.read.json(log_data, schema=log_schema, mode='PERMISSIVE',
                         columnNameOfCorruptRecord='corruptRecord').drop_duplicates()

    #Using print statement to understand where in spark statement we are
    print("\n Filtering page by NextSong....")

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').drop_duplicates()

    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for users data creation....")

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').where(df.userId.isNotNull()).drop_duplicates()

    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for users table....")

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table/')

    #Using print statement to understand where in spark statement we are
    print("\n Creating timeStamp variable....")

    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp",
                       to_timestamp(from_unixtime(col("ts") / 1000)))

    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for time data creation....")

    # extract columns to create time table
    time_table = (
        df.select("timestamp").withColumn("hour", hour("timestamp")).withColumn("day", dayofmonth("timestamp")) \
        .withColumn("week", weekofyear("timestamp")).withColumn("weekday", dayofweek("timestamp")).withColumn("weekdayName", date_format("timestamp", "E")) \
        .withColumn("month", month("timestamp")).withColumn("year", year("timestamp")).drop_duplicates()
    )

    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for time table and partitioned by year and month....")

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/')
    #Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()).join(song_df, song_df.title == df.song) \
        .select('songplayId', col('timestamp').alias('start_time'), col('userId'), 'level', 'song_id', 'artist_id', col('sessionId'), 'location', col('userAgent'))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner") \
        .select("songplayId", songplays_table.start_time, "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for songplay table and partitioned by year and month....")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
def run_spark_job(spark):
    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka")\
        .option("kafka.bootstrap.servers", "localhost:9092")\
        .option("subscribe", "org.sf.crime.calls")\
        .option("maxOffsetsPerTrigger", 200)\
        .option("startingOffsets", "earliest") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value as STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # TODO select original_crime_type_name and disposition
    distinct_table = service_table\
        .select(
            psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")
        )

    # count the number of original crime type
    agg_df = distinct_table\
        .select(
            distinct_table.call_date_time,
            distinct_table.original_crime_type_name,
            distinct_table.disposition)\
        .withWatermark("call_date_time", "60 minutes")\
        .groupBy(
            psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"),
            psf.col("original_crime_type_name"),
            psf.col("disposition")  # keep disposition so the later join on it can resolve
        )\
        .count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # TODO write output stream
    query = agg_df \
        .writeStream\
        .format("console")\
        .outputMode("Complete")\
        .start()

    # TODO attach a ProgressReporter
    query.awaitTermination()

    # TODO get the right radio code json path
    radio_code_json_filepath = "radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # TODO rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    # TODO join on disposition column
    join_query = agg_df\
        .join(radio_code_df, "disposition")\
        .writeStream\
        .format("console")\
        .queryName("join_query")\
        .start()

    join_query.awaitTermination()
spark = SparkSession.builder.appName('Packt').getOrCreate() # get the raw data from a local socket raw_stream = spark.readStream.format('socket').option('host', 'localhost').option( 'port', 1234).load() # set up the Twitter date-time format tweet_datetime_format = 'EEE MMM dd HH:mm:ss ZZZZ yyyy' # parse the json to get separate fields tweet_stream = raw_stream.select(from_json('value', schema).alias('tweet')) # create a timestamp by parsing the created_at field timed_stream = tweet_stream.select( to_timestamp('tweet.created_at', tweet_datetime_format).alias('timestamp'), 'tweet.text') # To display the tweets without windowing: # query = timed_stream.writeStream.outputMode('append').format('console').start() # query.awaitTermination() # create a sliding window of 1 minute with a slide of 10 seconds, with a 'slack time' of 2 seconds windowed = timed_stream \ .withWatermark('timestamp', '2 seconds') \ .groupBy(window('timestamp', '1 minute', '10 seconds')) # count the tweets per window counts_per_window = windowed.count().orderBy('window') # output the windows and counts to the console
def process_log_data(spark, input_data, output_data): """ Uses a given Spark session to process log data. :param input_data: Path to a folder or S3 bucket, where the input data lives. :param output_data: Path to a folder or S3 bucket, where the output should be stored. """ # get filepath to log data file log_data = os.path.join(input_data, 'log_data', '*.json') # read log data file logger.debug(f"Read data from {log_data}") df = spark.read.json(log_data) logger.debug('Auto detected JSON schema') df.printSchema() # filter by actions for song plays df = df.filter(df.page == 'NextSong') # extract columns for users table # Columns: user_id, first_name, last_name, gender, level artists_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']] users_table = df.selectExpr([ "userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level" ]).dropDuplicates() # write users table to parquet file users_table_fp = os.path.join(output_data, 'dim_user.parquet') logger.debug(f"Write users table: {users_table_fp}") users_table.write.parquet(users_table_fp, mode='overwrite') # create timestamp column from original timestamp column df = df.withColumn("timestamp", F.to_timestamp(df.ts/1000)) # time is in millisecond # create datetime column from original timestamp column df = df.withColumn("datetime", F.to_date(df.timestamp)) # extract columns to create time table # Columns: start_time, hour, day, week, month, year, weekday time_table = df.selectExpr([ "timestamp as start_time", "hour(datetime) as hour", "dayofmonth(datetime) as day", "weekofyear(datetime) as week", "month(datetime) as month", "year(datetime) as year", "dayofweek(datetime) as weekday", ]) # write time table to parquet files partitioned by year and month time_table_fp = os.path.join(output_data, 'dim_time.parquet') logger.debug(f"Write time table: {time_table_fp}") time_table.write.parquet(time_table_fp, mode='overwrite', partitionBy=["year", "month"]) # read in song data to use for songplays table song_table_fp = os.path.join(output_data, 'dim_song.parquet') song_df = spark.read.parquet(song_table_fp) song_df = song_df\ .selectExpr([ 'song_id as song_song_id', 'artist_id as song_artist_id', 'title as song_title' ]) # read in artist data to use for songplays table artist_table_fp = os.path.join(output_data, 'dim_artist.parquet') artist_df = spark.read.parquet(artist_table_fp) artist_df = artist_df.selectExpr([ 'artist_id as artist_artist_id', 'name as artist_name' ]) # extract columns from joined song and log datasets to create songplays table # Columns: songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent songplays_table = df.selectExpr([ 'timestamp as start_time', 'userId as user_id', 'level', 'song', 'artist', 'sessionId as session_id', "location", 'userAgent as user_agent' ])\ .join(song_df, df.song==song_df.song_title, 'left_outer') \ .join(artist_df, df.artist==artist_df.artist_name, 'left_outer') \ .selectExpr([ "start_time", "user_id", "level", "song_song_id as song_id", "artist_artist_id as artist_id", "session_id", "location", "user_agent", "year(start_time) as year", "month(start_time) as month", ]) \ .dropDuplicates() \ .withColumn('songplay_id', F.monotonically_increasing_id()) # write songplays table to parquet files partitioned by year and month songplays_table_fp = os.path.join(output_data, 'fact_songplay.parquet') logger.debug(f"Write songplay table: {songplays_table_fp}") songplays_table.write.parquet(songplays_table_fp, mode='overwrite', 
partitionBy=['year', 'month'])
    else:
        dados_json = get(url, params={'limit': limit, 'offset': offset}).json()
        dados_json = dados_json['result']
        dados_json = dados_json['records']

    # Create a dataframe from the dictionary fetched above
    df_json = spark.createDataFrame(dados_json, schema)

    # Append the dataframe's rows to the previously created empty dataframe
    df_empty = df_empty.unionAll(df_json)

    # Increment the iteration counter
    interaction += 1

    # Increment the offset
    offset = limit * interaction

# Convert the date fields to timestamp
df = df_empty\
    .withColumn('LicenseAddDtTm', to_timestamp('LicenseAddDtTm', 'yyyy-MM-dd HH:mm:ss'))

# Repartition the dataframe so that only 10 files are written
df = df.repartition(10)

# Write the data to HDFS
df \
    .write\
    .mode("overwrite")\
    .option("path", hdfs + 'boston_active_food_establishment')\
    .saveAsTable("boston_active_food_establishment")

#df\
#    .write\
#    .mode("overwrite")\
#    .option("path", local + 'boston_active_food_establishment')\
def main(args=None): """ Pre process raw input data and save in cleansed state to /Processed directory """ parser = build_parser() args = parser.parse_args(args) #ensure only one sc and spark instance is running global MySpark MySpark = MySpark or MyPySpark(master='local[3]') #make schema int_fields_l = [] str_fields_l = [ "copyright", "description", "end", "f", "geography", "iso3166", "lat", "latlon", "lon", "name", "source", "start", "units", "last_updated" ] str_fields_no_null_l = [ "series_id", ] # timestamp_fields_l = ["last_updated",] str_fields_schema_l = [ StructField(field_name, StringType(), nullable=True) for field_name in str_fields_l ] str_fields_no_null_schema_l = [ StructField(field_name, StringType(), nullable=False) for field_name in str_fields_no_null_l ] array_fields_l = [ StructField("data", ArrayType(ArrayType(StringType()))), ] # timestamp_schema_l = [StructField(field_name, StringType(), nullable=True) for field_name in timestamp_fields_l] electricity_schema = StructType(str_fields_schema_l + str_fields_no_null_schema_l + array_fields_l) #limit for testing electricity_raw_df = MySpark\ .spark\ .read\ .json('/EIAElec/ELEC.json', schema = electricity_schema) electricity_raw_monthly_df = electricity_raw_df\ .filter(pysF.col("f") == 'M') electricity_fact_df = MyPySpark.eia_data_explode( electricity_raw_monthly_df\ .filter(pysF.col("series_id").isNotNull())\ .select( "series_id", "data")) electricity_base_dim_df = electricity_raw_monthly_df\ .drop("data", "latlon")\ .filter(pysF.col("series_id").isNotNull())\ .withColumn( "last_updated", pysF.to_timestamp("last_updated", "yyyy-MM-dd'T'HH:mm:ssXXX"))\ .withColumn( "lat", pysF.col("lat").cast(pysT.DoubleType()) )\ .withColumn( "lon", pysF.col("lat").cast(pysT.DoubleType()) )\ .withColumn( "start", pysF.col("start").cast(pysT.IntegerType()) )\ .withColumn( "end", pysF.col("end").cast(pysT.IntegerType()) )\ .withColumn( "split_name", pysF.split("name", ":") ) power_rows_l = [ "^ELEC\.GEN\.", "^ELEC\.CONS_TOT.", "^ELEC\.CONS_TOT_BTU\.", "^ELEC\.CONS_EG\.", "^ELEC\.CONS_EG_BTU\.", "^ELEC\.CONS_UTO\.", "^ELEC\.CONS_UTO_BTU\." ] plant_level_rows_l = [ "^ELEC\.PLANT\.GEN\.", "^ELEC\.PLANT\.CONS_TOT.", "^ELEC\.PLANT\.CONS_TOT_BTU\.", "^ELEC\.PLANT\.CONS_EG\.", "^ELEC\.PLANT\.CONS_EG_BTU\.", "^ELEC\.PLANT\.CONS_UTO\.", "^ELEC\.PLANT\.CONS_UTO_BTU\.", "^ELEC\.PLANT\.AVG_HEAT\." ] retail_rows_l = [ "^ELEC\.SALES\.", "^ELEC\.REV\.", "^ELEC\.PRICE\.", "^ELEC\.CUSTOMERS\." ] fossil_fuel_rows_l = [ "^ELEC\.STOCKS\.", "^ELEC\.RECEIPTS\.", "^ELEC\.RECEIPTS_BTU\.", "^ELEC\.COST\.", "^ELEC\.COST_BTU\.", ] fossil_fuel_quality_rows_l = [ "^ELEC\.SULFUR_CONTENT\.", "^ELEC\.ASH_CONTENT\." 
] electricity_power_dim_df = electricity_base_dim_df\ .filter( pysF.col("series_id").rlike("|".join(power_rows_l)) )\ .withColumn( "value_type", pysF.trim(pysF.col("split_name").getItem(0)) )\ .withColumn( "fuel_type", pysF.trim(pysF.col("split_name").getItem(1)) )\ .withColumn( "region", pysF.trim(pysF.col("split_name").getItem(2)) )\ .withColumn( "sector", pysF.trim(pysF.col("split_name").getItem(3)) )\ .withColumn( "frequency", pysF.trim(pysF.col("split_name").getItem(4)) ) electricity_plant_level_dim_df = electricity_base_dim_df\ .filter( pysF.col("series_id").rlike("|".join(plant_level_rows_l)) )\ .withColumn( "value_type", pysF.trim(pysF.col("split_name").getItem(0)) )\ .withColumn( "plant_name", pysF.trim(pysF.col("split_name").getItem(1)) )\ .withColumn( "fuel_type", pysF.trim(pysF.col("split_name").getItem(2)) )\ .withColumn( "engine_type", pysF.trim(pysF.col("split_name").getItem(3)) )\ .withColumn( "frequency", pysF.trim(pysF.col("split_name").getItem(4)) )\ .withColumn( "plant_id", pysF.regexp_extract(pysF.col("series_id"), r".*\.(\d+)-.*", 1) ) electricity_retail_dim_df = electricity_base_dim_df\ .filter( pysF.col("series_id").rlike("|".join(retail_rows_l)) )\ .withColumn( "value_type", pysF.trim(pysF.col("split_name").getItem(0)) )\ .withColumn( "region", pysF.trim(pysF.col("split_name").getItem(1)) )\ .withColumn( "sector", pysF.trim(pysF.col("split_name").getItem(2)) )\ .withColumn( "frequency", pysF.trim(pysF.col("split_name").getItem(3)) ) electricity_fossil_fuel_dim_df = electricity_base_dim_df\ .filter( pysF.col("series_id").rlike("|".join(fossil_fuel_rows_l)) )\ .withColumn( "value_type", pysF.trim(pysF.col("split_name").getItem(0)) )\ .withColumn( "fuel_type", pysF.trim(pysF.col("split_name").getItem(1)) )\ .withColumn( "region", pysF.trim(pysF.col("split_name").getItem(2)) )\ .withColumn( "sector", pysF.trim(pysF.col("split_name").getItem(3)) )\ .withColumn( "frequency", pysF.trim(pysF.col("split_name").getItem(4)) ) electricity_fossil_fuel_quality_dim_df = electricity_base_dim_df\ .filter( pysF.col("series_id").rlike("|".join(fossil_fuel_quality_rows_l)) )\ .withColumn( "value_type", pysF.trim(pysF.col("split_name").getItem(0)) )\ .withColumn( "quality_type", pysF.trim(pysF.col("split_name").getItem(1)) )\ .withColumn( "fuel_type", pysF.trim(pysF.col("split_name").getItem(2)) )\ .withColumn( "region", pysF.trim(pysF.col("split_name").getItem(3)) )\ .withColumn( "sector", pysF.trim(pysF.col("split_name").getItem(4)) )\ .withColumn( "frequency", pysF.trim(pysF.col("split_name").getItem(5)) )\ #Catch-all for any missed dimensions electricity_missed_dim_df = electricity_base_dim_df\ .filter( ~pysF.col("series_id").rlike("|".join( power_rows_l + plant_level_rows_l + retail_rows_l + fossil_fuel_rows_l + fossil_fuel_quality_rows_l)) ) # save plans to ExplainFiles, write to hdfs, and sync df_l = [ { "df": electricity_fact_df, "description": "preprocess_electricity_facts", "path": "/Processed/ElectricityFactDF" }, { "df": electricity_power_dim_df, "description": "preprocess_electricity_power_dimensions", "path": "/Processed/ElectricityPowerDimDF" }, { "df": electricity_plant_level_dim_df, "description": "preprocess_electricity_plant_level_dimensions", "path": "/Processed/ElectricityPlantLevelDimDF" }, { "df": electricity_retail_dim_df, "description": "preprocess_electricity_retail_dimensions", "path": "/Processed/ElectricityRetailDimDF" }, { "df": electricity_fossil_fuel_dim_df, "description": "preprocess_electricity_fossil_fuel_dimensions", "path": 
"/Processed/ElectricityFossilFuelDimDF" }, { "df": electricity_fossil_fuel_quality_dim_df, "description": "preprocess_electricity_fossil_fuel_quality_dimensions", "path": "/Processed/ElectricityFossilFuelQualityDimDF" }, { "df": electricity_missed_dim_df, "description": "preprocess_electricity_missed_dimensions", "path": "/Processed/ElectricityMissedDimDF" }, ] for df_d in df_l: #Common formatting if "split_name" in df_d["df"].columns: df_d["df"] = df_d["df"].drop("split_name") df_d["df"] = df_d["df"]\ .replace( { "":None, "null":None }) MySpark.eia_output_df(df_d=df_d, display_output=args.display_test, s3_backup=args.s3)
# MAGIC %sql # MAGIC SET spark.sql.legacy.timeParserPolicy = LEGACY # COMMAND ---------- # MAGIC %md # MAGIC # CORRECT # COMMAND ---------- from pyspark.sql.functions import to_timestamp display( df_date_tmp.withColumn( 'created_at_date', to_timestamp(df_date_tmp.created_at_split, 'dd MMM yyyy HH:mm:ss')).select('created_at_split', 'created_at_date')) # COMMAND ---------- from pyspark.sql.functions import date_format display( df_date.withColumn('created_at_date', date_format(df_date.created_at_split, 'dd')).select('created_at_split', 'created_at_date')) # COMMAND ---------- # MAGIC %sql # MAGIC SELECT date_format('28 Jul 2020 01:29:45')
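# COMMAND ----------

# The %sql cell above enables the legacy parser; a hedged Python equivalent for the
# same Spark 3.x configuration key:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")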
# COMMAND ---------- from pyspark.sql.functions import to_date dateFormat = "yyyy-dd-MM" cleanDateDF = spark.range(1).select( to_date(lit("2017-12-11"), dateFormat).alias("date"), to_date(lit("2017-20-12"), dateFormat).alias("date2")) cleanDateDF.createOrReplaceTempView("dateTable2") # COMMAND ---------- from pyspark.sql.functions import to_timestamp cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show() # COMMAND ---------- from pyspark.sql.functions import coalesce df.select(coalesce(col("Description"), col("CustomerId"))).show() # COMMAND ---------- df.na.drop("all", subset=["StockCode", "InvoiceNo"]) # COMMAND ----------
def main(): spark = SparkSession \ .builder \ .appName("spark_streaming_app") \ .getOrCreate() df = (spark.readStream.format('kafka').option( 'kafka.bootstrap.servers', '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092').option( 'subscribe', 'stream_data').option('startingOffsets', 'earliest').load()) df = df.selectExpr('CAST(value as STRING)') df = df.select(from_json(col('value'), data_schema).alias('df')) func1 = udf(lambda x: states[x.upper()], StringType()) df = df.filter(col('df.group.group_country')=='us').select('df').withColumn('group_state', func1('df.group.group_state')).\ withColumn('time', from_unixtime(col('df.event.time')/1000)) df2 = df.select( struct( struct( col('df.event.event_name'), col('df.event.event_id'), col('time'), ).alias('event'), col('df.group.group_city'), col('df.group.group_country'), col('df.group.group_id'), col('df.group.group_name'), col('group_state')).alias('value')) stream2 = df2.select(to_json('value').alias('value')).writeStream \ .format("kafka") \ .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \ .option("topic", "US-meetups") \ .option("checkpointLocation", "US-metups-checkpoint") stream2 = stream2.start() df3 = df.withColumn('timestamp', to_timestamp('time')).\ withWatermark('timestamp', "1 minute").groupBy(window('timestamp', '1 minute')).\ agg(struct(month('window.end').alias('month'), dayofmonth('window.end').alias('day_of_the_month'), hour('window.end').alias('hour'), minute('window.end').alias('minute'),collect_set('df.group.group_city').alias('cities')).alias('value')).\ select('value') stream3 = df3.select(to_json('value').alias('value')).writeStream \ .format("kafka") \ .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \ .option("topic", "US-cities-every-minute") \ .option("checkpointLocation", "US-cities-every-minute-checkpoint") stream3 = stream3.start() df4 = df.select( struct( struct( col('df.event.event_name'), col('df.event.event_id'), col('time'), ).alias('event'), col('df.group.group_topics.topic_name'), col('df.group.group_city'), col('df.group.group_country'), col('df.group.group_id'), col('df.group.group_name'), col('group_state')).alias('value')).filter( arrays_overlap( 'value.topic_name', array(lit("Computer programming"), lit("Big Data"), lit("Machine Learning"), lit("Python"), lit("Java"), lit("Web Development")))) stream4 = df4.select(to_json('value').alias('value')).writeStream \ .format("kafka") \ .option("kafka.bootstrap.servers",'104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \ .option("topic", "Programming-meetups") \ .option("checkpointLocation", "Programming-metups-checkpoint") stream4 = stream4.start() stream4.awaitTermination() spark.stop()