def dateTransform(self, columns, currentFormat, outputFormat):
    """
    Re-format string date columns from one date pattern to another.

    :param columns          Name (string) or list of names of the date columns to
            be transformed.
    :param currentFormat    Date pattern currently used by the columns specified.
            All columns specified must share the same format; otherwise the
            transformation of the columns with a different format fails and
            yields null values.
    :param outputFormat     Date pattern expected in the output strings.
    :return                 self, so calls can be chained.
    """
    # Check if currentFormat argument a string datatype:
    self.__assertTypeStr(currentFormat, "currentFormat")

    # Check if outputFormat argument a string datatype:
    self.__assertTypeStr(outputFormat, "outputFormat")

    # Check if columns argument must be a string or list datatype:
    self.__assertTypeStrOrList(columns, "columns")

    # isinstance (not an exact type comparison) so that str subclasses are
    # wrapped too; a bare string would otherwise turn `c in columns` below
    # into a substring test.
    if isinstance(columns, str):
        columns = [columns]

    # Check if columns to be process are in dataframe
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

    # Re-format the requested columns in place; pass every other column through.
    exprs = [
        date_format(unix_timestamp(c, currentFormat).cast("timestamp"),
                    outputFormat).alias(c)
        if c in columns else c
        for c in self.__df.columns
    ]

    self.__df = self.__df.select(*exprs)
    self.__addTransformation()  # checkpoint in case

    return self
def ageCalculate(self, column, dateFormat, nameColAge):
    """
    Add a column holding the age derived from a birth-date column.

    The age is the number of months between the birth date and the current
    date divided by 12, passed through `mag` and formatted with 4 decimals.

    :param column       Name of the column that holds the birth dates.
    :param dateFormat   Date pattern of the strings stored in `column`.
    :param nameColAge   Name of the resulting age column.
    :return             self, so calls can be chained.
    """
    # Both `column` and `dateFormat` must be strings.
    self.__assertTypeStr(column, "column")
    self.__assertTypeStr(dateFormat, "dateFormat")

    # The birth-date column has to exist in the dataFrame.
    assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

    # Parse with the caller's pattern, then normalise to yyyy-MM-dd.
    birth_ts = unix_timestamp(column, dateFormat).cast("timestamp")
    birth_day = date_format(birth_ts, "yyyy-MM-dd")

    # NOTE(review): `mag` is not defined in this block; presumably an
    # absolute-value helper -- confirm.
    years = mag(months_between(birth_day, current_date()) / 12)
    age_column = format_number(years, 4).alias(nameColAge)

    self.__df = self.__df.withColumn(nameColAge, age_column)
    self.__addTransformation()  # checkpoint in case

    return self
def process_log_data(spark, input_data, output_data):
    """Build the users, time and songplays tables from the JSON event logs.

    Reads `log_data/*/*/*.json` under `input_data`, keeps only the
    `NextSong` events and writes three parquet datasets under `output_data`
    (`user`, `time`, `songplay`), each in overwrite mode. The `song` and
    `artist` parquet datasets are read back from `output_data`, so they must
    already have been written there.

    Parameters:
        spark -- Spark session
        input_data -- root path of the input data
        output_data -- root path the parquet tables are written to

    Returns:
        None
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col('page') == 'NextSong')

    # extract columns for users table
    # columns user_id, first_name, last_name, gender, level
    users_table = (
        df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])
        .drop_duplicates(subset=['userId'])
        .dropna(subset=['userId'])
    )

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'user'), 'overwrite')
    print('users table count:', users_table.count())

    # create timestamp column (epoch seconds) from original timestamp column.
    # Integer division keeps the Python return value an int: a float returned
    # from a UDF declared as IntegerType() comes back as null.
    get_timestamp = udf(lambda x: int(x // 1000), IntegerType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000),
                       TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    # columns = start_time, hour, day, week, month, year, weekday
    time_table = df.select(
        col('datetime').alias('start_time'),
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'),
        date_format('datetime', 'EEEE').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(['year', 'month']).parquet(
        os.path.join(output_data, 'time'), 'overwrite')
    print('time table row counts:', time_table.count())

    # read in song and artist data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # create temp view before join and extracted
    song_df.createOrReplaceTempView('song')
    df.createOrReplaceTempView('log')
    artist_df.createOrReplaceTempView('artist')

    # extract columns from joined song and log datasets to create songplays
    # table; columns = songplay_id, start_time, user_id, level, song_id,
    # artist_id, session_id, location, user_agent
    songplays_table = spark.sql("""
        SELECT l.datetime as start_time,
               l.userId as user_id,
               l.level,
               s.song_id,
               a.artist_id,
               l.sessionId as session_id,
               l.location,
               l.userAgent as user_agent,
               year(l.datetime) as year,
               month(l.datetime) as month
        FROM log as l
        LEFT JOIN song as s ON (l.song = s.title)
        LEFT JOIN artist as a ON (l.artist = a.name)
            AND (s.artist_id = a.artist_id)
    """)

    # add a primary key column
    from pyspark.sql.functions import monotonically_increasing_id
    songplays_table = songplays_table.withColumn(
        'songplays_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(['year', 'month']).parquet(
        os.path.join(output_data, 'songplay'), 'overwrite')
    print('song play row counts:', songplays_table.count())
StructField("locality", StringType(), True), StructField("town_city", StringType(), True), StructField("district", StringType(), True), StructField("county", StringType(), True), StructField("ppd_category_type", StringType(), True), StructField("record_status", StringType(), True) ]) log("Loading file...") pricePaid = df.option("header", "false").csv("/Users/manamohanpanda/Downloads/pp-complete.csv", enforceSchema=True, schema=schema) # filter greater london data filterGreaterLondon = pricePaid.filter(pricePaid["county"] == "GREATER LONDON") # filter district and county and their sale date as well as transform date in MM/yyyy format formatteddf = filterGreaterLondon.select("district","county",date_format(unix_timestamp("date_of_transfer", "yyyy-MM-dd").cast("timestamp"),"MM/yyyy").alias("dot")) log("---1--") # group data by date of transfer and district, count each district sale sorteddf = formatteddf.groupBy("dot","district").count() log("--2---") # current_year = 2018 months = ["01","02","03","04","05","06","07","08","09","10","11","12"] # iterate data for all years from 1995 to 2018, then sort and write top 5 data to month files # using coalesce(1) only for test purpose, it doesnt have to be and avoiding it will make it much faster for y in range(1995,2019): for x in months:
def do_post_preprocessing(self, preprocessed):
    """Add `hour` and `week_day_number` columns derived from `datetime`.

    :param preprocessed: DataFrame with a `datetime` column.
    :return: the DataFrame with an `hour` column and an integer
        `week_day_number` column (1 = Monday ... 7 = Sunday).
    """
    # The "u" date_format pattern (day number of week) is rejected by the
    # Spark 3 datetime formatter. dayofweek() counts 1 = Sunday ... 7 = Saturday,
    # so shift it to the same 1 = Monday ... 7 = Sunday numbering "u" produced.
    week_day_number = (func.dayofweek("datetime") + 5) % 7 + 1

    return preprocessed.withColumn(
        "hour", func.hour("datetime")).withColumn(
            "week_day_number", week_day_number.cast(IntegerType()))
def main():
    """Create the BigQuery dataset and upload the external and citibike tables.

    Command line: `<bucket_name> <dataset_name> [--test]`. The external
    tables listed in EXTERNAL_TABLES are uploaded as-is. The table named by
    TABLE is read, deliberately "dirtied" (UDF-mangled columns, random nulls,
    duplicated rows) and uploaded as CITIBIKE_TABLE_NAME. With `--test` only
    a small sample of TABLE is processed.
    """
    # Positional command line arguments
    bucket_name = sys.argv[1]
    dataset_name = sys.argv[2]

    # SparkSession named "setup"; the bucket is registered as the
    # temporary GCS bucket in the Spark configuration
    spark = SparkSession.builder.appName("setup").getOrCreate()
    spark.conf.set("temporaryGcsBucket", bucket_name)

    create_bigquery_dataset(dataset_name)

    # Running as a test means only a subset of the data is processed
    test = "--test" in sys.argv
    if test:
        print("A subset of the whole dataset will be uploaded to BigQuery")
    else:
        print("Results will be uploaded to BigQuery")

    # Ingest the external datasets unchanged
    for table_name, data in EXTERNAL_TABLES.items():
        external_df = spark.createDataFrame(pd.read_csv(data["url"]),
                                            schema=data["schema"])
        write_to_bigquery(external_df, table_name, dataset_name)

    # Load the source table; give up if it does not exist
    try:
        df = spark.read.format("bigquery").option("table", TABLE).load()
        if test:
            # Perform the computations on a subset of the data only
            df = df.sample(False, 0.00001)
    except Py4JJavaError:
        print(f"{TABLE} does not exist. ")
        return

    # Column name -> (user defined function, return type)
    string_type = StringType()
    udf_map = {
        "tripduration": (trip_duration, string_type),
        "start_station_name": (station_name, string_type),
        "start_station_latitude": (convert_angle, string_type),
        "start_station_longitude": (convert_angle, string_type),
        "end_station_name": (station_name, string_type),
        "end_station_latitude": (convert_angle, string_type),
        "end_station_longitude": (convert_angle, string_type),
        "usertype": (user_type, string_type),
        "gender": (gender, string_type),
    }

    # Columns in which some values are randomly replaced by null
    null_columns = [
        "tripduration",
        "starttime",
        "stoptime",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_latitude",
        "end_station_longitude",
    ]

    # Dirty the columns
    for column_name, (dirty_fn, return_type) in udf_map.items():
        dirty_udf = UserDefinedFunction(dirty_fn, return_type)
        df = df.withColumn(column_name, dirty_udf(column_name))

    # Format the datetimes correctly
    for column_name in ("starttime", "stoptime"):
        df = df.withColumn(
            column_name, date_format(column_name, "yyyy-MM-dd'T'HH:mm:ss"))

    # Randomly set about 5% of the values in some columns to null
    for column_name in null_columns:
        randomly_null = when(expr("rand() < 0.05"), None)
        df = df.withColumn(column_name, randomly_null.otherwise(df[column_name]))

    # Duplicate about 0.01% of the rows and append them to the dataframe
    duplicates = df.sample(True, 0.0001)
    df = df.union(duplicates)

    print("Uploading citibike dataset...")
    write_to_bigquery(df, CITIBIKE_TABLE_NAME, dataset_name)
# COMMAND ----------

# MAGIC %md <h4>Create Time Dimension</h4>

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.functions import concat, col, lit

# One row per distinct combination of the calendar parts of IssuedAt.
# NOTE(review): time_id concatenates the unpadded parts, so different
# timestamps may produce the same id (e.g. month 1 / day 11 vs.
# month 11 / day 1) -- confirm whether zero-padding is wanted.
time = (
    money.select(
        year(money['IssuedAt']).alias("year"),
        dayofmonth(money['IssuedAt']).alias("day"),
        month(money['IssuedAt']).alias("month"),
        hour(money['IssuedAt']).alias("hour"),
        minute(money['IssuedAt']).alias("minute"),
        date_format(money['IssuedAt'], 'EEEE').alias("dayofweek"),
        weekofyear(money['IssuedAt']).alias("weekofyear"),
    )
    .withColumn(
        "time_id",
        concat(col("year"), col("month"), col("day"), col("hour"), col("minute")),
    )
    .distinct()
)

# COMMAND ----------

# MAGIC %md <h4>Create a Product Dimension</h4>

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.functions import concat, col, lit

# Start of the product dimension: ticket type and price only
product = money.select(['ticketType', 'Price'])
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import sys
import pandas as pd

# Monthly max / min / average temperature for the cities named on the
# command line, written to task-4.csv.

sc = SparkSession \
    .builder \
    .appName("temperatures") \
    .getOrCreate()

# Please enter a set of cities as the command-line arguments
# for example, Kyiv Paris
# (argv[0] is the script name, so it is skipped)
cities = sys.argv[1:]

data_file = "C:/Users/eugen/Documents/temperatures/temperatures.csv"
df = sc.read.csv(data_file, header=True, sep=",").cache()

df2 = df.withColumn('Temperature', df['Temperature'].cast('double'))

# Month key. 'yyyy' is the calendar year; 'YYYY' is the week-based year,
# which mislabels days around New Year and is rejected by Spark 3.
dfByMonth = df2.withColumn('Date', F.date_format('Date', 'yyyy-MM'))

# 4 Max, Min, Avg Temperature per month for a given N of cities
df_cities = dfByMonth.filter(dfByMonth.City.isin(cities))

# F.-qualified aggregates: no reliance on the star import shadowing the
# max / min / round builtins.
df_result = df_cities.groupBy("City", "Date").agg(
    F.max(F.col("Temperature")).alias("MaxTemp"),
    F.min(F.col("Temperature")).alias("MinTem"),
    F.round(F.mean(F.col("Temperature")), 1).alias("AvgTemp"))

df_result.toPandas().to_csv("C:/Users/eugen/Documents/temperatures/task-4.csv")
def prep_travelers_data(config):
    """
    Read travelers data in from SAS files into Spark and export to CSV

    `config` is indexed as config['INPUT']['TRAVELERS'] (input location) and
    config['OUTPUT']['FOLDER'] / config['OUTPUT']['TRAVELERS'] (output
    location). After the export, files ending in 'crc' or starting with '_'
    are deleted from the output directory.
    """
    # Spark session with the SAS reader package and Hive support
    builder = SparkSession.builder.config(
        "spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    spark = builder.enableHiveSupport().getOrCreate()

    # Load the SAS file
    sas_reader = spark.read.format('com.github.saurfang.sas.spark')
    i94_df = sas_reader.load(config['INPUT']['TRAVELERS'])

    # Keep and rename the columns of interest
    travel_data = i94_df.selectExpr(
        "i94port as iata_code",
        "arrdate as arrival_date",
        "i94bir as age",
        "i94visa as visa",
        "biryear as year_of_birth",
        "gender")

    # Filter out any non-existant airport codes
    travel_data = travel_data.filter(travel_data.iata_code != 'XXX')

    # SAS dates are converted as day offsets from 1960-01-01
    start_date = datetime.datetime(1960, 1, 1)

    def sas_days_to_date(x):
        if x is None:
            return None
        return start_date + datetime.timedelta(days=int(x))

    convert_sas_date = F.udf(sas_days_to_date, T.DateType())
    travel_data_clean = travel_data.withColumn(
        'arrival_date', convert_sas_date('arrival_date'))

    # Split the arrival date into year, month and day columns
    date_parts = (("arrival_year", "y"),
                  ("arrival_month", "M"),
                  ("arrival_day", "d"))
    for part_name, pattern in date_parts:
        travel_data_clean = travel_data_clean.withColumn(
            part_name, F.date_format(F.col("arrival_date"), pattern))

    # The full date is no longer needed; rows without gender are dropped
    travel_data_clean = travel_data_clean.drop(F.col('arrival_date'))
    travel_data_clean = travel_data_clean.filter(
        travel_data_clean.gender.isNotNull())

    # Cast datatypes to the appropriate column types
    travel_data_final = travel_data_clean.selectExpr(
        "iata_code",
        "cast(age as int) as age",
        "cast(visa as int) as visa",
        "gender",
        "cast(year_of_birth as int) as year_of_birth",
        "cast(arrival_year as int) as arrival_year",
        "cast(arrival_month as int) as arrival_month",
        "cast(arrival_day as int) as arrival_day")

    # Export the dataframe to csv format
    output_dir = config['OUTPUT']['FOLDER'] + '/' + config['OUTPUT']['TRAVELERS']
    travel_data_final.write.mode("overwrite").csv(output_dir)

    # Remove files that are not necessary for import to redshift
    for f in os.listdir(output_dir):
        if not (f.endswith('crc') or f.startswith('_')):
            continue
        os.remove(f"{output_dir}/{f}")
# Parse the leading part of the Datetime string (substring called with
# position 0 and length 19) into a timestamp
gps_df = gps_df.withColumn(
    "Datetime", F.to_timestamp(F.substring(F.col("Datetime"), 0, 19)))

# Import train_hire_stats.csv as dataframe using the defined schema
schema = StructType([
    StructField("Zone_ID", ByteType(), False),
    StructField("Date", TimestampType(), False),
    StructField("Hour_slot", ByteType(), False),
    StructField("Hire_count", ShortType(), False)
])

train_df = spark.read.format("csv").option("header", "true").option(
    "delimiter", ",").schema(schema).load("data/train_hire_stats.csv")

# Day of the week as 1 = Monday ... 7 = Sunday. The "u" date_format pattern
# that produced this numbering is rejected by the Spark 3 datetime formatter;
# dayofweek() counts 1 = Sunday ... 7 = Saturday, hence the shift.
train_df = train_df.withColumn(
    'Day_of_the_week',
    ((F.dayofweek(train_df["Date"]) + 5) % 7 + 1).cast(IntegerType()))
train_df = train_df.withColumn(
    'Month', (F.date_format(train_df["Date"], "M").cast(IntegerType())))

# Import test_hire_stats.csv as dataframe using the defined schema
schema = StructType([
    StructField("Test_ID", ShortType(), False),
    StructField("Zone_ID", ByteType(), False),
    StructField("Date", TimestampType(), False),
    StructField("Hour_slot", ByteType(), False),
    StructField("Hire_count", ByteType(), False)
])

test_df = spark.read.format("csv").option("header", "true").option(
    "delimiter", ",").schema(schema).load("data/test_hire_stats.csv")
# COMMAND ----------

train_df = spark.sql("SELECT * FROM traindf where StoreState ='Pichincha'")

# COMMAND ----------

train_df.show()

# COMMAND ----------

train_df = train_df.withColumnRenamed("Date","Date_Date")

# COMMAND ----------

from pyspark.sql.functions import date_format, dayofweek

# Lookup of every distinct date with its day-of-week number and name.
# dow_number stays a string holding 1 = Monday ... 7 = Sunday: the "u"
# date_format pattern that produced it is rejected by the Spark 3 datetime
# formatter, so it is derived from dayofweek() (1 = Sunday ... 7 = Saturday).
df3 = train_df.select(
    'Date_Date',
    ((dayofweek('Date_Date') + 5) % 7 + 1).cast('string').alias('dow_number'),
    date_format('Date_Date', 'E').alias('dow_string'))
df3 = df3.distinct()
df3 = df3.withColumnRenamed("Date_Date","Date2")

# COMMAND ----------

# Units summed per state / item family / date, joined to the day-of-week lookup
store_dept_data = train_df.groupBy("StoreState","ItemFamily", "Date_Date").sum("Units").orderBy("Date_Date").join(df3,df3.Date2 == train_df.Date_Date)

# COMMAND ----------

store_dept_data.show(10)

# COMMAND ----------

store_dept_data = store_dept_data.join(stores, (stores.state == store_dept_data.StoreState), "left")
"cell_250m_lon", llc_lon + nyc["x_250m_cell"] * x_grid_step + 0.5 * x_grid_step) nyc = nyc.withColumn("y_250m_cell", ((nyc["lat"] - llc_lat) / y_grid_step).cast('integer')) nyc = nyc.withColumn( "cell_250m_lat", llc_lat + nyc["y_250m_cell"] * y_grid_step + 0.5 * y_grid_step) nyc = nyc.withColumn('cell_index', concat(col("x_250m_cell"), lit(";"), col("y_250m_cell"))) # create hour column nyc = nyc.withColumn( "hour", date_format(col("timestamp").cast("timestamp"), "yyyy-MM-dd HH:00")) # count cell aggregations and save to file hourly_counts = nyc.groupby("hour", "cell_index", "class").agg(countDistinct("ad_id_upper")) hourly_counts.write \ .format("com.databricks.spark.csv") \ .mode("overwrite") \ .save("/user/bjb417/covid/output/nyc/nyc_land_use/nyc_250mGrid_landUse_uniqueDev_hourlyCounts_active14days.csv") # save 250m x 250m grid information grid = nyc.select("cell_index", "x_250m_cell", "y_250m_cell", "cell_250m_lon", "cell_250m_lat") \ .drop_duplicates(subset=['cell_index']) grid.write \
def join_function(path_nbr, path_nlu):
    """Return the joined NLU / NBR Spark DataFrame with conversation features.

    Both parquet datasets are filtered to dates on or after 2019-07-01,
    every column is prefixed with its source (`nbr_` / `nlu_`), the two are
    combined, and per-sender window features are added (`is_answered`,
    `question_rank`, `time_from_start`, `sum_answer`, `num_quick_replies`,
    `hour`, `day_of_week`, `last_nbr_code` and `last_<nlu column>`).

    :param path_nbr: location of the NBR parquet data
    :param path_nlu: location of the NLU parquet data
    :return: the joined and enriched Spark DataFrame
    """
    # nlu
    spark_nlu = sqlContext.read.parquet(path_nlu)
    spark_nlu = spark_nlu.filter((F.to_date("date") >= F.lit("2019-07-01")))

    # nbr
    spark_nbr = sqlContext.read.parquet(path_nbr)
    spark_nbr = spark_nbr.filter((F.to_date("date") >= F.lit("2019-07-01")))

    spark_nbr = spark_nbr.withColumn('source', F.lit('nbr'))
    spark_nlu = spark_nlu.withColumn('source', F.lit('nlu'))

    # prefix every column name with its source
    for c in spark_nbr.columns:
        spark_nbr = spark_nbr.withColumnRenamed(c, "nbr_{}".format(c))

    for c in spark_nlu.columns:
        spark_nlu = spark_nlu.withColumnRenamed(c, "nlu_{}".format(c))

    nbr_cols = [
        'nbr_sender_id', 'nbr_retailer_id', 'nbr_timestamp',
        'nbr_ts_plus_response', 'nbr_conv', 'nbr_ack_text',
        'nbr_response_code', 'nbr_possible_values', 'nbr_source', 'nbr_date'
    ]

    nlu_cols = [
        'nlu_sender_id', 'nlu_retailer_id', 'nlu_gender', 'nlu_age_group',
        'nlu_text', 'nlu_timestamp', 'nlu_intents_list', 'nlu_subvertical',
        'nlu_positive_aspects', 'nlu_positive_product_type',
        'nlu_positive_brands', 'nlu_negative_aspects',
        'nlu_negative_product_type', 'nlu_conv', 'nlu_source', 'nlu_date'
    ]

    spark_nbr2 = spark_nbr.select(nbr_cols)
    spark_nlu2 = spark_nlu.select(nlu_cols)

    # nbr_source is always 'nbr' and nlu_source always 'nlu' (set above), so
    # the join condition never holds: the full outer join keeps every row of
    # both sides with the other side's columns null.
    jnd = spark_nlu2.join(spark_nbr2,
                          spark_nbr2.nbr_source == spark_nlu2.nlu_source,
                          how='full_outer')

    collect_values_udf = F.udf(collect_values, ArrayType(StringType()))

    jnd = jnd.withColumn('nbr_possible_answers',
                         collect_values_udf(F.col('nbr_possible_values')))

    # unified sender id / timestamp / retailer across the two sources
    jnd = jnd.withColumn(
        'jnd_sender_id',
        F.when(F.col('nlu_sender_id').isNull(),
               F.col('nbr_sender_id')).otherwise(F.col('nlu_sender_id')))

    jnd = jnd.withColumn(
        'jnd_ts',
        F.when(F.col('nbr_ts_plus_response').isNull(),
               F.col('nlu_timestamp')).otherwise(
                   F.col('nbr_ts_plus_response')))

    jnd = jnd.withColumn(
        'jnd_retailer',
        F.when(F.col('nlu_retailer_id').isNull(),
               F.col('nbr_retailer_id')).otherwise(F.col('nlu_retailer_id')))

    # function that marks q's as answered
    # could make this function better with first / last (that are not None)
    # instead of taking lead and lead 2
    def check_isin(lead_nlu_text, lead_nlu_text2, possible_values,
                   question_code, lag_positive_aspects, lead_positive_aspects,
                   lag_subvertical, lead_subvertical, lag_pos_product_type,
                   lead_pos_product_type, lead_pos_product_type2):
        """Return 1 when the question looks answered, 0 when not, None on error."""
        try:
            # check if response value is in quick replies
            if (lead_nlu_text in possible_values) or (lead_nlu_text2
                                                      in possible_values):
                return 1

            if (question_code == 'color_question') and (
                    'color' not in lag_positive_aspects) and (
                        'color' in lead_positive_aspects):
                return 1

            if (question_code == 'subvertical_selection' or question_code
                    == 'subvertical_selection_second') and (
                        lag_subvertical is None) and (lead_subvertical
                                                      is not None):
                return 1

            if (question_code == 'product_type_selection') and (
                    lag_pos_product_type is None) and (
                        (lead_pos_product_type is not None) or
                        (lead_pos_product_type2 is not None)):
                return 1
            else:
                return 0
        except Exception:
            # e.g. a membership test against a None column value; the
            # resulting null is replaced by 0 via fillna below
            return None

    check_isin_udf = F.udf(check_isin, IntegerType())

    window = Window.partitionBy("jnd_sender_id").orderBy(["jnd_ts"])

    jnd = jnd.withColumn(
        'is_answered',
        check_isin_udf(
            F.lead('nlu_text').over(window),
            F.lead('nlu_text', 2).over(window),
            F.col('nbr_possible_answers'), F.col('nbr_response_code'),
            F.lag('nlu_positive_aspects').over(window),
            F.lead('nlu_positive_aspects').over(window),
            F.lag('nlu_subvertical').over(window),
            F.lead('nlu_subvertical').over(window),
            F.lag('nlu_positive_product_type').over(window),
            F.lead('nlu_positive_product_type').over(window),
            F.lead('nlu_positive_product_type', 2).over(window)))

    jnd = jnd.fillna({'is_answered': 0})

    # fix ids l22y83vocf, 00fma5y5xgf

    # data set features
    # DONT FORGET THAT YOUVE ADDED RESPONSE TIME TO TS MIGHT BE A HUGE BIAS
    jnd = jnd.withColumn(
        'question_rank',
        F.sum(F.when(F.col('nbr_response_code').isNotNull(),
                     1).otherwise(0)).over(window))

    jnd = jnd.withColumn('time_from_start',
                         F.col('jnd_ts') - F.min('jnd_ts').over(window))

    jnd = jnd.withColumn('sum_answer',
                         F.sum(F.lag('is_answered').over(window)).over(window))

    jnd = jnd.withColumn('num_quick_replies', F.size('nbr_possible_answers'))

    jnd = jnd.withColumn('hour', F.hour('nbr_date'))

    # Day of week as a string holding 1 = Monday ... 7 = Sunday. The "u"
    # date_format pattern that produced it is rejected by the Spark 3
    # datetime formatter; dayofweek() counts 1 = Sunday ... 7 = Saturday.
    jnd = jnd.withColumn(
        'day_of_week',
        ((F.dayofweek('nbr_date') + 5) % 7 + 1).cast('string'))

    jnd = jnd.withColumn(
        "last_nbr_code",
        F.last(F.lag("nbr_response_code").over(window), True).over(window))

    # last non-null value seen so far for each NLU attribute
    nlu_cols = [
        'nlu_intents_list', 'nlu_age_group', 'nlu_gender', 'nlu_subvertical',
        'nlu_positive_aspects', 'nlu_positive_product_type',
        'nlu_positive_brands', 'nlu_negative_aspects',
        'nlu_negative_product_type'
    ]

    for c in nlu_cols:
        jnd = jnd.withColumn("last_{}".format(c),
                             F.last(c, True).over(window))

    return jnd
bins_y=np.array(y_cells).tolist()[0]

# get the bound value
interval_lon = (r_lon+lon_det-(l_lon-lon_det))/x_n
interval_lat = (t_lat+lat_det-(b_lat-lat_det))/y_n
min_lon = l_lon-lon_det
min_lat = b_lat-lat_det

# --- Get the coordinates of each position ---

# read trackestimate table which contains each location
trackestimate_table = "birds.trackestimate"
trackestimate = hc.read.table(trackestimate_table)
track_subset = trackestimate.persist()

# transform the time format to drop those half seconds
trackestimate_subset = track_subset.withColumn('dt', F.date_format('timestamp', 'yyyy-MM-dd HH:mm'))

# define function to get the coordinates
# udf_x = UserDefinedFunction(lambda x: str(loads(x,hex=True).__geo_interface__['coordinates'][0]), StringType())
# udf_y = UserDefinedFunction(lambda x: str(loads(x,hex=True).__geo_interface__['coordinates'][1]), StringType())
def do_something_to_cell(geo_string):
    """Split the text between the 9-character prefix and the final character
    of geo_string on spaces, returning each token wrapped in its own list.

    NOTE(review): presumably the input is WKT-like text such as
    'POINT Z (x y z)' -- confirm against the st_astext column.
    """
    return [cell.split(' ') for cell in str(geo_string[9:-1]).split(' ')]

# first token -> x, second token -> y
udf_x = UserDefinedFunction(lambda x: do_something_to_cell(x)[0][0], StringType())
udf_y = UserDefinedFunction(lambda x: do_something_to_cell(x)[1][0], StringType())

# transform the coordinates and the datatype
trackestimate_subset_coord=trackestimate_subset.withColumn('position_x', udf_x(F.col('st_astext')).astype('float'))
trackestimate_subset_coord=trackestimate_subset_coord.withColumn('position_y',udf_y(F.col('st_astext')).astype('float'))

# --- Assign the coordinates into cells ---
def process_log_data(spark, input_data, output_data):
    '''
    Loads the log data from the s3 location, creates the users, time and
    song_plays tables and writes them back to s3 as parquet.

    The `songs` and `artists` temp views must already be registered on the
    Spark session; they are queried to resolve song and artist ids.

    NOTE(review): the output locations are hard-coded to
    s3a://udacity-demo-1-1/...; `output_data` is not used -- confirm.

    Parameters:
        spark -- spark session
        input_data -- input s3 location
        output data -- output s3 location

    Returns:
        None
    '''
    # get filepath to log data file
    log_data = os.path.join(input_data, "log-data/2018/11/*.json")

    # read log data file
    dflogs = spark.read.json(log_data)

    # filter by actions for song plays
    filterDF = dflogs.where("page=='NextSong'")

    # extract columns for users table (taken from the unfiltered log data)
    dfUserWithSchema = dflogs.select([
        c for c in dflogs.columns
        if c in ['userId', 'firstName', 'lastName', 'gender', 'level']
    ])
    dfUserWithSchema.createOrReplaceTempView("users")

    # write users table to parquet files
    dfUserWithSchema.write.mode('overwrite').parquet(
        "s3a://udacity-demo-1-1/users.parquet")
    print("complete users file")
    spark.sql("SELECT count(*) FROM users").show()

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    timestDF = filterDF.withColumn("timestamp", get_timestamp(filterDF.ts))

    # date string built from the original timestamp column; used below to
    # derive the year / month partition columns of the songplays table
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))

    # extract columns to create time table
    get_time_val = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%H-%M-%S'))
    timeallDF = timestDF.withColumn('starttime', get_time_val(filterDF.ts))
    timeallDF = timeallDF.withColumn('hour', hour('timestamp'))
    timeallDF = timeallDF.withColumn('day', dayofmonth('timestamp'))
    timeallDF = timeallDF.withColumn('week', weekofyear('timestamp'))
    timeallDF = timeallDF.withColumn('month', month('timestamp'))
    timeallDF = timeallDF.withColumn('year', year('timestamp'))
    timeallDF = timeallDF.withColumn('weekday', date_format('timestamp', 'E'))

    timeDF = timeallDF.select([
        c for c in timeallDF.columns if c in
        ['starttime', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    ])

    # write time table to parquet files partitioned by year and month
    timeDF.write.partitionBy("year", "month").mode('overwrite').parquet(
        "s3a://udacity-demo-1-1/time.parquet")
    parquettimeDF = spark.read.parquet("s3a://udacity-demo-1-1/time.parquet")
    parquettimeDF.createOrReplaceTempView("time")
    print("complete time file")
    spark.sql("SELECT count(*) FROM time").show()

    # read in song data to use for songplays table
    song_artist_df = spark.sql(
        "SELECT song_id, a.artist_id,title,artist_name,duration FROM songs a inner join artists b where a.artist_id=b.artist_id"
    )
    song_artist_df.createOrReplaceTempView("song_artist")

    # extract columns from joined song and log datasets to create songplays
    # table; the song_id column carries the song's id (not its title)
    songplays_table = filterDF.join(
        song_artist_df,
        (filterDF.song == song_artist_df.title)
        & (filterDF.artist == song_artist_df.artist_name),
        'left_outer'
    ).select(
        get_time_val(filterDF.ts).alias('starttime'),
        col("userId").alias('user_id'),
        filterDF.level,
        song_artist_df.song_id,
        song_artist_df.artist_id,
        col("sessionId").alias("session_id"),
        filterDF.location,
        col("useragent").alias("user_agent"),
        year(get_datetime(filterDF.ts)).alias('year'),
        month(get_datetime(filterDF.ts)).alias('month')
    )
    songplays_table = songplays_table.withColumn(
        'songplay_id', monotonically_increasing_id())
    songplays_table.createOrReplaceTempView("song_plays")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode(
        'overwrite').parquet("s3a://udacity-demo-1-1/songplay.parquet")
    print("complete song_plays file")
    spark.sql("SELECT count(*) FROM song_plays").show()
# MAGIC %md
# MAGIC Next, we'll use the time functions to convert our timestamp into Central European Summer Time (CEST).

# COMMAND ----------

# Peek at the raw timestamp values
filtered.select(['timestamp']).show(5)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's try applying `date_format` to see how it operates.

# COMMAND ----------

# Timestamp next to its MM/dd/yyyy string rendering
filtered.select([
    'timestamp',
    func.date_format('timestamp', 'MM/dd/yyyy').alias('date'),
]).show(5)

# COMMAND ----------

# Same formatting, kept as a `date` column on the DataFrame
withDate = filtered.withColumn(
    'date',
    func.date_format('timestamp', 'MM/dd/yyyy'),
)
withDate.printSchema()
withDate.select(['title', 'timestamp', 'date']).show(3)

# COMMAND ----------

# MAGIC %md
# MAGIC It seems like we want a different function for time zone manipulation and to store the object as a timestamp rather than a string. Let's use `from_utc_timestamp` to get a timestamp object back with the correct time zone.

# COMMAND ----------
def getFeature(hourlyfeaturedf, scoreBegin):
    """Add time-based features to the hourly data of the scoring window.

    Rows are restricted to scoreBegin <= StartHour < scoreEnd (both obtained
    from getScoreTime(scoreBegin)). The added columns are year, month,
    hourofday, weekofyear, dayofweek, dayofmonth, linearTrend, date, Holiday,
    BusinessHour and Morning. The result is persisted before it is returned.

    Relies on the module-level names trainBegin, holidayBegin and holidayEnd.

    :param hourlyfeaturedf: DataFrame with a `StartHour` column
    :param scoreBegin: start of the scoring period, passed to getScoreTime
    :return: the persisted, feature-enriched DataFrame
    """
    featureeddf = hourlyfeaturedf
    print(hourlyfeaturedf.columns)
    hourlyfeaturedf.show(5)

    scoreBegin, scoreEnd, featureBegin, scoreEndDateTime, featureBeginDateTime, featureEndDateTime = getScoreTime(
        scoreBegin)

    # keep only the scoring window [scoreBegin, scoreEnd)
    featureeddf = featureeddf.filter(
        featureeddf.StartHour >= lit(scoreBegin).cast(TimestampType())).filter(
            featureeddf.StartHour < lit(scoreEnd).cast(TimestampType()))

    # Extract some time features from the "StartHour" column
    featureeddf = featureeddf.withColumn('year', year(featureeddf['StartHour']))
    featureeddf = featureeddf.withColumn('month',
                                         month(featureeddf['StartHour']))
    featureeddf = featureeddf.withColumn('hourofday',
                                         hour(featureeddf['StartHour']))
    featureeddf = featureeddf.withColumn('weekofyear',
                                         weekofyear(featureeddf['StartHour']))
    dayofweek = F.date_format(featureeddf['StartHour'], 'EEEE')
    featureeddf = featureeddf.withColumn('dayofweek', dayofweek)
    # day of the month (previously filled with the hour by mistake)
    featureeddf = featureeddf.withColumn(
        'dayofmonth', F.dayofmonth(featureeddf['StartHour']))

    import datetime

    trainBeginTimestamp = int(
        datetime.datetime.strftime(
            datetime.datetime.strptime(trainBegin, "%Y-%m-%d %H:%M:%S"), "%s"))

    def linearTrend(x):
        if x is None:
            return 0
        # elapsed time since the start of the training period, in years
        # NOTE(review): this is a float while the UDF below is declared
        # IntegerType(); PySpark may return null for a mismatched Python
        # type -- confirm the intended unit and return type.
        return (x - trainBeginTimestamp) / 3600 / 24 / 365.25

    linearTrendUdf = udf(linearTrend, IntegerType())
    featureeddf = featureeddf.withColumn(
        'linearTrend', linearTrendUdf(F.unix_timestamp('StartHour')))

    cal = USFederalHolidayCalendar()
    holidays_datetime = cal.holidays(start=holidayBegin,
                                     end=holidayEnd).to_pydatetime()
    # a set gives constant-time membership tests inside the UDF
    holidays = {t.strftime("%Y-%m-%d") for t in holidays_datetime}

    def isHoliday(x):
        if x is None:
            return 0
        if x in holidays:
            return 1
        else:
            return 0

    isHolidayUdf = udf(isHoliday, IntegerType())
    featureeddf = featureeddf.withColumn(
        'date', date_format(col('StartHour'), 'yyyy-MM-dd'))
    featureeddf = featureeddf.withColumn("Holiday", isHolidayUdf('date'))

    def isBusinessHour(x):
        if x is None:
            return 0
        if x >= 8 and x <= 18:
            return 1
        else:
            return 0

    isBusinessHourUdf = udf(isBusinessHour, IntegerType())
    featureeddf = featureeddf.withColumn("BusinessHour",
                                         isBusinessHourUdf('hourofday'))

    def isMorning(x):
        if x is None:
            return 0
        if x >= 6 and x <= 9:
            return 1
        else:
            return 0

    isMorningUdf = udf(isMorning, IntegerType())
    featureeddf = featureeddf.withColumn("Morning", isMorningUdf('hourofday'))

    featureeddf.persist()
    return featureeddf
from __future__ import print_function

import pyspark
from pyspark.sql import functions as F
import drpyspark

# Example: count log hits per (minute, url) bucket with drpyspark's debug
# output switched on.
drpyspark.enable_debug_output()

with pyspark.SparkContext() as sc:
    sqlContext = pyspark.sql.SQLContext(sc)

    # Sample access-log records
    logs = sc.parallelize([
        {'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2'},
    ])

    # dict -> Row so a DataFrame can be built from the RDD
    logs = logs.map(lambda l: pyspark.sql.Row(**l))

    # NOTE(review): to_date() reduces the timestamp to a date before the
    # 'minute' column is formatted with an hour-level pattern -- confirm
    # that this bucket granularity is intended.
    logs = sqlContext.createDataFrame(logs)
    logs = logs.withColumn('timestamp', F.to_date(F.from_unixtime('timestamp')))
    logs = logs.withColumn('minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH"))

    # Hits per bucket and url
    logs.groupBy(['minute', 'url']).count().show()
def ingest(self,
           src_resource=None,
           src_path=None,
           src_provider=None,
           dest_resource=None,
           dest_path=None,
           dest_provider=None,
           eventsourcing=False):
    """Copy a source resource into a destination, tracking schema versions.

    Steps, in order:
      1. Resolve source and destination metadata; the destination inherits
         the source's read filter.
      2. Read the source; on failure log the exception and return.
      3. Look up the most recent entry of the ``<dest>/schema`` table; if it
         cannot be read, a new schema id is derived from the current time.
      4. Compare source and destination schemas (ignoring the reserved
         columns) and append a new schema entry when they differ.
      5. Compute the diff with ``dataframe_update`` and write it, appending
         (date-filter policy) or overwriting (otherwise).
      6. Log a summary record.

    :param src_resource: source resource, passed to ``data.metadata``.
    :param src_path: source path, passed to ``data.metadata``.
    :param src_provider: source provider, passed to ``data.metadata``.
    :param dest_resource: destination resource.
    :param dest_path: destination path; defaults to the source path when only
        ``dest_provider`` is given.
    :param dest_provider: destination provider.
    :param eventsourcing: event sourcing is not implemented; when true a
        fatal message is logged and nothing is written to the destination
        data path.
    :return: None.
    """
    logger = logging.getLogger()

    #### constants:
    now = datetime.now()
    reserved_cols = ['_ingested', '_date', '_state']

    #### Source metadata:
    md_src = data.metadata(src_resource, src_path, src_provider)
    if not md_src:
        logger.error("No metadata")
        return

    # filter settings from src (provider and resource)
    filter_params = utils.merge(
        md_src['provider'].get('read', {}).get('filter', {}),
        md_src['resource'].get('read', {}).get('filter', {}))

    #### Target metadata:
    # default path for destination is src path
    if (not dest_resource) and (not dest_path) and dest_provider:
        dest_path = md_src['resource']['path']

    md_dest = data.metadata(dest_resource, dest_path, dest_provider)
    if not md_dest:
        return

    if 'read' not in md_dest['resource']:
        md_dest['resource']['read'] = {}

    # match filter with the one from source resource
    md_dest['resource']['read']['filter'] = filter_params

    #### Read source resource
    try:
        df_src = self._read(md_src)
    except Exception as e:
        logger.exception(e)
        return

    #### Read destination schema info
    # Bound outside the try so it is always available for the schema write
    # further down.
    schema_path = '{}/schema'.format(md_dest['resource']['path'])
    try:
        md = data.metadata(path=schema_path, provider=dest_provider)
        df_schema = self._read(md)
        schema_date_str = df_schema.sort(
            desc("date")).limit(1).collect()[0]['id']
    except Exception:
        # Best effort: the schema table does not exist yet (or is
        # unreadable), so start a new schema version stamped with "now".
        schema_date_str = now.strftime('%Y%m%dT%H%M%S')

    # destination path - append schema date
    dest_path = '{}/{}'.format(md_dest['resource']['path'], schema_date_str)
    md_dest['resource']['path'] = dest_path
    md_dest['url'] = data._url(md_dest)

    # if schema not present or schema change detected
    schema_changed = True
    try:
        df_dest = self._read(md_dest)

        # compare schemas, ignoring the bookkeeping columns
        df_src_cols = [x for x in df_src.columns if x not in reserved_cols]
        df_dest_cols = [x for x in df_dest.columns if x not in reserved_cols]
        schema_changed = (df_src[df_src_cols].schema.json() !=
                          df_dest[df_dest_cols].schema.json())
    except Exception:
        # Best effort: destination does not exist yet; use an empty frame
        # with the source layout.
        df_dest = df_src.filter("False")

    if schema_changed:
        # Different schema, update schema table with new entry
        schema_entry = (schema_date_str, now, df_src.schema.json())
        df_schema = self.context().createDataFrame(
            [schema_entry], ['id', 'date', 'schema'])

        # write the schema to destination provider
        md = data.metadata(path=schema_path,
                           provider=md_dest['resource']['provider'])
        self._write(df_schema, md, mode='append')

    # partitions
    partition_cols = ['_ingested']

    # init df_diff to empty dest dataframe
    df_diff = df_dest.filter("False")

    # Stays None when there is nothing that may be written (event sourcing).
    options = None

    if not eventsourcing:
        if filter_params.get('policy') == 'date' and filter_params.get(
                'column'):
            df_diff = dataframe_update(df_src,
                                       df_dest,
                                       updated_col='_ingested',
                                       eventsourcing=eventsourcing)
            df_diff = df_diff.withColumn(
                '_date',
                date_format(
                    from_utc_timestamp(filter_params['column'], 'GMT+7'),
                    'yyyy-MM-dd'))
            partition_cols += ['_date']
            options = {'mode': 'append', 'partitionBy': partition_cols}
        else:
            df_diff = dataframe_update(df_src,
                                       df_dest.filter("False"),
                                       updated_col='_ingested',
                                       eventsourcing=eventsourcing)
            options = {'mode': 'overwrite', 'partitionBy': partition_cols}
    else:
        # to do
        logger.fatal('event sourcing not implemented yet')

    records_add = df_diff.filter("_state = 0").count()
    records_del = df_diff.filter("_state = 1").count()

    # Fix: previously `options` was unbound on the event-sourcing path and
    # the write below raised NameError whenever the schema had changed.
    if options is not None and (records_add or records_del or schema_changed):
        md = data.metadata(path=dest_path,
                           provider=md_dest['resource']['provider'])
        self._write(df_diff, md, **options)

    end = datetime.now()
    time_diff = end - now

    logdata = {
        'src_url': md_src['url'],
        'src_table': md_src['resource']['path'],
        'source_option': filter_params,
        'schema_change': schema_changed,
        'target': dest_path,
        'upserts': records_add,
        'deletes': records_del,
        'diff_time': time_diff.total_seconds()
    }

    logtype = {
        'dlf_type': '{}.{}'.format(self.__class__.__name__, func_name())
    }
    logger.info(logdata, extra=logtype)
db_analytical_temp + ".Euw_aggregated_cust_zip_temp b on a.CONCAT_AGMNT_NO==b.concat_agmnt_no and a.CUSTOMER_ID==b.customer_id and a.ACCOUNT_SEQ==b.account_seq and a.AGREEMENT_SEQ==b.agreement_seq" ) AMInonAMI_Zip = AMInonAMI_Zip.withColumnRenamed("TOWN_CODE", "ZIP_CODE") AMInonAMI_Zip = AMInonAMI_Zip.withColumn( 'USAGE_VALUE', F.col('USAGE_VALUE').cast(DoubleType())) fileLog("reading AMInonAMI_Zip data") ### reading weather_dcast data Wthr_Dcast = spark.sql("select * from " + db_analytical_temp + ".Euw_weather_data_temp") date_format_function = udf(lambda x: dtm.strptime(x, '%Y-%m-%d'), DateType()) Wthr_Dcast = Wthr_Dcast.withColumn( "WEATHER_DATE", date_format_function(date_format(col("WEATHER_DATE"), "yyyy-MM-dd"))) fileLog("reading weather_dcast data") # take the unique set out of it usage_set = AMInonAMI_Zip.select('CONCAT_AGMNT_NO', 'CUSTOMER_ID', 'ACCOUNT_SEQ', 'AGREEMENT_SEQ', 'SITE_ID', 'SERVICE_SEQ', 'ZIP_CODE').distinct() ##populate additional future dates wrt usage_date and weather dates ### fileLog( "reading unique set of agreements and populating the future dates wrt usage_date and weather dates" ) last_usage_date = AMInonAMI_Zip.agg( max('USAGE_DATE').alias('max_usage')).first()[0] #+timedelta(1) last_weather_date = Wthr_Dcast.agg( max('WEATHER_DATE').alias('max_weather')).first()[0] my_udf = lambda domain: [
def process_log_data(spark, input_data, output_data):
    '''Creates time, users and songplays tables in S3

    Args:
        spark: the spark session created by the create_spark_session function
        input_data: root location of the input data; log files are read from
            ``{input_data}/log_data/*/*/*.json``
        output_data: the location of the time, users and songplays tables in
            S3
    '''
    # get filepath to log data file
    log_data = f'{input_data}/log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(col('page').isin('NextSong'))

    # extract columns for users table
    users_table = df.selectExpr('userId AS user_id',
                                'firstName AS first_name',
                                'lastName AS last_name', 'gender',
                                'level').distinct()

    # write users table to parquet files
    users_table.write.parquet(path=f'{output_data}/users_table/',
                              mode='overwrite')

    # create timestamp column from original timestamp column (ts is divided
    # by 1000 before the conversion).
    # Fix: the pattern used 'hh' (12-hour clock), so afternoon events lost
    # their AM/PM information and hour() below returned the wrong value;
    # 'HH' is the 24-hour field.
    df = df.withColumn(
        'start_time',
        date_format(to_timestamp(col('ts') / 1000),
                    format='yyyy-MM-dd HH:mm:ss'))

    # extract columns to create time table
    time_table = (df.select('start_time')
                  .withColumn('year', year(col('start_time')))
                  .withColumn('month', month(col('start_time')))
                  .withColumn('dayofmonth', dayofmonth(col('start_time')))
                  .withColumn('hour', hour(col('start_time')))
                  .withColumn('weekofyear', weekofyear(col('start_time')))
                  .distinct())

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(path=f'{output_data}/time_table/',
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    # NOTE(review): `song_data` is not defined inside this function; it must
    # exist at module level for this line to work -- confirm.
    dfSong = spark.read.format("json").load(song_data)

    # extract columns from joined song and log datasets to create songplays
    # table
    cond = [
        dfSong.title == df.song, dfSong.artist_name == df.artist,
        dfSong.duration == df.length
    ]
    dfJoined = df.join(dfSong, cond, how='inner')
    # NOTE(review): start_time here is the raw `ts` value, not the formatted
    # string stored in the time table -- confirm this is intended.
    songplays_table = dfJoined.selectExpr(
        'ts AS start_time', 'userId AS user_id', 'level', 'song_id',
        'artist_id', 'sessionId AS session_id',
        'artist_location AS location',
        'userAgent AS user_agent').distinct()

    # write songplays table to parquet files (no partition columns are
    # applied to this write)
    songplays_table.write.parquet(path=f'{output_data}/songplay_table/',
                                  mode='overwrite')
def main():
    """Stream alerts from Kafka into hourly-partitioned parquet files.

    Decodes the Avro payload of each Kafka message, adds year/month/day/hour
    columns derived from the message timestamp, and starts a streaming
    parquet writer partitioned by topic and those columns. Progress is
    exported for the web UI, and the stream either runs until terminated or
    is stopped after ``args.exit_after`` seconds.
    """
    args = getargs(argparse.ArgumentParser(description=__doc__))

    # Spark session and logger (log level comes from the command line).
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Streaming dataframe backed by the Kafka topic.
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False)

    # Only the JSON form of the alert schema is needed here.
    alert_schema_json = get_schemas_from_avro(args.schema)[2]

    # Keep the Kafka timestamp and topic next to the decoded Avro payload.
    decoded_payload = from_avro(df["value"], alert_schema_json).alias("decoded")
    df_decoded = df.select(["timestamp", "topic", decoded_payload])

    # One partition column per time component of the Kafka timestamp.
    df_partitionedby = df_decoded
    for column_name, pattern in (("year", "yyyy"), ("month", "MM"),
                                 ("day", "dd"), ("hour", "HH")):
        df_partitionedby = df_partitionedby.withColumn(
            column_name, date_format("timestamp", pattern))

    # Parquet sink appended to on every micro-batch.
    stream_writer = (
        df_partitionedby.writeStream
        .outputMode("append")
        .format("parquet")
        .option("checkpointLocation", args.checkpointpath_raw)
        .option("path", args.rawdatapath)
        .partitionBy("topic", "year", "month", "day", "hour"))

    # Fixed interval micro-batches when an interval is given, ASAP otherwise.
    if args.tinterval > 0:
        trigger_spec = '{} seconds'.format(args.tinterval)
        countquery = stream_writer.trigger(processingTime=trigger_spec).start()
        ui_refresh = args.tinterval
    else:
        countquery = stream_writer.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Export stream progress for the web UI: live view first, then history.
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    for csv_name, mode in (("live_raw.csv", "live"),
                           ("history.csv", "history")):
        monitor_progress_webui(countquery, ui_refresh, colnames,
                               args.finkwebpath, csv_name, mode)

    # Run until something or someone ends the stream, unless a lifetime
    # was requested.
    if args.exit_after is None:
        countquery.awaitTermination()
        return

    time.sleep(args.exit_after)
    countquery.stop()
    logger.info("Exiting the stream2raw service normally...")
def process_log_data(spark, input_data, output_data, songs_data):
    """
    Transforms expected input data JSON file into 3 analytics dataframes
    (users, time, songplays) and writes them out to the output location
    given.

    Parameters:
    -- spark - spark session object
    -- input_data - string ['LOCAL' OR 'REMOTE'] selecting the config section
                    whose 'LOG_DATA' path is read
    -- output_data - string ['LOCAL' OR 'REMOTE'] selecting the config
                     section whose 'OUTPUT_PATH' is written to
    -- songs_data - song dataframe joined against the log data on song title
                    and artist name
    """
    # get filepath to log data file
    log_data = config[input_data]['LOG_DATA']

    # Set output filepath
    output_location = config[output_data]['OUTPUT_PATH']

    # read log data file
    log_df = spark.read.format('json').load(log_data)

    # filter by actions for song plays
    log_df = log_df.filter(col('page') == 'NextSong')

    # extract columns for users table
    users_table = log_df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender'),
        col('level')).distinct()

    # write users table to parquet files
    users_table.write.partitionBy('gender').parquet(
        os.path.join(output_location + "/users", "users_table"), "overwrite")

    # create datetime column from original timestamp column
    # (ts is divided by 1000 before being handed to from_unixtime)
    log_df = log_df.withColumn('timestamp',
                               from_unixtime(col('ts') / 1000)).drop('ts')

    # extract columns to create time table
    # Fix: the pattern was 'HH:MM:ss', where 'MM' is month-of-year; minutes
    # are 'mm'.
    time_table = log_df.select(
        date_format('timestamp', 'HH:mm:ss').alias('start_time'),
        hour('timestamp').alias('hour'),
        dayofmonth('timestamp').alias('day'),
        weekofyear('timestamp').alias('week'),
        month('timestamp').alias('month'),
        year('timestamp').alias('year'),
        date_format('timestamp', 'u').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_location + "/time", "time_table"), "overwrite")

    # read in song data to use for songplays table
    song_df = songs_data.distinct()

    # extract columns from joined song and log datasets to create songplays
    # table
    # Fix: 'year' is now derived from the play timestamp. Previously only
    # 'month' was, so the 'year' partition column was whatever `year` column
    # the song dataframe carried (presumably the song's release year).
    songplays_table = log_df.join(
        song_df,
        (log_df.song == song_df.title)
        & (log_df.artist == song_df.artist_name))\
        .withColumn('songplay_id', monotonically_increasing_id())\
        .withColumn('year', year('timestamp'))\
        .withColumn('month', month('timestamp'))\
        .select(col('songplay_id'),
                date_format('timestamp', 'HH:mm:ss').alias('start_time'),
                col('userId').alias('user_id'),
                col('level'),
                col('song_id'),
                col('artist_id'),
                col('sessionId').alias('session_id'),
                col('location'),
                col('userAgent').alias('user_agent'),
                col('year'),
                col('month'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_location + "/songplays", "songplays_table"),
        "overwrite")
def process_log_data(spark, input_data, output_data):
    """
    Build the users, time and songplays tables from the log data and store
    each of them as parquet under ``output_data``.

    The log timestamp is broken down into hour, day, week, month, year and
    weekday for the time table.

    Args:
        - spark: A Spark Session
        - input_data: location of the log data (read as JSON)
        - output_data: location the tables are written to; the songs parquet
          is also read from ``output_data + "/song_data/songs.parquet"``

    Returns:
        None
    """
    # Load the raw log and keep only the song-play events.
    raw_log = spark.read.json(input_data)
    plays = raw_log.filter(raw_log['page'] == "NextSong")

    # Users table: one row per userId.
    user_columns = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = plays.select(*user_columns).dropDuplicates(['userId'])
    users_path = output_data + '/users/users.parquet'
    users_table.write.mode('overwrite').parquet(users_path)

    # `ts` is in milliseconds: divide by 1000 to get seconds, then cast to
    # TimestampType.
    event_time = (col('ts') / 1000).cast(TimestampType())
    plays = plays.withColumn('tsconvert', event_time)
    plays.createOrReplaceTempView("log_staging")

    # Time table: the timestamp plus its calendar components.
    time_columns = [
        col('tsconvert').alias('start_time'),
        hour('tsconvert').alias('hour'),
        dayofmonth('tsconvert').alias('day'),
        weekofyear('tsconvert').alias('week'),
        month('tsconvert').alias('month'),
        year('tsconvert').alias('year'),
        date_format('tsconvert', 'EEEE').alias('weekday'),
    ]
    time_table = plays.select(*time_columns).dropDuplicates()

    # Time table is partitioned by year and month on disk.
    time_path = output_data + '/time/time.parquet'
    time_writer = time_table.write.partitionBy('year', 'month')
    time_writer.mode('overwrite').parquet(time_path)

    # Songs written earlier are needed to resolve song and artist ids.
    songs = spark.read.parquet(output_data + "/song_data/songs.parquet")
    songs.createOrReplaceTempView("songs_staging")

    # Songplays: log events matched to songs by title.
    songplays_table = spark.sql(
        """SELECT DISTINCT logs.tsconvert AS starttime, logs.userId, logs.level, songs.song_id, songs.artist_id, logs.sessionId, logs.location, logs.userAgent, year(logs.tsconvert) as year, month(logs.tsconvert) as month FROM songs_staging AS songs INNER JOIN log_staging AS logs ON logs.song = songs.title""")

    # Songplays table is partitioned by year and month on disk.
    songplays_path = output_data + '/songplays/songplays.parquet'
    songplays_writer = songplays_table.write.partitionBy('year', 'month')
    songplays_writer.mode('overwrite').parquet(songplays_path)
def process_log_data(spark, input_data, output_data):
    """
    Build the users, time and songplays tables from the event logs and
    write them as parquet below ``output_data``.

    Parameters
    -------
    spark: object
        Spark Session object to handle the Spark Processes
    input_data: str
        Location prefix the log files are read from
    output_data: str
        Location prefix the tables are written to; the previously written
        ``songs/`` and ``artists/`` tables are read from here as well
    """
    # Load the event logs and keep song plays only.
    events = spark.read.json(input_data + "log_data/*/*/*events.json")
    events = events.filter(events["page"] == "NextSong")

    # Users table.
    user_exprs = [
        "userId as user_id",
        "firstName as first_name",
        "lastName as last_name",
        "gender",
        "level",
    ]
    users_table = events.selectExpr(user_exprs).drop_duplicates()
    users_table.write.parquet(output_data + "users/")

    # `ts` is divided by 1000 and truncated to an int before conversion.
    def _to_datetime(ms):
        return D.fromtimestamp(int(ms / 1000))

    get_datetime = F.udf(_to_datetime, T.TimestampType())
    events = events.withColumn("start_time", get_datetime("ts"))

    # Time table: start_time plus its calendar components.
    time_table = events.select("start_time")
    for part_name, extractor in (("hour", F.hour),
                                 ("day", F.dayofmonth),
                                 ("week", F.weekofyear),
                                 ("month", F.month),
                                 ("year", F.year)):
        time_table = time_table.withColumn(part_name, extractor("start_time"))
    time_table = time_table.withColumn(
        "weekday", F.date_format('start_time', 'EEEE')).drop_duplicates()

    # Time table is partitioned by year and month on disk.
    time_table.write.partitionBy("year", "month").parquet(output_data + 'time/')

    # Song catalogue: songs joined to their artists. The songs' artist id is
    # renamed so it does not clash with the artists' own column.
    songs_table = spark.read.parquet(output_data + "songs/").selectExpr(
        ["song_id", "artist_id as s_artist_id", "title"])
    artists_table = spark.read.parquet(output_data + "artists/").select(
        ["artist_id", "location", "name"])
    same_artist = songs_table["s_artist_id"] == artists_table["artist_id"]
    song_df = songs_table.join(artists_table, same_artist, "inner")

    # Reduce the events to the songplay attributes and add id/partition
    # columns.
    play_exprs = [
        "start_time",
        "userId as user_id",
        "level",
        "sessionId as session_id",
        "userAgent as user_agent",
        "song",
        "artist",
    ]
    plays = events.selectExpr(play_exprs)
    plays = plays.withColumn("songplay_id", F.monotonically_increasing_id())
    plays = plays.withColumn("month", F.month("start_time"))
    plays = plays.withColumn("year", F.year("start_time"))

    # Left join: plays without a catalogue match are kept.
    matches_catalogue = ((plays["song"] == song_df["title"])
                         & (plays["artist"] == song_df["name"]))
    plays = plays.join(song_df, matches_catalogue, "left")

    # Final songplays projection.
    songplays_table = plays.select([
        "songplay_id", "start_time", "user_id", "level", "song_id",
        "artist_id", "session_id", "location", "user_agent", "year", "month"
    ])

    # Songplays table is partitioned by year and month on disk.
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + 'songplays/')
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string, date_format

if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()

    # Both inputs are CSV files with a header row and inferred column types.
    csv_options = {'header': 'true', 'inferschema': 'true'}
    parkingDF = spark.read.format('csv').options(**csv_options).load(sys.argv[1])
    openDF = spark.read.format('csv').options(**csv_options).load(sys.argv[2])

    # Expose both frames to Spark SQL.
    parkingDF.createOrReplaceTempView("parking")
    openDF.createOrReplaceTempView("open")

    # Summons numbers that are in `parking` but not in `open`.
    parking_numbers = parkingDF.select('summons_number')
    open_numbers = openDF.select('summons_number')
    parking_numbers.subtract(open_numbers).createOrReplaceTempView("temp1")

    # Pull the details of those summonses, ordered by summons number.
    query = """ select parking.summons_number, plate_id, violation_precinct, violation_code, issue_date from parking join temp1 using(summons_number) order by parking.summons_number """
    result = spark.sql(query)

    # One text line per row: the summons number, a tab, then the remaining
    # fields separated by ", ".
    output_line = format_string(
        '%d\t%s, %d, %d, %s',
        result.summons_number,
        result.plate_id,
        result.violation_precinct,
        result.violation_code,
        date_format(result.issue_date, 'yyyy-MM-dd'))
    result.select(output_line).write.save("task1-sql.out", format="text")
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Every (year, month, day-of-month) combination in the requested range is
    generated through joins on a constant key, combinations that do not form
    a valid date are dropped, and the calendar attributes are derived from
    the resulting `date` column.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for dim_date table.
        number_years_out_from_start (int): number out from `start_year` to
            increment (the end year is included).

    Returns:
        Spark DataFrame sorted by `date`.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = list(range(1, 13))
    days = list(range(1, 32))

    # The constant join key turns the inner joins below into a cross product.
    years_df = spark.createDataFrame(
        pd.DataFrame({
            'year': years,
            'temp_join_key': '1'
        }))
    months_df = spark.createDataFrame(
        pd.DataFrame({
            'month': months,
            'temp_join_key': '1'
        }))
    days_df = spark.createDataFrame(
        pd.DataFrame({
            'day_of_month': days,
            'temp_join_key': '1'
        }))

    years_months_df = (years_df.join(months_df, ['temp_join_key'],
                                     how='inner'))
    years_month_days_df = (years_months_df.join(days_df, ['temp_join_key'],
                                                how='inner'))

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'), lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL").withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-',
                           '').cast('integer')))

    date_features = (
        date_keys
        # get `week` and `quarter`
        .withColumn('week', weekofyear(col('date')))
        .withColumn('quarter', quarter(col('date')))
        # get `day_name` and `month_name`
        .withColumn('day_name', date_format(col('date'), 'EEEE'))
        .withColumn('month_name', date_format(col('date'), 'MMMM'))
        # get `date_year`, `date_quarter`, `date_month`, `date_week`
        # NOTE(review): `week` comes from weekofyear while `year` is the
        # calendar year, so days of one year can share a (week, year)
        # partition across its start and end -- confirm `date_week` is
        # correct at year boundaries.
        .withColumn('date_week',
                    expr("MIN(date) OVER(PARTITION BY week, year)"))
        .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
        .withColumn('date_quarter',
                    expr("MIN(date) OVER(PARTITION BY quarter, year)"))
        .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
        # get `day_of_week`, `day_of_quarter`, `day_of_year`
        .withColumn('day_of_week', dayofweek(col('date')))
        .withColumn('day_of_quarter',
                    datediff(col('date'), col('date_quarter')) + lit(1))
        .withColumn('day_of_year', dayofyear(col('date')))
        # get `weekend_flag`, `us_holiday_flag`, `us_biz_day_flag`,
        # `leap_year_flag`, `month_start_flag`, `month_end_flag`
        .withColumn('weekend_flag',
                    when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
        .withColumn('us_holiday_flag',
                    pd_is_holiday_usa(col('date').cast('timestamp')))
        # Fix: the flag was inverted -- weekends and holidays were marked
        # 'Y'. A business day is a day that is neither.
        .withColumn('us_biz_day_flag',
                    when((col('weekend_flag') == lit('Y'))
                         | (col('us_holiday_flag') == lit('Y')),
                         'N').otherwise('Y'))
        # Leap year <=> February of that year ends on the 29th.
        .withColumn('leap_year_flag',
                    when(dayofmonth(last_day(
                        concat(col('year'), lit('-02-01')).cast('date'))) == 29,
                        'Y').otherwise('N'))
        .withColumn('month_start_flag',
                    when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
        .withColumn('month_end_flag',
                    when(col('date') == last_day(col('date')),
                         'Y').otherwise('N'))
        # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
        .withColumn('pct_into_month',
                    (col('day_of_month')
                     / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
        .withColumn('date_quarter_end',
                    when(col('quarter') == lit(1),
                         concat(col('year'), lit('-03-31')))
                    .when(col('quarter') == lit(2),
                          concat(col('year'), lit('-06-30')))
                    .when(col('quarter') == lit(3),
                          concat(col('year'), lit('-09-30')))
                    .when(col('quarter') == lit(4),
                          concat(col('year'), lit('-12-31')))
                    .otherwise(None)
                    .cast('date'))
        .withColumn('days_in_quarter',
                    datediff(col('date_quarter_end'), col('date_quarter'))
                    + lit(1))
        .withColumn('pct_into_quarter',
                    (col('day_of_quarter')
                     / col('days_in_quarter')).cast('decimal(7, 6)'))
        .withColumn('pct_into_year',
                    (col('day_of_year')
                     / when(col('leap_year_flag') == lit('Y'), 366.0)
                     .otherwise(365.0)).cast('decimal(7, 6)'))
        # get seasons
        .withColumn('season_northern',
                    when(col('month').isin(12, 1, 2), 'Winter')
                    .when(col('month').isin(3, 4, 5), 'Spring')
                    .when(col('month').isin(6, 7, 8), 'Summer')
                    .when(col('month').isin(9, 10, 11), 'Fall')
                    .otherwise('UNKNOWN'))
        .withColumn('season_southern',
                    when(col('month').isin(6, 7, 8), 'Winter')
                    .when(col('month').isin(9, 10, 11), 'Spring')
                    .when(col('month').isin(12, 1, 2), 'Summer')
                    .when(col('month').isin(3, 4, 5), 'Fall')
                    .otherwise('UNKNOWN')))

    dim_date = (date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter',
        'date_year', 'day_of_week', 'day_of_month', 'day_of_quarter',
        'day_of_year', 'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ]))
    return dim_date
def process_log_data(spark, input_data, output_data):
    """
    Fetch log data, process it and extract the users, time and songplays
    tables from it. Each table is written in parquet format below
    ``output_data``.

    Parameters:
        spark       : Spark Session
        input_data  : Location prefix of the input json files (log data is
                      read from ``log_data/*/*/*.json`` and song data from
                      ``song_data/*/*/*/*.json`` below it)
        output_data : Location prefix the parquet outputs are written to
    """
    # Fix: the tables used to be written to hard-coded root paths
    # ("/users.parquet", ...) and `output_data` was ignored. Stripping the
    # trailing slash keeps the result identical for an empty prefix.
    output_root = output_data.rstrip('/')

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong')
    actions_df.printSchema()

    # extract columns for users table
    users_table = actions_df.select(actions_df.userId, actions_df.firstName,
                                    actions_df.lastName, actions_df.gender,
                                    actions_df.level).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(
        output_root + "/users.parquet")

    # create timestamp column from original timestamp column
    # (ts divided by 1000, as a string)
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn("timestamp",
                                       get_timestamp(actions_df.ts))
    print("creating timestamp column...")
    actions_df.printSchema()

    # create datetime column from original timestamp column
    # (string form of the local datetime)
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn("datetime",
                                       get_datetime(actions_df.ts))
    print("creating datetime column...")
    actions_df.printSchema()

    # extract columns to create time table
    time_table = actions_df.select(
        col('datetime').alias('start_time'),
        hour(col('datetime')).alias('hour'),
        dayofmonth(col('datetime')).alias('day'),
        weekofyear(col('datetime')).alias('week'),
        month(col('datetime')).alias('month'),
        year(col('datetime')).alias('year'),
        date_format(col('datetime'), "u").alias('weekday')).dropDuplicates()
    print("creating time_table...")
    time_table.printSchema()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode('overwrite').parquet(
        output_root + "/time_table.parquet")

    # read in song data to use for songplays table
    song_data = input_data + 'song_data/*/*/*/*.json'
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays
    # table (matched on song title only)
    complete_df = song_df.join(actions_df, song_df.title == actions_df.song,
                               "inner")
    songplays_table = complete_df.select(
        col('datetime').alias('start_time'),
        col('userId').alias('userId'),
        col('level').alias('level'),
        col('song_id').alias('songId'),
        col('artist_id').alias('artistId'),
        col('sessionId').alias('sessionId'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        year(col('datetime')).alias('year'),
        month(col('datetime')).alias('month'),
    ).withColumn('songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode(
        'overwrite').parquet(output_root + "/songplays_table.parquet")
def process_log_data(spark, input_data, output_data):
    """
    Read the log data below ``input_data``, build the users, time and
    songplays tables and write them as parquet below ``output_data``.

    The songs and artists tables are read back from ``output_data``
    (``songs/*/*/*`` and ``artists/*``) to resolve song and artist ids.

    Parameters:
        spark       : Spark session
        input_data  : location prefix of the input data
        output_data : location prefix of the output tables
    """
    # Imported locally: the original body called `day` and `week`, which are
    # not visible anywhere in this function.
    from pyspark.sql.functions import dayofmonth, weekofyear

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_columns = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = df.selectExpr(users_columns).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    # Fix: the UDF is bound to `get_timestamp`, but the next line called the
    # undefined name `get_datetime`.
    get_timestamp = udf(date_convert, TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))

    # extract columns to create time table
    time_table = df.select("start_time").dropDuplicates() \
        .withColumn("hour", hour(col("start_time"))) \
        .withColumn("day", dayofmonth(col("start_time"))) \
        .withColumn("week", weekofyear(col("start_time"))) \
        .withColumn("month", month(col("start_time"))) \
        .withColumn("year", year(col("start_time"))) \
        .withColumn("weekday", date_format(col("start_time"), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time/")

    # read in song data to use for songplays table
    df_song = spark.read.parquet(output_data + 'songs/*/*/*')
    df_artists = spark.read.parquet(output_data + 'artists/*')

    # Fix: the join referenced the undefined name `songs_df`.
    songs_logs = df.join(df_song, (df.song == df_song.title))
    artists_songs_logs = songs_logs.join(
        df_artists, (songs_logs.artist == df_artists.name))

    # extract columns from joined song and log datasets to create songplays
    # table
    # Fix: the time table was joined on `ts == start_time`, comparing the raw
    # epoch value with a timestamp, and left two `start_time` columns in the
    # result. Joining on the shared column name keeps a single `start_time`.
    # NOTE(review): assumes the songs data carries a `year` column that must
    # be dropped in favour of the time table's -- confirm.
    songplays = artists_songs_logs.join(
        time_table, on='start_time',
        how='left').drop(artists_songs_logs.year)

    songplays_table = songplays.select(
        col('start_time').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        col('year').alias('year'),
        col('month').alias('month'),
    ).repartition("year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + 'songplays/')
def main():
    """Stream alerts from Kafka into daily-partitioned parquet files.

    The Avro payload is decoded with ``from_avro`` or with ``fastavro``,
    depending on which server is named in ``args.servers``. The decoded
    struct is flattened, a ``timestamp`` column is derived from
    ``candidate.jd``, and the result is written as a parquet stream
    partitioned by year, month and day.

    If the server is not one of the known data sources, a warning is logged,
    the Spark session is stopped and the function returns without starting a
    stream.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False,
                          kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only the decoded payload
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif kerberos:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: fastavro.reader(io.BytesIO(x)).next(),
                alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()
        # Fix: execution used to fall through and fail with a NameError on
        # the unbound `df_decoded`; without a decoder there is nothing to do.
        return

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data daily, using a timestamp derived from candidate.jd
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
def process_sdf(sdf_drive, sdf_vehicle):
    """Build weekly per-vehicle engine-utilisation band counts.

    The two frames are inner-joined on ``vehicle_id``. The joined frame must
    provide the columns ``datetime``, ``eng_load``, ``max_torque``, ``rpm``,
    ``max_horsepower`` and ``max_horsepower_rpm``. For every row a 0/1
    indicator is computed for each utilisation band, and the indicators are
    then summed per (``vehicle_id``, ``week_start_date``).

    :param sdf_drive: Spark DataFrame of drive records.
    :param sdf_vehicle: Spark DataFrame of vehicle records.
    :return: Spark DataFrame sorted by ``vehicle_id`` and ``week_start_date``
        with ``week_start_date`` formatted as a ``yyyy-MM-dd`` string and one
        summed column per band feature.
    """
    # (feature name, utilisation column, inclusive lower, exclusive upper)
    bands = [
        ("ft_torque_util_60pct_s", "Torque_Utilization", 0.6, 0.7),
        ("ft_torque_util_70pct_s", "Torque_Utilization", 0.7, 0.8),
        ("ft_torque_util_80pct_s", "Torque_Utilization", 0.8, 0.9),
        ("ft_torque_util_90pct_s", "Torque_Utilization", 0.9, 1),
        ("ft_horsepower_util_50pct_s", "Horsepower_utilization", 0.5, 0.6),
        ("ft_horsepower_util_60pct_s", "Horsepower_utilization", 0.6, 0.7),
        ("ft_horsepower_util_70pct_s", "Horsepower_utilization", 0.7, 0.8),
        ("ft_horsepower_util_80pct_s", "Horsepower_utilization", 0.8, 0.9),
        ("ft_rpm_util_50pct_s", "RPM_Utilization", 0.5, 0.6),
        ("ft_rpm_util_60pct_s", "RPM_Utilization", 0.6, 0.7),
    ]
    feature_names = [name for name, _, _, _ in bands]

    def band_flag(column, low, high):
        # 1 when low <= column < high, otherwise 0.
        in_band = (F.col(column) >= low) & (F.col(column) < high)
        return F.when(in_band, F.lit(1)).otherwise(F.lit(0))

    sdf = sdf_drive.alias("drive") \
        .join(sdf_vehicle.alias("vehicle"), ["vehicle_id"]) \
        .fillna(0)

    # Monday of the row's week in New York local time: the next Monday
    # strictly after the date, minus seven days.
    local_time = F.from_utc_timestamp(F.col("datetime"), "America/New_York")
    sdf = sdf.withColumn(
        "week_start_date", F.date_sub(F.next_day(local_time, 'monday'), 7))

    sdf = sdf.withColumn(
        "Active_horsepower",
        (F.col("eng_load") / 255) * (F.col("max_torque") * F.col("rpm")) / 5252)
    # Horsepower utilization - Active horsepower / Max Horsepower
    sdf = sdf.withColumn(
        "Horsepower_utilization",
        F.col("Active_horsepower") / F.col("max_horsepower"))
    # Torque Utilization - calculated as Engine load / 255
    sdf = sdf.withColumn("Torque_Utilization", F.col("eng_load") / 255)
    # RPM Utilization - RPM / Maximum horsepower rpm
    sdf = sdf.withColumn(
        "RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm"))

    for name, source, low, high in bands:
        sdf = sdf.withColumn(name, band_flag(source, low, high))

    per_row = sdf.select(
        "vehicle_id", "week_start_date", "datetime", *feature_names)

    # Every band indicator is summed. The original applied F.min to the last
    # five features, unlike the first five, which looks like a copy-paste slip.
    weekly = per_row \
        .groupBy("vehicle_id", "week_start_date") \
        .agg(*[F.sum(name).alias(name) for name in feature_names])

    result = weekly \
        .select("vehicle_id", "week_start_date", *feature_names) \
        .sort(F.col("vehicle_id"), F.col("week_start_date")) \
        .withColumn(
            "week_start_date",
            F.date_format(F.col("week_start_date"), "yyyy-MM-dd")) \
        .fillna(0)
    return result
from pyspark.sql.functions import row_number
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pandas

# Local Spark context and the session wrapping it.
sc = SparkContext('local')
spark = SparkSession(sc)

# Load the source CSV (header row, inferred column types) and print it.
df = spark.read.csv("data.csv", header=True, inferSchema=True)
df.show()

# Re-render "Date" from yyyy-MM-dd to dd-MM-yyyy, then expose
# "Input Data" under the name "Output Data".
df = df.select(
    'Input Data',
    date_format(
        unix_timestamp("Date", "yyyy-MM-dd").cast("timestamp"),
        "dd-MM-yyyy",
    ).alias('Date'),
    'Type',
    'Value',
)
df = df.withColumnRenamed('Input Data', 'Output Data')
df.show()

# Projection keeping the "Type" column.
df1 = df.select(['Output Data', 'Date', 'Type'])
df1.show()

# Projection keeping the "Value" column.
df2 = df.select(['Output Data', 'Date', 'Value'])
df2.show()

# Tag every row of df1 with the constant string 'Type' in a new column.
df1 = df1.withColumn('Variable', F.lit('Type'))
df1.show()
def process_log_data(spark, input_data, output_data):
    """Build the users, time and songplays tables from the log data.

    Reads the JSON log files under ``input_data``, keeps only ``NextSong``
    events with a non-null ``userId``, and writes three parquet outputs under
    ``output_data`` (each in ``overwrite`` mode): ``users.parquet``,
    ``time.parquet`` and ``songplays.parquet``. The last two are partitioned
    by year and month. ``songs.parquet`` must already exist under
    ``output_data``, because it is read back for the songplays join.

    spark      : Spark session
    input_data : S3 path for log_data
    output_data: S3 bucket path where tables will be stored in parquet format
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong').filter(df.userId.isNotNull())

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        'gender',
        'level').distinct()

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users.parquet')

    # create timestamp column (epoch seconds, as a string) from the
    # original millisecond `ts` column
    get_timestamp = udf(lambda ts: str(int(int(ts) / 1000)))
    df = df.withColumn('timestamp', get_timestamp(col('ts')))

    # create datetime column (string form of a naive local datetime) from
    # the original millisecond `ts` column
    get_datetime = udf(lambda dt: str(datetime.fromtimestamp(int(dt) / 1000)))
    df = df.withColumn('datetime', get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.select(
        'timestamp',
        hour('datetime').alias('hour'),
        dayofmonth('datetime').alias('day'),
        weekofyear('datetime').alias('week'),
        month('datetime').alias('month'),
        year('datetime').alias('year'),
        date_format('datetime', 'E').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time.parquet')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs.parquet')

    # extract columns from joined song and log datasets to create songplays
    # table. In the time-of-day part the pattern letter must be `mm`
    # (minute-of-hour); `MM` is month-of-year and made the format/parse
    # round trip drop the minutes from start_time.
    ts_Format = 'yyyy/MM/dd HH:mm:ss z'
    # NOTE(review): the join compares song_df.artist_id with the log's
    # `artist` column, which looks like an id-vs-name mismatch - confirm
    # against the songs.parquet schema.
    songplays_table = song_df.join(df, song_df.artist_id == df.artist)\
        .withColumn('songplay_id', monotonically_increasing_id())\
        .withColumn(
            'start_time',
            to_timestamp(
                date_format(
                    (col('ts') / 1000).cast(dataType=TimestampType()),
                    ts_Format),
                ts_Format))\
        .select(
            'songplay_id',
            'start_time',
            'level',
            'song_id',
            'artist_id',
            'userAgent',
            'location',
            col('userId').alias('user_id'),
            col('sessionId').alias('session_id'),
            month(col('start_time')).alias('month'),
            year(col('start_time')).alias('year')
        )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays.parquet')

    print('Ok processing log_data')
# initialize spark-session spark = initialize_spark_session() JDBC_URL = args.jdbc_uri TABLE_SINK = args.table_sink OUTPUT_PATH = args.output + "/dimension_date/" TMP_DIR = args.tmp df = spark.sql("""SELECT * FROM stag_immigration""") # get date range df = df.select("arrival_date").distinct().orderBy("arrival_date") # generate columns df = df.withColumn("year", F.date_format("arrival_date", "y")) \ .withColumn("month", F.date_format("arrival_date", "M")) \ .withColumn("day", F.date_format("arrival_date", "d")) \ .withColumn("month_string", F.date_format("arrival_date", "MMM")) \ .withColumn("day_string", F.date_format("arrival_date", "E")) \ .withColumn("week", F.date_format("arrival_date", "w")) \ .withColumn("day_of_year", F.dayofyear("arrival_date")) \ .withColumn("day_of_week", F.dayofweek("arrival_date")) \ .withColumn("quarter", F.quarter("arrival_date")) # create unique identifier df = df.withColumn("id", F.monotonically_increasing_id() + 1) # select relevant columns df = df.select("id", "arrival_date", "year", "month", "day", "month_string", "day_string", "week", "day_of_year",
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2020/3/27 9:17
"""Demo: write a partitioned Delta table, read it back, and show its history."""
from delta.tables import *
from pyspark.sql.functions import *
# Imported explicitly so SparkSession does not depend on what the star
# imports above happen to re-export.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

# Five rows, each stamped with the current time (to the second) as a string
# and a constant "value" column.
df = spark.range(5)\
    .withColumn("date", f.date_format(f.current_timestamp(), "yyyyMMdd HHmmss"))\
    .withColumn("value", f.lit("1"))

path = "e://test//delta//test"

# Create a Delta table
# df.write.format("delta").save(path)

# Create a partitioned Delta table
df.write.format("delta").partitionBy("date").save(path)

# Read the Delta table
spark.read.format("delta").load(path)

# A version or a timestamp can be given when reading; without either the
# latest version is read.
spark.read.format("delta").option("timestampAsOf", '2020-03-27').load(path)
spark.read.format("delta").option("versionAsOf", 1).load(path)

# The table's versions can be inspected through the Delta API.
# DeltaTable.forPath takes the SparkSession as its first argument.
DeltaTable.forPath(spark, path).history().show()
# COMMAND ---------- spark.sql(""" SELECT * FROM customer_purchases ORDER BY `sum(total_cost)` DESC """)\ .show(5) # COMMAND ---------- from pyspark.sql.functions import date_format, col preppedDataFrame = staticDataFrame\ .na.fill(0)\ .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))\ .coalesce(5) # COMMAND ---------- trainDataFrame = preppedDataFrame\ .where("InvoiceDate < '2011-07-01'") testDataFrame = preppedDataFrame\ .where("InvoiceDate >= '2011-07-01'") # COMMAND ---------- from pyspark.ml.feature import StringIndexer indexer = StringIndexer()\
# Leave report: load approved / in-process leave rows for one year from
# Oracle over JDBC and pivot the leave hours per organisation, by leave type
# and by day of week.
# NOTE(review): this fragment uses the Python 2 `print` statement.

# Raise the log4j root logger threshold to FATAL.
logger = sc._jvm.org.apache.log4j
logger.LogManager.getRootLogger().setLevel(logger.Level.FATAL)
sqlContext = SQLContext(sc)
# Wrap employee_trans as a Spark UDF.
u_employee_trans = udf( employee_trans )
# Load the data
# NOTE(review): the JDBC URLs appear to embed database credentials inline -
# confirm, and consider moving them to configuration.
url_String='jdbc:oracle:thin:apps/[email protected]:1524/TESTDEV'
if (db_location=='PROD'):
    url_String='jdbc:oracle:thin:apps/[email protected]:1524/PROD'
# The sub-query is passed as the JDBC `dbtable`; strYear is interpolated
# directly into the SQL text, so it must come from a trusted source.
tblname="(select * from (select v.legal_entity_id as org_id, org.name as org_name, v.dept_id as dept_id, dept.name as dept_name, v.emp_id as emp_id, emp.emp_name, emp.employee_number as emp_number, v.sub_hours, f.ferial_name, v.leave_date from narl_leave_detail_info_v v, narl_leave_main m, narl_ferial_header f, narl_login_emp_info_hist_v emp, HR_ALL_ORGANIZATION_UNITS org, HR_ALL_ORGANIZATION_UNITS dept where v.leave_id=m.leave_id and m.ferial_code=f.ferial_code and v.emp_id=emp.employee_id and v.legal_entity_id=org.ORGANIZATION_ID and v.dept_id=dept.ORGANIZATION_ID and v.status in ('APPROVE','INPROCESS','PROCESSING','FREE') and TO_CHAR(v.leave_date,'YYYY')='%s') ORDER BY org_id, dept_id, emp_id) tmp" %strYear
df= sqlContext.read.format('jdbc').options(url=url_String, dbtable=tblname).load()
# Values fetched from Oracle come back with upper-case column names, so the
# columns are re-aliased to lower case here (ids and hours are cast to int).
# name_day is the abbreviated weekday ('E') of the leave date; day_month is
# 'Day_' followed by the two-digit day of month.
df = df.select(df.ORG_ID.cast('int').alias('org_id'),df.ORG_NAME.alias('org_name'), df.DEPT_ID.cast('int').alias('dept_id'), df.DEPT_NAME.alias('dept_name'), df.EMP_ID.cast('int').alias('emp_id'), df.EMP_NAME.alias('emp_name'),df.EMP_NUMBER.alias('emp_number'),date_format(df.LEAVE_DATE, 'E').alias('name_day'),concat(lit('Day_'),date_format(df.LEAVE_DATE,'dd')).alias('day_month') ,df.FERIAL_NAME.alias('ferial_name'), df.SUB_HOURS.cast('int').alias('sub_hours'))
# Replace emp_number with the output of the employee_trans UDF, keeping the
# column name.
df = df.withColumn( 'employee_num', u_employee_trans('emp_number') ).drop('emp_number')
df = df.withColumnRenamed("employee_num", "emp_number")
# Mark df for caching; it feeds the aggregation below.
df.cache()
#Load org by WEEK data--start
print 'start Load org by WEEK data>>', datetime.datetime.now()
# First pivot: per (org, weekday), one column per listed ferial_name value
# holding the summed sub_hours.
df_groupBy_org_name_day = df.select('org_id','org_name','name_day','ferial_name', 'sub_hours').groupBy('org_id','org_name','name_day').pivot("ferial_name",['特別休假','加班或假日出差轉補休','生理假','傷病假','婚假', '家庭照顧假','事假', '產檢假','陪產假','產假','喪假','國內公假','國外公假','公傷病假','安胎假']).sum('sub_hours')
df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
# Second pivot: per org, spread each leave-type column across the seven
# weekday abbreviations, summing the hours.
df_groupBy_org_name_day = df_groupBy_org_name_day.select('org_id','org_name','name_day', '特別休假','加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假', '事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假').groupBy('org_id','org_name').pivot("name_day", ['Mon', 'Tue', 'Wed','Thu', 'Fri', 'Sat','Sun']).sum('特別休假', '加班或假日出差轉補休','生理假','傷病假','婚假','家庭照顧假','事假','產檢假', '陪產假','產假','喪假','國內公假','國外公假','公傷病假', '安胎假')
# Combinations with no rows come out of the pivot as null; report them as 0.
df_groupBy_org_name_day=df_groupBy_org_name_day.fillna(0)
df_groupBy_org_name_day=df_groupBy_org_name_day.orderBy(df_groupBy_org_name_day.org_id)