def add_solar_features(df):
    return (
        df
        .withColumn('declination_angle',
                    radians(-23.45 * cos(((2 * pi) / 365) * (dayofyear('date') + 10))))
        .withColumn('diff_local_time_UTC', timezone_from_date('date'))
        .withColumn('d', (2 * pi * dayofyear('date')) / 365)
        .withColumn('equation_of_time',
                    -7.655 * sin(col('d')) + 9.873 * sin(2 * col('d') + 3.588))
        .drop('d')
        .withColumn('time_correction',
                    4 * (col('loc_long') - (15 * col('diff_local_time_UTC'))) + col('equation_of_time'))
        .withColumn('local_solar_hour', col('hour') + 0.5 + col('time_correction') / 60)
        .withColumn('hour_angle', 0.2618 * (col('local_solar_hour') - 12))
        .drop('diff_local_time_UTC', 'equation_of_time', 'time_correction', 'local_solar_hour')
        .withColumn('solar_elevation',
                    degrees(asin(sin('declination_angle') * sin(radians('loc_lat'))
                                 + cos('declination_angle') * cos(radians('loc_lat')) * cos('hour_angle'))))
        .drop('declination_angle', 'hour_angle'))
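# `timezone_from_date` is not defined in this snippet; `add_solar_features` uses it as the
# location's UTC offset in hours for the given date. A minimal placeholder sketch, assuming
# a constant offset (a real implementation would derive the offset from the date and the
# location's timezone):
from pyspark.sql.functions import lit

def timezone_from_date(date_col, utc_offset_hours=0.0):
    # Hypothetical stand-in: returns a constant offset and ignores the date entirely.
    return lit(utc_offset_hours)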
def createDimDate(df):
    '''
    Creates date dimensional table from DateTime field in upstream dataframe
    :param df:
    :return: Date Dimensional Dataframe
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))
    df.createOrReplaceTempView('tempDimDateTable')

    dimDateDF = spark.sql("""
        SELECT * FROM (
            SELECT rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                   dayofweek, dayofyear, hour, minute, quarter
            FROM tempDimDateTable
            GROUP BY rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                     dayofweek, dayofyear, hour, minute, quarter
            ORDER BY dateMinute ASC)
        """)

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)
    # Creating dataframe including date field which will help to generate Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))
    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
def _extract_dim_date(self, imgDF: DataFrame, output_path: str):
    print('Extracting dim_date from immigration data...')
    dateCols = [col for col in imgDF.columns if col.endswith('_date')]
    dates: DataFrame = None
    for dateCol in dateCols:
        dt = imgDF \
            .where(F.col(dateCol).isNotNull()) \
            .select(
                F.col(dateCol).alias('date'),
                F.year(F.col(dateCol)).cast('smallint').alias('year'),
                F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'),
                F.month(F.col(dateCol)).cast('smallint').alias('month'),
                F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'),
                F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'),
                F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'),
                F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year')
            )
        if dates:
            dates = dates.union(dt).distinct()
        else:
            dates = dt
    print('Saving dim_date')
    self._write_data(dates, output_path, 'date')
    print('Finished saving dim_date')
    print('Finished extracting dim_date from immigration data.')
def process_dim_time(spark):
    """
    Creates the dimension time and stores it to stage 2.
    Creates a time sequence with a daily interval.
    spark: spark session
    """
    # create a daily time sequence starting at 2019-10-01 ending now()
    # src: https://stackoverflow.com/questions/43141671/
    #      sparksql-on-pyspark-how-to-generate-time-series
    df_dim_time = spark.sql("""SELECT sequence(
            to_date('2019-10-01'), now(), interval 1 day) as date""") \
        .withColumn("date", explode(col("date")))

    # create additional columns e.g. weekofyear
    df_dim_time = df_dim_time.withColumn("month", F.month("date")) \
        .withColumn("year", F.year("date")) \
        .withColumn("dayofmonth", F.dayofmonth("date")) \
        .withColumn("dayofyear", F.dayofyear("date")) \
        .withColumn("weekofyear", F.weekofyear("date"))

    # write as dimTime to stage2
    df_dim_time.write.mode('overwrite').parquet(folder_s2 + 'dimTime.parquet')
    return df_dim_time
def get_dayofyear_avg_v(self):
    data = self.data
    day = data.withColumn('day', functions.dayofyear(data['timestamp']))
    day_avg = day.groupby('day').avg('Volume BTC')
    result = day_avg.select('day', 'avg(Volume BTC)').orderBy('day')
    result.cache()
    return get_label_value(result.collect())
def generate_calendar(start_date, end_date):
    try:
        import pandas as pd
        df = pd.DataFrame({'DATE': pd.date_range(start_date, end_date)})

        from pyspark.sql.functions import year, month, date_format, expr, dayofyear, dayofmonth
        df_spark = spark.createDataFrame(df)
        df_spark = df_spark.withColumn('DATE_ID', expr("REPLACE(DATE,'-','') DATE"))
        df_spark = df_spark.withColumn('DATE_ID', expr("CAST(REPLACE(CAST(DATE AS DATE),'-','') AS INTEGER)")) \
            .withColumn('DATE_COL', expr("CAST(DATE AS DATE)")) \
            .withColumn('YEAR', year('DATE')) \
            .withColumn('MONTH', month('DATE')) \
            .withColumn('DAY', dayofmonth('DATE')) \
            .withColumn('DAY_OF_MONTH', dayofmonth('DATE')) \
            .withColumn('DAY_OF_YEAR', dayofyear('DATE')) \
            .withColumn('DAY_NAME', date_format('DATE', 'E')) \
            .drop('DATE')
        return df_spark
    except Exception as e:
        logging.error('Failed to generate CALENDAR data')
        raise Exception(f'Failed to generate CALENDAR data,{e}')
def transform_i94_data(self):
    # Read the Immigration dataset in SAS format
    i94_df = self.spark.read.format('com.github.saurfang.sas.spark') \
        .load(self.load_path + config['LOAD']['I94_DATA'])

    # Drop the columns that are not required for analysis
    i94_drop_cols = ['count', 'visapost', 'occup', 'matflag', 'biryear', 'insnum']
    i94_df = i94_df.drop(*i94_drop_cols)

    # Convert the numeric columns to Integer datatype
    int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate',
                'i94mode', 'depdate', 'i94bir', 'i94visa']
    i94_df = fn.cast_type(i94_df, dict(zip(int_cols, len(int_cols) * [IntegerType()])))

    # Convert the SAS date format to String format
    date_cols = ['arrdate', 'depdate']
    for c in date_cols:
        i94_df = i94_df.withColumn(c, fn.sas_to_date_udf(i94_df[c]))

    # Create dates dataframe to be stored as a separate file
    arrdate = i94_df.select('arrdate').distinct()
    depdate = i94_df.select('depdate').distinct()
    date_df = arrdate.union(depdate)
    date_df = date_df.withColumnRenamed('arrdate', 'date')
    date_df = date_df.withColumn('year', F.year(date_df.date)) \
        .withColumn('month', F.month(date_df.date)) \
        .withColumn('day', F.dayofmonth(date_df.date)) \
        .withColumn('dayofweek', F.dayofweek(date_df.date)) \
        .withColumn('dayofyear', F.dayofyear(date_df.date))

    # Write the dates dataframe to a file in S3 in parquet format
    date_df.write.mode('overwrite').parquet(self.save_path + date_save_path)

    # Write the I94 dataframe to a file in S3 in parquet format partitioned by year and month
    i94_df.write.partitionBy('i94yr', 'i94mon').mode('overwrite') \
        .parquet(self.save_path + immigration_save_path)
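# `fn.sas_to_date_udf` above is a project helper; the snippet only tells us it turns the SAS
# numeric date columns into date strings. A hedged sketch, assuming the usual SAS convention
# of days counted from 1960-01-01 (the real helper may differ in naming, formats, or null
# handling):
from datetime import datetime, timedelta
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

sas_to_date_udf = udf(
    lambda days: (datetime(1960, 1, 1) + timedelta(days=int(days))).strftime('%Y-%m-%d')
    if days is not None else None,
    StringType())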
def main_A(inputs):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    df = data.withColumn('day_of_year', fn.dayofyear('date'))
    df = df.withColumn('year', fn.year('date'))
    df_long_lat = df[['station', 'longitude', 'latitude', 'tmax', 'year']].toPandas()
    count_year = df_long_lat['year'].value_counts().to_dict()

    # SELECT YEAR and DURATION
    YEAR_SELECTED = 2000
    YEAR_DURATION = 20
    df_long_lat = df_long_lat.loc[(df_long_lat['year'] > YEAR_SELECTED) &
                                  (df_long_lat['year'] < YEAR_SELECTED + YEAR_DURATION)]

    # UNCLUSTER plot by finding avg temperature (groupby same station and year)
    df_long_lat['avg_temp'] = df_long_lat.groupby(['station', 'year'])['tmax'].transform('mean')
    df_long_lat.drop_duplicates(subset=['station', 'year'], inplace=True)
    print(df_long_lat)

    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    geometry = [Point(xy) for xy in zip(df_long_lat['longitude'], df_long_lat['latitude'])]
    df_long_lat = df_long_lat.drop(['longitude', 'latitude'], axis=1)
    crs = {'init': 'epsg:4326'}
    gdf = GeoDataFrame(df_long_lat, crs=crs, geometry=geometry)
    base = world.plot(color='white', edgecolor='black', figsize=(20, 12))
    gdf.plot(column='avg_temp', ax=base, marker='o', cmap='jet', markersize=15,
             legend=True,
             legend_kwds={'label': "Temperature in Celcius",
                          'orientation': "horizontal"})
    plt.title('Distribution of Temperature between ' + str(YEAR_SELECTED) +
              " and " + str(YEAR_SELECTED + YEAR_DURATION))
    plt.savefig(inputs + "_" + str(YEAR_SELECTED) + "-" + str(YEAR_SELECTED + YEAR_DURATION))
def process_trip_data(spark, input_data, output_data):
    trip_data = os.path.join(input_data, 'trip_data/*.csv')
    df = spark.read.csv(trip_data, header=True)
    df = df.where(col("start_station_id").isNotNull()) \
        .where(col("end_station_id").isNotNull()) \
        .select(['duration_sec', 'start_time', 'end_time', 'start_station_id',
                 'end_station_id', 'bike_id', 'user_type'])
    df.show(10)

    # Create start_time_table
    start_time_table = df.select(["start_time"]) \
        .withColumn("start_year", year("start_time")) \
        .withColumn("start_month", month("start_time")) \
        .withColumn("start_dayofyear", dayofyear("start_time")) \
        .withColumn("start_dayofmonth", dayofmonth("start_time")) \
        .withColumn("start_week", weekofyear("start_time")) \
        .withColumn("start_dayofweek", dayofweek("start_time")) \
        .withColumn("start_hour", hour("start_time"))
    start_time_table.show(5)

    # Create end_time_table
    end_time_table = df.select(["end_time"]) \
        .withColumn("end_year", year("end_time")) \
        .withColumn("end_month", month("end_time")) \
        .withColumn("end_dayofyear", dayofyear("end_time")) \
        .withColumn("end_dayofmonth", dayofmonth("end_time")) \
        .withColumn("end_week", weekofyear("end_time")) \
        .withColumn("end_dayofweek", dayofweek("end_time")) \
        .withColumn("end_hour", hour("end_time"))
    end_time_table.show(5)

    # Write trip table
    df.write.partitionBy(['start_station_id']).parquet(os.path.join(output_data, 'trips'), 'overwrite')
    # Write start time table
    start_time_table.write.partitionBy(['start_year', 'start_month']) \
        .parquet(os.path.join(output_data, 'start_time'), 'overwrite')
    # Write end time table
    end_time_table.write.partitionBy(['end_year', 'end_month']) \
        .parquet(os.path.join(output_data, 'end_time'), 'overwrite')
def add_date_features(samples):
    samples = add_cyclic_feature(samples, dayofmonth("date"), "dayofmonth", 31)
    samples = add_cyclic_feature(samples, dayofyear("date"), "dayofyear", 366)
    samples = samples.withColumn("dayofweek", dayofweek("date"))
    samples = samples.withColumn("hour2", col("hour"))
    samples = add_cyclic_feature(samples, col("hour"), "hour", 24)
    samples = samples.withColumnRenamed("hour2", "hour")
    return samples
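# `add_cyclic_feature` is referenced above but not shown. A plausible minimal sketch of a
# sin/cos cyclic encoding with the same signature as the calls above (the real helper may
# differ in naming and in how it scales the period):
import math
from pyspark.sql.functions import sin as sin_, cos as cos_

def add_cyclic_feature(samples, value_col, name, period):
    # Map the periodic value onto the unit circle so that the largest and smallest
    # values (e.g. day 366 and day 1) end up close together in feature space.
    angle = 2 * math.pi * value_col / period
    return (samples
            .withColumn(name + '_sin', sin_(angle))
            .withColumn(name + '_cos', cos_(angle)))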
def num_days(self) -> int:
    """Number of distinct days in the ezlink."""
    if self._num_days:
        return self._num_days
    self._num_days = (self.dataframe
                      .select(self.year, F.dayofyear(self.tap_in_time).alias('day'))
                      .distinct()
                      .count())
    return self._num_days
def _transform(self, df):
    input_col = self.getInputCol()
    df = df.withColumn("dt_day", F.dayofmonth(input_col))
    df = df.withColumn("dt_hour", F.hour(input_col))
    df = df.withColumn("dt_minute", F.minute(input_col))
    df = df.withColumn("dt_second", F.second(input_col))
    df = df.withColumn("dt_dayofyear", F.dayofyear(input_col))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input_col))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input_col))
    return df
def main(inputs, model_file):
    data = spark.read.option('encoding', 'UTF-8').csv(inputs, schema=tmax_schema)

    ################ FEATURE ENGINEERING: add yesterday tmax #####################
    if USE_YTD_TEMP_FEATURE:
        syntax = """SELECT today.latitude, today.longitude, today.elevation, today.date,
                           today.tmax, yesterday.tmax AS yesterday_tmax
                    FROM __THIS__ as today
                    INNER JOIN __THIS__ as yesterday
                        ON date_sub(today.date, 1) = yesterday.date
                        AND today.station = yesterday.station"""
        sql_trans = SQLTransformer(statement=syntax)
        data = sql_trans.transform(data)
    #############################################################################

    data = data.withColumn('day_of_year', fn.dayofyear('date'))
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    if USE_YTD_TEMP_FEATURE:
        train_feature_assembler = VectorAssembler(
            inputCols=['yesterday_tmax', 'day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')
    else:
        train_feature_assembler = VectorAssembler(
            inputCols=['day_of_year', 'latitude', 'longitude', 'elevation'],
            outputCol='features')

    ############# DIFFERENT ML ALGORITHMS TO BE USED ####################
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax')
    # classifier = GBTRegressor(maxDepth=5, featuresCol='features', labelCol='tmax')
    classifier = RandomForestRegressor(numTrees=7, maxDepth=8,
                                       featuresCol='features', labelCol='tmax')
    #####################################################################

    train_pipeline = Pipeline(stages=[train_feature_assembler, classifier])
    weather_model = train_pipeline.fit(train)

    prediction = weather_model.transform(validation)
    # print(prediction.show())
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol='tmax',
                                    metricName='r2')  # rmse
    score = evaluator.evaluate(prediction)
    print('Validation score for weather model: %g' % (score, ))

    weather_model.write().overwrite().save(model_file)
def extract_date(df):
    # df = df.na.fill('NORMAL', subset=['types'])
    df = df.dropna(how='all')
    df = (df
          .withColumn('Yearday', F.dayofyear(F.col("SalDate")))
          .withColumn('Month', F.month(F.col('SalDate')))
          .withColumn('DayofWeek', F.dayofweek(F.col('SalDate')))
          .withColumn('Year', F.year(F.col('SalDate')))
          .withColumn('Quarter', F.quarter(F.col('SalDate')))
          .withColumn('WeekOfYear', F.weekofyear(F.col('SalDate')))
          .withColumn('MonthQuarter',
                      F.when((df['DayofMonth'] <= 8), 0)
                       .otherwise(F.when((df['DayofMonth'] <= 16), 1)
                                   .otherwise(F.when((df['DayofMonth'] <= 24), 2)
                                               .otherwise(3)))))
    df = df.cache()
    return df
def compare_calories_across_week(self, start_date, days_through_week):
    """
    This method compares calories burned between the start day's week and the
    following week.

    :param start_date: %Y-%m-%d string, date to base comparison on.
    :param days_through_week: int, days into the current week to compare against.
        There will always be 7 days of data for the completed week, but when the
        current week is in progress, we need to tell the method how many days'
        worth of data to compare so that it is a 1:1 comparison.
        mon: 1, tues: 2, weds: 3, thurs: 4, fri: 5, sat: 6
        0 is not a valid input, because there are no completed days of calories to
        compare against before a day in that week ends, i.e. there is no total
        Sunday data to compare against the previous Sunday until it's Monday.
    :return: tuple containing the summed calories for the week following start_date's
        week and for the week of start_date.
    """
    spark_session = create_spark_session()
    df = get_df_from_db(database=self.config['database'],
                        schema='activity',
                        table='calories',
                        spark_session=spark_session)
    df = df.withColumn('day_of_year', F.dayofyear(df.date))

    start_date_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    first_week_df = df.where(
        df.date.between(
            start_date,
            start_date_datetime + timedelta(days_through_week - 1)))
    next_week_df = df.where(
        df.date.between(
            start_date_datetime + timedelta(days=7),
            start_date_datetime + timedelta(days=7) + timedelta(days=days_through_week - 1)))

    first_week_total = first_week_df.select(F.sum(first_week_df.total)).collect()[0][0]
    next_week_total = next_week_df.select(F.sum(next_week_df.total)).collect()[0][0]

    print("You had burned {} calories at this point during the week of {}, "
          "compared to {} the following week."
          .format(first_week_total, start_date, next_week_total))
    return next_week_total, first_week_total
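# Hypothetical usage, assuming an instance named `tracker` with a populated config:
# compare Monday through Wednesday of the week starting 2021-03-01 against the same
# three days of the following week.
next_week_total, first_week_total = tracker.compare_calories_across_week('2021-03-01', 3)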
def process_log_data(spark, input_data, output_data):
    """
    Description: This function can be used to process log data to get the tables
        we want and save these tables to files.

    Arguments:
        spark: the spark session object.
        input_data: input data path
        output_data: output data path

    Returns:
        None
    """
    staging_logs = spark.read.json(input_data, schema=staging_events_schema)
    print("create tempTable staging_events")
    staging_logs.registerTempTable("staging_events")

    users_table = spark.sql(select_users)
    users_table.registerTempTable("users")
    print("create users")
    users_table.write.parquet('./output/users/')
    print_show(users_table)

    time_ = spark.sql(select_time)
    time_table = time_.select('start_time',
                              hour('start_time').alias('hour'),
                              dayofyear('start_time').alias('day'),
                              weekofyear('start_time').alias('week'),
                              month('start_time').alias('month'),
                              year('start_time').alias('year'),
                              dayofweek('start_time').alias('weekday'))
    time_table.write.partitionBy(['year', 'month']).parquet('./output/time/')
    print("create time")
    print_show(time_table)

    songplays_table = spark.sql(select_songplays)
    songplays_table = songplays_table.withColumn('year', year(songplays_table.start_time))
    songplays_table = songplays_table.withColumn('month', month(songplays_table.start_time))
    songplays_table.write.partitionBy(['year', 'month']).parquet('./output/songplays/')
    print("read song_plays")
    print_show(songplays_table)
def log_emission_summary(emis):
    # Add ME and AE
    emis = emis \
        .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae')) \
        .withColumn('nox', col('nox_me') + col('nox_ae')) \
        .withColumn('sox', col('sox_me') + col('sox_ae')) \
        .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log totals
    all = emis.agg(
        sum(col('trans_p')).alias('total_trans_p'),
        sum(col('trans_p_me')).alias('total_trans_p_me'),
        sum(col('trans_p_ae')).alias('total_trans_p_ae'),
        sum(col('nox')).alias('total_nox'),
        sum(col('sox')).alias('total_sox'),
        sum(col('co2')).alias('total_co2'),
        sum(col('nox_me')).alias('total_nox_me'),
        sum(col('sox_me')).alias('total_sox_me'),
        sum(col('co2_me')).alias('total_co2_me'),
        sum(col('nox_ae')).alias('total_nox_ae'),
        sum(col('sox_ae')).alias('total_sox_ae'),
        sum(col('co2_ae')).alias('total_co2_ae')).toPandas()
    mlflow.log_metrics(all.iloc[0].to_dict())

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis \
        .withColumn('day', dayofyear(col('time'))) \
        .withColumn('week', weekofyear(col('time'))) \
        .withColumn('month', month(col('time'))) \
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')
    prt_high("Generated summary. Logging it.")

    # Log everything
    log_dataframe_metric(day_df, 'day_day')
    log_dataframe_metric(week_df, 'week_week')
    log_dataframe_metric(month_df, 'month_month')
    log_dataframe_metric(dayofweek_df, 'dayofweek_dayofweek')
def create_features_from_transaction_timestamp(data):
    """Extract features from the transaction timestamp. New features are created based on it.

    Features:
        - TransactionHour: hour of the transaction
        - TransactionDayOfWeek: day of week of the transaction
        - TransactionDayOfYear: day of year of the transaction
        - TransactionWeekOfYear: week of year of the transaction
        - WeekAction: 1 if the transaction falls in weeks 50-52, else 0

    Args:
        data (spark dataframe): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_features_from_transaction_timestamp.__module__,
        create_features_from_transaction_timestamp.__name__))

    data = data.withColumn('TransactionHour',
                           hour(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionDayOfWeek',
                           dayofweek(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionDayOfYear',
                           dayofyear(data[config.feature_column_timestamp]))
    data = data.withColumn('TransactionWeekOfYear',
                           weekofyear(data[config.feature_column_timestamp]))
    data = data.withColumn(
        'WeekAction',
        when(col('TransactionWeekOfYear').between(50, 52), 1).otherwise(0))

    update_list_features("numerical", [
        'TransactionHour', 'TransactionDayOfWeek', 'TransactionDayOfYear',
        'TransactionWeekOfYear', 'WeekAction'
    ])
    return data
def add_date(
    df: DataFrame,
    date: str
) -> DataFrame:
    """
    Appends 'year', 'month', 'day' and 'dayofyear' columns. The month, day and
    dayofyear values are shifted to be zero-based: 0 for january, 1 for february,
    2 for march, ..., 11 for december, and likewise 0 for the first day of the
    month and of the year.

    Attributes:
    -----------
    df: pyspark.sql.DataFrame
        DataFrame of interest
    date: String
        Column with DateType

    :return: pyspark.sql.DataFrame
    """
    return df.select(
        df["*"],
        year(date).alias('year'),
        (month(date) - 1).alias('month'),
        (dayofmonth(date) - 1).alias('day'),
        (dayofyear(date) - 1).alias('dayofyear')
    )
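# Hedged usage example (assumes a SparkSession named `spark`): with the -1 shifts above,
# January 1st comes back as month 0, day 0, dayofyear 0.
from datetime import date

demo = spark.createDataFrame([(date(2021, 1, 1),)], ['event_date'])
add_date(demo, 'event_date').show()
# +----------+----+-----+---+---------+
# |event_date|year|month|day|dayofyear|
# +----------+----+-----+---+---------+
# |2021-01-01|2021|    0|  0|        0|
# +----------+----+-----+---+---------+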
def test_dayofyear(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a'))))
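# A standalone illustration of the expression the test above exercises, without the
# spark-rapids harness (assumes a SparkSession named `spark`):
from pyspark.sql import functions as f

spark.createDataFrame([('2020-03-01',)], ['a']) \
    .select(f.dayofyear(f.to_date('a')).alias('doy')) \
    .show()  # 2020 is a leap year, so 2020-03-01 is day 61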
-9, 0)  # timestamp of each interval is the superior limit by guidelines
df_time_ma = df_time.select("asset", "variable", "timestamp",
                            avg("value").over(win).alias("ma")).cache()

# COMMAND ----------

# visualize that moving average performed well
import matplotlib.pyplot as plt
from pyspark.sql.functions import dayofyear, hour

the_asset = df_time_ma.select('asset').distinct().take(1)[0].asset
the_variable = df_time_ma.select('variable').distinct().take(1)[0].variable
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on javalist
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exports to pandas for visualization and export in CSV format,
# later on the pyspark dataframe is exported in CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()

test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()
def process_log_data(spark, input_data, output_data):
    '''
    Transform log data into the users, time and songplays tables
    '''
    # get filepath to log data file
    log_data = f'{input_data}log_data/*/*/*.json'
    # log_data = f'{input_data}log_data/2018/11/*.json'  # subset

    # read log data file
    df = spark.read.json(log_data)
    print(f'Read log data {log_data}')
    df.printSchema()

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_table = df.select('ts', 'userId', 'gender', 'level',
                            col('firstName').alias('first_name'),
                            col('lastName').alias('last_name')) \
        .drop_duplicates(subset=['userId'])

    # write users table to parquet files
    print(f'Save {output_data}users_table')
    users_table.show(5)
    users_table.write.parquet(f'{output_data}users_table', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    time_df = df.select('ts').drop_duplicates().withColumn('timestamp', get_timestamp(df.ts))

    # extract columns to create time table
    time_table = time_df.select('ts', 'timestamp').drop_duplicates() \
        .withColumn('hour', hour(time_df.timestamp)) \
        .withColumn('day', dayofyear(time_df.timestamp)) \
        .withColumn('week', weekofyear(time_df.timestamp)) \
        .withColumn('month', month(time_df.timestamp)) \
        .withColumn('year', year(time_df.timestamp)) \
        .withColumn('weekday', dayofweek(time_df.timestamp)) \
        .withColumn('start_time', time_df.timestamp) \
        .drop_duplicates(subset=['start_time'])

    # write time table to parquet files partitioned by year and month
    print(f'Save {output_data}time_table')
    time_table.show(5)
    time_table.write.parquet(f'{output_data}time_table', mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_data = f'{input_data}song_data/*/*/*/*.json'
    # song_data = f'{input_data}song_data/A/A/A/*.json'  # subset
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, [df.song == song_df.title,
                                        df.artist == song_df.artist_name,
                                        df.length == song_df.duration], how='inner') \
        .select(monotonically_increasing_id().alias("songplay_id"), "ts", "level",
                "song_id", "artist_id", "location",
                col("sessionId").alias("session_id"),
                col("userId").alias("user_id"),
                col("userAgent").alias("user_agent")) \
        .join(time_table, df.ts == time_table.ts, how="inner") \
        .select("songplay_id", "start_time", "user_id", "level", "song_id",
                "artist_id", "session_id", "location", "user_agent", "year", "month") \
        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    print(f'Save {output_data}songplays_table')
    songplays_table.show(5)
    songplays_table.write.parquet(f'{output_data}songplays_table', mode='overwrite',
                                  partitionBy=["year", "month"])
from pyspark.ml.linalg import Vectors, MatrixUDT, VectorUDT, DenseMatrix, DenseVector
import time
import math
import pandas as pd

# df = sqlContext.read.csv("/datasets/district11.csv", header='true')
df = sqlContext.read.csv("/datasets/crimes.csv", header='true')

start = time.time()

# Define date derivatives
df = (df
      .withColumn('Timestamps', F.to_timestamp("Date", 'MM/dd/yyyy hh:mm:ss a'))
      .withColumn('Day', F.to_date("Date", 'MM/dd/yyyy hh:mm:ss a'))
      .withColumn("Month", F.month("Day"))
      .withColumn("Hour", F.hour("Timestamps"))
      .withColumn("DayOfYear", F.dayofyear("Day"))
      .withColumn("DayOfWeek", F.dayofweek("Day")))

cols = ["ID", "Day", "Year", "Month", "Hour", "DayOfYear", "DayOfWeek", "District", "Primary Type"]
df = df.select(*cols)

# Rename Primary Types with less than 1% share to OTHER CRIMES
def least_frequent_columns(df, threshold=0.01):
    res = df.groupBy("Primary Type").count() \
        .withColumn('tot', F.lit(df.count())) \
        .withColumn('frac', F.expr('count/tot')) \
        .filter('frac<' + str(threshold)) \
# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()
taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))

# Time features
date_df = taxi_df.select(taxi_df.Time).distinct()
weekday_udf = udf(lambda date_time: date_time.weekday(), IntegerType())
is_holiday_udf = udf(lambda date_time: date_time.date() in holidays.UnitedStates(), BooleanType())
date_df = date_df.withColumn('Hour', func.hour(date_df.Time))
date_df = date_df.withColumn('Day_Of_Week', weekday_udf(date_df.Time))
date_df = date_df.withColumn('Day_Of_Year', func.dayofyear(date_df.Time))
date_df = date_df.withColumn('Is_Holiday', is_holiday_udf(date_df.Time))

# Aggregate events happening in last and next 3 hours for each hour
event_3h_df = event_df.withColumnRenamed('Venues', 'Venues_0h')
for i in range(-3, 4):
    if i != 0:
        add_hours_udf = udf(lambda date_time: date_time + datetime.timedelta(hours=i),
                            TimestampType())
        event_3h_df = event_3h_df.join(
            event_df.withColumn('Time', add_hours_udf(event_df.Time))
                    .withColumnRenamed('Venues', 'Venues_%sh' % str(i)),
            'Time')

# Join single feature groups
features_df = taxi_df.select(index_columns + [taxi_df.Pickup_Count]) \
    .join(taxi_dis_1h_df, index_columns) \
    .join(taxi_dis_4h_df, index_columns) \
def _transform(self, df):
    self.check_input_type(df.schema)
    return df.withColumn(self.outputCol, F.dayofyear(df[self.inputCol]))
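# The _transform above presumably belongs to a pyspark.ml Transformer subclass. A minimal
# sketch of such a wrapper (class name and constructor are assumptions; the original class
# also performs input-type checking via check_input_type, which is omitted here):
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql import functions as F

class DayOfYearExtractor(Transformer, HasInputCol, HasOutputCol):
    """Appends dayofyear(inputCol) to the dataframe as outputCol."""

    def __init__(self, inputCol=None, outputCol='dayofyear'):
        super().__init__()
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        return df.withColumn(self.getOutputCol(), F.dayofyear(df[self.getInputCol()]))

# e.g. DayOfYearExtractor(inputCol='date', outputCol='day_of_year').transform(df)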
def extract_datetime_info(self, datetime_col, info_to_extract):
    self._data_frame = self._data_frame.withColumn(
        datetime_col + '_temp', self.to_date_(datetime_col))
    timestamped = datetime_col + "_timestamped"
    # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
    uniqueVals = self._data_frame.select(
        datetime_col + '_temp').distinct().na.drop().limit(10).collect()

    month_dict = {1: "January", 2: "February", 3: "March", 4: "April",
                  5: "May", 6: "June", 7: "July", 8: "August",
                  9: "September", 10: "October", 11: "November", 12: "December"}

    def _extract(df):
        # Shared extraction logic, applied both after an explicit string-to-date
        # conversion (try branch) and directly on the temp column (fallback branch).
        ts = to_timestamp(df[datetime_col + '_temp'], "dd/MM/yyyy")
        if info_to_extract == "year":
            df = df.withColumn(datetime_col + "_year", year(ts))
        if info_to_extract == "month_of_year":
            df = df.withColumn(datetime_col + "_month", month(ts))
            df = df.withColumn(datetime_col + "_etf_month_of_year",
                               self.month_to_string(month_dict)(col(datetime_col + "_month")))
        if info_to_extract == "day_of_month":
            df = df.withColumn(datetime_col + "_day_of_month", dayofmonth(ts))
        if info_to_extract == "day_of_year":
            df = df.withColumn(datetime_col + "_day_of_year", dayofyear(ts))
        if info_to_extract == "day_of_week":
            df = df.withColumn(datetime_col + "_etf_day_of_week",
                               dayofweek(datetime_col + '_temp'))
        if info_to_extract == "week_of_year":
            df = df.withColumn(datetime_col + "_week_of_year", weekofyear(ts))
        if info_to_extract == "hour":
            df = df.withColumn(datetime_col + "_hour", hour(ts))
        if info_to_extract == "minute":
            df = df.withColumn(datetime_col + "_minute", minute(ts))
        if info_to_extract == "date":
            df = df.withColumn(datetime_col + "_date", ts.cast("date"))
        return df

    try:
        date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
        to_date_udf = udf(
            lambda x: datetime.strptime(x, date_format) if x is not None else x,
            DateType())
        self._data_frame = self._data_frame.withColumn(
            datetime_col + '_temp',
            to_date_udf(self._data_frame[datetime_col + '_temp']).alias(datetime_col + '_temp'))
        self._data_frame = _extract(self._data_frame)
    except TypeError:
        self._data_frame = _extract(self._data_frame)

    self._data_frame = self._data_frame.drop(datetime_col + '_temp')
    # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy"))
    # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col + '_temp']), "dd/MM/yyyy"))
    return self._data_frame
from pyspark.sql.functions import countDistinct, avg, stddev

df.select(countDistinct("Sales")).show()                          # count distinct
df.select(countDistinct("Sales").alias("Distinct Sales")).show()  # change alias name
df.select(avg('Sales')).show()                                    # average
df.select(stddev("Sales")).show()                                 # stddev

## DATETIME
# --------------------------------------------------------
from pyspark.sql.functions import (format_number, dayofmonth, hour, dayofyear,
                                   month, year, weekofyear, date_format)

df.select(dayofmonth(df['Date'])).show()  # day of month
df.select(hour(df['Date'])).show()        # hour
df.select(dayofyear(df['Date'])).show()   # day of year
df.select(month(df['Date'])).show()       # month
df.select(year(df['Date'])).show()        # year

## 2) USING RDD (Resilient Distributed Dataset)
# Spark is slowly transitioning to the Spark DataFrame API, but it is still good to learn
# the original parsing in RDDs, especially when the data is not DataFrame-shaped.
# --------------------------------------------------------
from pyspark import SparkConf, SparkContext
# get sourcefile name from input_file_name()
df = df.withColumn("path", fun.input_file_name())
regex_str = "[\/]([^\/]+[^\/]+)$"  # regex to extract text after the last / or \
df = df.withColumn("sourcefile", fun.regexp_extract("path", regex_str, 1))
df.show()

#######################################################################
# handle dates and times
df = df.withColumn('timestamp', fun.to_date("timestamp"))
df.show(2)

# now we should be able to convert or extract date features from timestamp
df.withColumn('dayofmonth', fun.dayofmonth("timestamp")).show(2)
df.withColumn('month', fun.month("timestamp")).show(2)
df.withColumn('year', fun.year("timestamp")).show(2)
df.withColumn('dayofyear', fun.dayofyear("timestamp")).show(2)

# calculate the difference from the current date ('days_ago')
df.withColumn('days_ago', fun.datediff(fun.current_date(), "timestamp")).show()

########################################################################
# group_by
# summarize within group data
df.groupBy("sourcefile").count().show(99)
df.groupBy("sourcefile").min('open').show(99)
df.groupBy("sourcefile").mean('open').show(99)
df.groupBy("sourcefile").max('open', 'close').show(99)

########################################################################
# window functions
from pyspark.sql.window import Window
def process_log_data(spark, input_data, output_data): """ This function is to read the log data in the filepath (bucket/log_data) to get the info. to populate the users, time and song tables. Args: ------------------------------ spark: the cursor object input_data: the path to the bucket containing song data output_data: the path where the parquet files stored Returns: None """ # get filepath to log data file log_data = f'{input_data}/log_data/*/*/*.json' # read log data file df = spark.read.json(log_data) print("Success of reading log_data from S3") # filter by actions for song plays df = df.filter(df['page'] == 'NextSong') # extract columns for users table # users table: user_id, first_name, last_name, gender, level user_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates() # write users table to parquet files user_table.write.parquet(f'{output_data}/user_table', mode='overwrite') print('Success of writing user_table to parquet') # convert ts column to timestamp df = df.withColumn('start_time', F.from_unixtime(F.cols('ts') / 1000)) # extract columns to create time table # time table: start_time, hour, day, week, month, year, weekday time_table = df.select('ts', 'start_time') \ .withColumn('year', F.year('start_time')) \ .withColumn('month', F.month('start_time')) \ .withColumn('week', F.weekofyear('start_time')) \ .withColumn('weekday', F.dayofweek('start_time')) \ .withColumn('day', F.dayofyear('start_time')) \ .withColumn('hour', F.hour('start_time')).dropDuplicates() print('Success of extracting time column') # write time table to parquet files partitioned by year and month time_table.write.parquet(f'{output_table}/time_table', mode='overwrite', partitionBy=['year', 'month']) print('Success of writing time_table to parquet') # read in song data to use for songplays table song_data = f'{input_data}/song_data/A/A/A/*.json' song_dataset = spark.read.json(song_data) print('Success of reading song_dataset from S3') # create temporary view of song_dataset, time_table and log_dataset song_dataset.createOrReplaceTempView('song_dataset') time_table.createOrReplaceTempView('time_table') df.createOrReplaceTempView('log_dataset') # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql("""SELECT DISTINCT FROM song_dataset s JOIN log_dataset l ON s.title = l.song AND s.duration = l.length AND s.artist_name = l.artist JOIN time_table t ON t.ts = l.ts""").dropDuplicates() # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(f'{output_data}/songplays_table', mode='overwrite', partitionBy=['year', 'month'])