def task_a_3_step_3_final(spark):
    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "popular-topics-by-country_step-2") \
        .parse_json(a3_struct_common) \
        .withWatermark("timetamp_start", "1 minute") \
        .groupBy(
            "timetamp_start", "timetamp_end"
        ).agg(
            F.collect_list(
                F.create_map([
                    "country_name",
                    F.create_map(["topic_name_exp", "topic_sum"])
                ])
            ).alias("statistics")
        ).select(
            F.struct(
                F.concat(F.hour('timetamp_start'), lit(":"), F.minute('timetamp_start')).alias("time_start"),
                F.concat(F.hour('timetamp_end'), lit(":"), F.minute('timetamp_end')).alias("time_end"),
                col('statistics')
            ).alias("res")
        ).send_to_kafka(config.BOOTSTRAP_SERVERS, "popular-topics-by-country", config.LOG_PREFIX)
    return result
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])
    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0") \
        .parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute") \
        .groupBy(
            F.window("datetime_end", "3 hour", "1 hour")
        ).agg(
            F.first("window.start").alias("timestamp_start"),
            F.first("window.end").alias("timestamp_end"),
            F.collect_list("map_topics").alias("statistics")
        ).select(
            F.struct(
                F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
                F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
                concat_maps_udf(col('statistics')).alias("statistics")
            ).alias("res")
        ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)
    return result
def __appendAggKey(tsdf, freq=None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if (freq == SEC):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(" "),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lpad(min_col, 2, '0'), f.lit(':'),
                           f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif (freq == MIN):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lpad(min_col, 2, '0'), f.lit(':'),
                           f.lit('00')).cast("timestamp")
    elif (freq == HR):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lit('00'), f.lit(':'),
                           f.lit('00')).cast("timestamp")
    elif (freq == DAY):
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)
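# Standalone sketch (not part of tempo): the concat/lpad logic above truncates a
# timestamp to the chosen frequency; on Spark 2.3+ the built-in F.date_trunc
# yields the same keys for second/minute/hour/day. Column names here are
# illustrative assumptions, not taken from the library.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2021-03-01 10:15:42.123",)], ["event_time"]) \
          .withColumn("event_time", F.col("event_time").cast("timestamp"))

df.select(
    F.date_trunc("minute", "event_time").alias("agg_key_min"),   # 2021-03-01 10:15:00
    F.date_trunc("hour", "event_time").alias("agg_key_hr"),      # 2021-03-01 10:00:00
    F.date_trunc("day", "event_time").alias("agg_key_day"),      # 2021-03-01 00:00:00
).show(truncate=False)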
def createDimDate(df):
    '''
    Creates date dimensional table from DateTime field in upstream dataframe
    :param df:
    :return: Date Dimensional Dataframe
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))

    df.createOrReplaceTempView('tempDimDateTable')
    dimDateDF = spark.sql(
        "SELECT * FROM \
           (select rawKey, dateMinute, dateTime, date, year, month, dayofmonth, dayofweek, dayofyear, hour, minute, quarter \
            from tempDimDateTable \
            group by rawKey, dateMinute, dateTime, date, year, month, dayofmonth, dayofweek, dayofyear, hour, minute, quarter \
            order by dateMinute ASC)")

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)

    # Creating dataframe including date field which will help to generate Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))

    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
def _transform(self, df):
    time_variable = self.getColumn()
    new_time_variable = time_variable + '_new'

    # code from tawab: convert all times into the same format
    df = df.withColumn(
        new_time_variable,
        self.udf_date_formatting()(funct.col(time_variable).cast("String")))
    df = df.withColumn(
        new_time_variable,
        funct.from_unixtime(
            funct.unix_timestamp(new_time_variable, self.time_format)).cast(TimestampType()))
    df = df.withColumn(time_variable + '_year', funct.year(new_time_variable))
    df = df.withColumn(time_variable + '_month', funct.month(new_time_variable))
    df = df.withColumn(time_variable + '_day', funct.dayofmonth(new_time_variable))
    df = df.withColumn(time_variable + '_dayofweek', funct.dayofweek(new_time_variable))
    df = df.withColumn(time_variable + '_hour', funct.hour(new_time_variable))
    df = df.withColumn(time_variable + '_minutes', funct.minute(new_time_variable))
    df = df.withColumn(time_variable + '_seconds', funct.second(new_time_variable))
    df = df.drop(new_time_variable)
    df = df.drop(time_variable)
    return df
def vwap(self, frequency='m', volume_col="volume", price_col="price"):
    # set pre_vwap as self or enrich with the frequency
    pre_vwap = self.df
    print('input schema: ', pre_vwap.printSchema())
    if frequency == 'm':
        pre_vwap = self.df.withColumn(
            "time_group",
            f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0'),
                     f.lit(':'),
                     f.lpad(f.minute(f.col(self.ts_col)), 2, '0')))
    elif frequency == 'H':
        pre_vwap = self.df.withColumn(
            "time_group",
            f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0')))
    elif frequency == 'D':
        # use dayofmonth: pyspark.sql.functions has no f.day
        pre_vwap = self.df.withColumn(
            "time_group",
            f.concat(f.lpad(f.dayofmonth(f.col(self.ts_col)), 2, '0')))

    group_cols = ['time_group']
    if self.partitionCols:
        group_cols.extend(self.partitionCols)
    vwapped = (pre_vwap
               .withColumn("dllr_value", f.col(price_col) * f.col(volume_col))
               .groupby(group_cols)
               .agg(f.sum('dllr_value').alias("dllr_value"),
                    f.sum(volume_col).alias(volume_col),
                    f.max(price_col).alias("_".join(["max", price_col])))
               .withColumn("vwap", f.col("dllr_value") / f.col(volume_col)))
    return TSDF(vwapped, self.ts_col, self.partitionCols)
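# Standalone sketch of the same VWAP aggregation outside the TSDF class
# (column names "event_ts", "price", "volume" are illustrative assumptions):
# group by the minute bucket and divide the dollar value traded by the volume.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
trades = spark.createDataFrame(
    [("2021-03-01 09:30:10", 10.0, 100), ("2021-03-01 09:30:40", 11.0, 300)],
    ["event_ts", "price", "volume"])

vwap_per_minute = (trades
    .withColumn("time_group", F.date_format(F.col("event_ts").cast("timestamp"), "HH:mm"))
    .groupBy("time_group")
    .agg((F.sum(F.col("price") * F.col("volume")) / F.sum("volume")).alias("vwap")))
vwap_per_minute.show()  # 09:30 -> (10*100 + 11*300) / 400 = 10.75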
def get_time_to_purchase(timeframe, partner, premium, purchase):  # todo
    """
    returns the distribution of time it takes a user to achieve a purchase
    input: a period (string), a partner (string), 2 spark dataframes (first and last one in the workflow)
    output: a dictionary with the bucket and the values associated
    """
    keys = [0, 20, 40, 60, 120, 180, 240, 300, 600]
    result = collections.OrderedDict()
    result = {key: 0 for key in keys}
    timeframe_is = get_date(timeframe)
    purchase_renam = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
                      .filter(purchase.search_info.partner_id == partner)
                      .withColumnRenamed('keen', 'keen_purchase')
                      .withColumnRenamed('flight', 'flight_purchase'))
    premium_renam = (premium.filter(premium.keen.timestamp >= timeframe_is)
                     .filter(premium.search_info.partner_id == partner)
                     .withColumnRenamed('keen', 'keen_premium')
                     .withColumnRenamed('flight', 'flight_premium'))
    joined_df = purchase_renam.join(
        premium_renam,
        purchase_renam.search_info.search_id == premium_renam.search_info.search_id,
        'inner')
    joined_df = joined_df.withColumn(
        "time_to_purchase",
        (minute(joined_df.keen_purchase.timestamp) - minute(joined_df.keen_premium.timestamp)) * 60
        + second(joined_df.keen_purchase.timestamp) - second(joined_df.keen_premium.timestamp))
    times = joined_df.groupBy("time_to_purchase").sum("purchase.quantity").collect()
    for row in times:
        for i in range(len(keys) - 1):
            if row[0] > keys[i] and row[0] <= keys[i + 1]:
                result[keys[i + 1]] += row[1]
    result.pop(0)
    return result
def _transform(self, df):
    input = self.getInputCol()
    df = df.withColumn("dt_day", F.dayofmonth(input))
    df = df.withColumn("dt_hour", F.hour(input))
    df = df.withColumn("dt_minute", F.minute(input))
    df = df.withColumn("dt_second", F.second(input))
    df = df.withColumn("dt_dayofyear", F.dayofyear(input))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input))
    return df
def timestamp_to_date(data):
    # Generate detailed time fields from the timestamp value
    data = data.withColumn("normal_type", data["timestamp"].cast(TimestampType()))
    godzina = data.withColumn('godzina', hour(data['normal_type']).cast(StringType()))
    minuta = godzina.withColumn('minuta', minute(godzina['normal_type']).cast(StringType()))
    data = minuta.withColumn("dzien", dayofweek(minuta["normal_type"]).cast(StringType()))
    return data
def main(spark):
    df = createDataframe(spark)
    df.show(truncate=False)

    df = df.withColumn("date", F.date_format(F.col("time"), "yyyy-MM-dd HH:mm:ss.SSSS")) \
        .withColumn("h", F.hour(F.col("date"))) \
        .withColumn("m", F.minute(F.col("date"))) \
        .withColumn("s", F.second(F.col("date"))) \
        .withColumn("event", F.expr("h*3600 + m*60 + s")) \
        .drop("date", "h", "m", "s")
    df.show(truncate=False)

    inRange = F.udf(in_range, BooleanType())
    df = df.withColumn("between", inRange(F.col("range"), F.col("event")))
    df.show(truncate=False)
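# The actual in_range helper is not shown in this snippet; the sketch below is
# only a guess at its shape, assuming "range" arrives as a [start, end] pair of
# seconds-of-day and "event" is the event's second-of-day.
def in_range(time_range, event):
    # returns True when the event second falls inside the [start, end] window
    if time_range is None or event is None:
        return False
    start, end = time_range[0], time_range[1]
    return start <= event <= end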
def task_2(json_parsed_df):
    result = json_parsed_df.withWatermark("timestamp", "1 minute").groupBy(
        window("timestamp", "1 minute", "1 minute")
    ).agg(
        struct(
            F.month('window.end').alias('month'),
            F.dayofmonth('window.end').alias('day_of_the_month'),
            F.hour('window.end').alias('hour'),
            F.minute('window.end').alias("minute"),
            F.collect_list('group_city').alias('cities')
        ).alias('res')
    ).select(F.to_json('res').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", ",".join(config.BOOTSTRAP_SERVERS)) \
        .option("topic", "US-cities-every-minute") \
        .option("checkpointLocation", f"{config.LOG_PREFIX}topic_2")
    return result
def process_task_2(df):
    records = df \
        .withWatermark("timestamp", "1 minute") \
        .groupBy(
            F.window("timestamp", "1 minute", "1 minute")
        ).agg(
            struct(
                F.month('window.end').alias('month'),
                F.dayofmonth('window.end').alias('day_of_the_month'),
                F.hour('window.end').alias('hour'),
                F.minute('window.end').alias("minute"),
                F.collect_list('group_city').alias('cities')
            ).alias("result")
        ).select(F.to_json("result").alias("value")).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", ",".join(SERVERS)) \
        .option("topic", "US-cities-every-minute")
    return records
def preprocess(self):
    # drop rows whose location is 0
    self.df = self.df.filter(self.df.Start_Lat != 0.0) \
        .filter(self.df.Start_Lon != 0.0) \
        .filter(self.df.End_Lon != 0.0) \
        .filter(self.df.End_Lat != 0.0)

    self.df = self.df \
        .withColumn(self.datetime_columnname,
                    F.unix_timestamp(F.col(self.datetime_columnname), "yyyy-MM-dd HH:mm:ss").cast(TimestampType())) \
        .withColumn("year", F.year(F.col(self.datetime_columnname))) \
        .withColumn("month", F.month(F.col(self.datetime_columnname))) \
        .withColumn("day", F.dayofmonth(F.col(self.datetime_columnname))) \
        .withColumn("hour", F.hour(F.col(self.datetime_columnname))) \
        .withColumn("minute", F.minute(F.col(self.datetime_columnname))) \
        .withColumn("Date", F.to_date(F.col(self.datetime_columnname))) \
        .withColumn("pickup_time", F.round(F.col("hour") + F.col("minute") / 60)) \
        .withColumn("dayOfTheWeek", F.dayofweek(F.col("Date"))) \
        .withColumn("isWeekend", self.isWeekendUDF(F.col("dayOfTheWeek"))) \
        .withColumn("isHoliday", self.isHolidayUDF(F.col("Date"))) \
        .withColumn("isCashPaid", self.isHolidayUDF(F.col("Payment_Type")))
def lowest_avg_idle_user(self, df):
    '''
    Find average idle hours for each user, and then find who is idle
    less than the total average hour.
    '''
    df_idle = df.drop('working_hour', 'start_time', 'end_time')  # keep only the idle_time column

    # Find average idle hour for each user
    df_avg = df_idle.groupBy('user_name').agg(
        sqlFun.from_unixtime(
            sqlFun.avg(sqlFun.unix_timestamp('idle_time')),
            'hh:mm:ss').alias('avg_time'))

    # Convert everything into hours
    df_avg_hours = df_avg.withColumn(
        'avg_hour',
        (hour(df_avg['avg_time']) * 3600 + minute(df_avg['avg_time']) * 60 +
         second(df_avg['avg_time'])) / 3600)

    # calculating average hours
    total_avg_idle_hour = df_avg_hours.select(avg('avg_hour')).collect()[0][0]

    lowest_idle_users = df_avg_hours.filter(
        df_avg_hours['avg_hour'] < total_avg_idle_hour).select('user_name')
    return lowest_idle_users
def load_data(spark):
    rdd = spark.read.csv(files, header='false',
                         timestampFormat='MM/dd/yyyy HH:mm:ss',
                         schema=schema_struct, inferSchema='false')
    # print(rdd.take(1))
    station_time = (rdd.groupBy([
        'station',
        hour("timestamp").alias("hour"),
        minute("timestamp").alias("minute")
    ]).agg(
        mean("totalflow").alias("flow_mean"),
        stddev("totalflow").alias("flow_std"),
        count("totalflow").alias("flow_count"),
        psmax("totalflow").alias("flow_max"),
        psmin("totalflow").alias("flow_min")
    ))
    df = station_time.toPandas()
    # print(df.station.unique().shape)
    df['flow_std_plus_mean'] = df.flow_mean + df.flow_std
    df['flow_std_minus_mean'] = df.flow_mean - df.flow_std
    df['time'] = df.apply(lambda x: time(int(x.hour), int(x.minute)), axis=1)
    df.sort_values('time', inplace=True)
    return df
def minute(self) -> "ps.Series":
    """
    The minutes of the datetime.
    """
    return self._data.spark.transform(lambda c: F.minute(c).cast(LongType()))
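# Minimal usage sketch (assumes PySpark with the pandas API on Spark available):
# the property above backs the `.dt.minute` accessor of a datetime Series.
import pandas as pd
import pyspark.pandas as ps

s = ps.Series(pd.date_range("2021-01-01 10:05:30", periods=3, freq="20min"))
print(s.dt.minute)  # 5, 25, 45 (dtype: int64)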
# Using previous dataframe, extracts the year as an integer from a given date/timestamp/string.
# Similar methods: month, dayofweek, minute, second
# Expected:
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |               id|                  ts|  date string|    time string|  date_new|year|month|dayofweek|minute|second|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |UA000000107379500|2020-07-04 16:09:...|July 04, 2020|16:09:06.592107|04-09-2020|2020|    7|        7|     9|     6|
# |UA000000107359357|2020-07-04 15:36:...|July 04, 2020|15:36:51.756535|04-36-2020|2020|    7|        7|    36|    51|
# |UA000000107375547|2020-07-04 16:06:...|July 04, 2020|16:06:55.459100|04-06-2020|2020|    7|        7|     6|    55|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+

# Answer
df = (df.withColumn("year", F.year(F.col("ts")))
        .withColumn("month", F.month(F.col("ts")))
        .withColumn("dayofweek", F.dayofweek(F.col("ts")))
        .withColumn("minute", F.minute(F.col("ts")))
        .withColumn("second", F.second(F.col("ts"))))
df.show()

# COMMAND ----------

# Converts the column into DateType with name "date" by casting rules to DateType (use function to_date).
# Then create a column plus_two_days that adds 2 days to the date. Select "date" and "plus_two_days"
# Expected:
# +----------+-------------+
# |      date|plus_two_days|
# +----------+-------------+
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# +----------+-------------+
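# A possible answer for the to_date / date_add exercise above (not from the
# original notebook, just one way to produce the expected output; assumes the
# "ts" column from the earlier cells):
df.select(
    F.to_date(F.col("ts")).alias("date"),
    F.date_add(F.to_date(F.col("ts")), 2).alias("plus_two_days"),
).show()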
'''Now we drop the year, month, day, hour, minute, date, time columns, as we will
recreate them from the timestamp column that we created'''
df_nycflights = df_nycflights. \
    drop('year'). \
    drop('month'). \
    drop('day'). \
    drop('hour'). \
    drop('minute'). \
    drop('date'). \
    drop('time')

df_nycflights.show()

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
    withColumn('year', year(df_nycflights.timestamp)). \
    withColumn('month', month(df_nycflights.timestamp)). \
    withColumn('day', dayofmonth(df_nycflights.timestamp)). \
    withColumn('hour', hour(df_nycflights.timestamp)). \
    withColumn('minute', minute(df_nycflights.timestamp))

df_nycflights.show()

'''Now a few operations on the timestamp'''
df_nycflights = df_nycflights. \
    withColumn('date_sub', date_sub(df_nycflights.timestamp, 10)). \
    withColumn('date_add', date_add(df_nycflights.timestamp, 10)). \
    withColumn('months_between', months_between(df_nycflights.timestamp, df_nycflights.timestamp))

df_nycflights.show()
def extract_datetime_info(self, datetime_col, info_to_extract):
    self._data_frame = self._data_frame.withColumn(
        datetime_col + '_temp', self.to_date_(datetime_col))
    timestamped = datetime_col + "_timestamped"
    # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
    uniqueVals = self._data_frame.select(
        datetime_col + '_temp').distinct().na.drop().limit(10).collect()
    try:
        date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
        to_date_udf = udf(
            lambda x: datetime.strptime(x, date_format) if x != None else x,
            DateType())
        self._data_frame = self._data_frame.withColumn(
            datetime_col + '_temp',
            to_date_udf(self._data_frame[datetime_col + '_temp']).alias(
                datetime_col + '_temp'))
        if info_to_extract == "year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_year",
                year(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
        if info_to_extract == "month_of_year":
            dict = {
                1: "January", 2: "February", 3: "March", 4: "April",
                5: "May", 6: "June", 7: "July", 8: "August",
                9: "September", 10: "October", 11: "November", 12: "December"
            }
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_month",
                month(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                   "dd/MM/yyyy")))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_etf_month_of_year",
                self.month_to_string(dict)(col(datetime_col + "_month")))
        if info_to_extract == "day_of_month":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day_of_month",
                dayofmonth(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                        "dd/MM/yyyy")))
        if info_to_extract == "day_of_year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day_of_year",
                dayofyear(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                       "dd/MM/yyyy")))
        if info_to_extract == "day_of_week":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_etf_day_of_week",
                dayofweek(datetime_col + '_temp'))
        if info_to_extract == "week_of_year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_week_of_year",
                weekofyear(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                        "dd/MM/yyyy")))
        if info_to_extract == "hour":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_hour",
                hour(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
        if info_to_extract == "minute":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_minute",
                minute(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                    "dd/MM/yyyy")))
        if info_to_extract == "date":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_date",
                to_timestamp(self._data_frame[datetime_col + '_temp'],
                             "dd/MM/yyyy").cast("date"))
        else:
            pass
    except TypeError:
        if info_to_extract == "year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_year",
                year(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
        if info_to_extract == "month_of_year":
            dict = {
                1: "January", 2: "February", 3: "March", 4: "April",
                5: "May", 6: "June", 7: "July", 8: "August",
                9: "September", 10: "October", 11: "November", 12: "December"
            }
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_month",
                month(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                   "dd/MM/yyyy")))
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_etf_month_of_year",
                self.month_to_string(dict)(col(datetime_col + "_month")))
        if info_to_extract == "day_of_month":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day_of_month",
                dayofmonth(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                        "dd/MM/yyyy")))
        if info_to_extract == "day_of_year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_day_of_year",
                dayofyear(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                       "dd/MM/yyyy")))
        if info_to_extract == "day_of_week":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_etf_day_of_week",
                dayofweek(datetime_col + '_temp'))
        if info_to_extract == "week_of_year":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_week_of_year",
                weekofyear(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                        "dd/MM/yyyy")))
        if info_to_extract == "hour":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_hour",
                hour(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                  "dd/MM/yyyy")))
        if info_to_extract == "minute":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_minute",
                minute(to_timestamp(self._data_frame[datetime_col + '_temp'],
                                    "dd/MM/yyyy")))
        if info_to_extract == "date":
            self._data_frame = self._data_frame.withColumn(
                datetime_col + "_date",
                to_timestamp(self._data_frame[datetime_col + '_temp'],
                             "dd/MM/yyyy").cast("date"))
        else:
            pass
    self._data_frame = self._data_frame.drop(datetime_col + '_temp')
    # self._data_frame = self._data_frame.withColumn(datetime_col, to_timestamp(self._data_frame[datetime_col + '_temp'], "dd/MM/yyyy"))
    # self._data_frame = self._data_frame.withColumn(datetime_col, F.from_unixtime(F.unix_timestamp(self._data_frame[datetime_col + '_temp']), "dd/MM/yyyy"))
    return self._data_frame
def _transform(self, df):
    self.check_input_type(df.schema)
    return df.withColumn(self.outputCol, F.minute(df[self.inputCol]))
def convert_timezone(item):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    dt = parser.parse(item['timestamp'])
    utc = dt.replace(tzinfo=from_zone)
    return utc.astimezone(to_zone)

if dtype == "sql":
    return Row(id=d[0], time=convert_timezone(d[1]))
elif dtype == "pandas":
    return convert_timezone(d[1])

n_parts = 10
rdd = sc.textFile(data_path).repartition(n_parts).cache()  # partitionBy fails here, need to use repartition()

filtered = (rdd.map(make_json)
            .filter(lambda x: filter_tweets(x, p))
            .map(get_relevant_fields, preservesPartitioning=True))
data = filtered.map(lambda x: update_tz(x, 'sql'), preservesPartitioning=True)
df = sqlContext.createDataFrame(data).cache()

counts = df.groupby(sqlfunc.minute("time")).count().collect()
minutes, cts = zip(*counts)
minutes = [m if m < 59 else -1 for m in minutes]  # for some reason 7:59 tweets get included in the 8-8:15 range

plt.bar(minutes, cts)
plt.xlabel("Minutes from 8-830pm", fontsize=16)
plt.ylabel("Tweet frequency", fontsize=16)
plt.savefig(path + 'sep16-8pm-hist.png')
long_min = -123.0137
lat_max = 37.8324
long_max = -122.3549
x_res = (long_max - long_min) / 20
y_res = (lat_max - lat_min) / 20

dfTrainRaw = dfTrainRaw.drop('Date')
dfTrainRaw = dfTrainRaw.select('IncidntNum', 'Category', 'Descript', 'Dates',
                               'DayOfWeek', 'PdDistrict', 'Resolution', 'Address',
                               'X', 'Y', 'Location', 'PdId',
                               function.year("Dates").alias('Year'),
                               function.month("Dates").alias('Month'),
                               function.hour("Time").alias('Hour'),
                               function.minute("Time").alias('Minute'),
                               function.dayofmonth("Dates").alias('Day'))
dfTrainRaw = dfTrainRaw.filter(dfTrainRaw.X < -122.3549)
dfMain = dfTrainRaw
dfTrain = dfTrainRaw.filter(dfTrainRaw.Year <= 2015)
dfTest = dfTrainRaw.filter(dfTrainRaw.Year > 2015)

# Preprocessing Train and Test
print("=======TRAIN=======")
dfTrain = preprocess(dfTrain)
dfTrain.show(5)
print("=======TEST=======")
dfTest = preprocess(dfTest)
def _transform(self, df, auxiliar_train):
    if not self.train_file:
        auxiliar_train = auxiliar_train.drop('WinningBid')
        auxiliar_train = auxiliar_train.withColumn('test', lit(0))
        df = df.withColumn('test', lit(1))
        df = auxiliar_train.union(df)
        del auxiliar_train

    # We create the time as Index
    split_col = split(df['ApproximateDate'], ' ')
    df = df.withColumn('time', split_col.getItem(1))  # time

    # Hour Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'), IntegerType())
    df = df.withColumn('hms_index', func_index(df['time']))

    # We order by UserId-Date
    df = df.orderBy(['UserID', 'hms_index'])

    # We check Null Values
    df.select([count_(when(isnan(c), c)).alias(c) for c in df.columns]).show()

    # We create a rank of users by how many times in the past saw an ad
    w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
        Window.unboundedPreceding, 0))
    df = df.withColumn('user_id_acumulative', count_(df['UserId']).over(w))

    # Number of Ads/User/Second
    df = df.withColumn('key_id', concat(df['UserID'], lit(' '), df['hms_index']))
    w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
        -sys.maxsize, sys.maxsize))
    df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

    # Number of Ads/User
    df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
    split_col = split(df_group['key_id'], ' ')
    df_group = df_group.withColumn('UserID', split_col.getItem(0))  # time
    w = (Window().partitionBy(df_group.UserID).orderBy('key_id').rowsBetween(
        Window.unboundedPreceding, 0))
    df_group = df_group.withColumn('number_ads_user', sum_(df_group.count_ads).over(w))
    df_group = df_group.select(['key_id', 'number_ads_user'])
    df = df.join(df_group, how='left', on='key_id')
    del df_group

    # Number of Users/Second
    w = (Window().partitionBy(df.ApproximateDate).rowsBetween(-sys.maxsize, sys.maxsize))
    df = df.withColumn('number_user_second', approx_count_distinct(df.UserID).over(w))

    # Number of Ads/Second
    df = df.withColumn('number_ads_second', count_(df.ApproximateDate).over(w))

    # Browser Dummy Transformation
    types = df.select('Browser').distinct().collect()
    types = [val['Browser'] for val in types]
    new_cols = [
        when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
        for ty in types
    ]
    df = df.select(df.columns + new_cols)

    # Decompose Date Variables
    df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
    df = df.withColumn('month', month(df['ApproximateDate']))  # month
    df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
    df = df.withColumn('weekday', dayofweek(df['ApproximateDate']))  # weekday (1=Sunday in Spark)
    df = df.withColumn('hour', hour(df['time']))  # hour
    df = df.withColumn('minute', minute(df['time']))  # minute

    # Peak Hour
    df = df.withColumn('peak6am8am', when(df['hour'].between(6, 8), 1).otherwise(0))
    df = df.withColumn('peak14pm16pm', when(df['hour'].between(14, 16), 1).otherwise(0))

    # Minute Index
    func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'), IntegerType())
    df = df.withColumn('hm_index', func_index(df['time']))

    # Convert to time-series by Minute
    # We reduce to minutes
    df_time_serie_ads = df.select([
        'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
    ]).drop_duplicates()
    df_time_serie_user = df.select(['UserID', 'hm_index']).drop_duplicates()

    # Group-by the values
    df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
        approx_count_distinct('UserID'))
    df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
        'number_ads_second': 'sum'
    }).drop_duplicates(subset=['hm_index'])

    # Join ads-users per minute
    df_time_serie = df_time_serie_ads.join(df_time_serie_user, how='left', on='hm_index')
    del df_time_serie_ads, df_time_serie_user

    # Rename columns
    df_time_serie = df_time_serie.withColumnRenamed(
        'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
            'approx_count_distinct(UserID)', 'number_user_minute')

    # Resample Range of Minutes
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hm_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hm_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hm_index).drop(*['hm_index']).fillna(0)

    # Create Lags By Minutes
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_min_lag > 0:
        df_time_serie = df_time_serie.select(
            '*', lag('number_user_minute').over(w).alias('ar1_number_user_minute'))
        df_time_serie = df_time_serie.select(
            '*', lag('number_ads_minute').over(w).alias('ar1_number_ads_minute'))
    if self.ar_min_lag > 1:
        for l in range(2, self.ar_min_lag + 1, 1):
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_user_minute').over(w).alias(
                    'ar' + str(l) + '_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_ads_minute').over(w).alias(
                    'ar' + str(l) + '_number_ads_minute'))

    # Remove the lagged Null Values
    df_time_serie = df_time_serie.dropna()

    # join and remove lag Null values of the first minute
    df = df.orderBy(['UserID', 'hms_index'])
    df = df.join(df_time_serie.orderBy(['hm_index']),
                 how='left',
                 on=df.hm_index == df_time_serie.value).drop('value')

    # Convert to time-series and resample by Seconds
    df_time_serie = df.select(
        ['hms_index', 'number_user_second', 'number_ads_second']).drop_duplicates()
    resample_range = list(
        range(
            df_time_serie.select(min_(col('hms_index'))).limit(1).collect()[0][0],
            df_time_serie.select(max_(col('hms_index'))).limit(1).collect()[0][0] + 1,
            1))
    resample_range = self._spark.createDataFrame(resample_range, IntegerType())

    # Join the original df
    df_time_serie = resample_range.join(
        df_time_serie,
        how='left',
        on=resample_range.value == df_time_serie.hms_index).drop(*['hms_index']).fillna(0)

    # Create lags
    w = Window().partitionBy().orderBy(col('value'))
    if self.ar_lags > 0:
        df_time_serie = df_time_serie.select(
            '*', lag('number_user_second').over(w).alias('ar1_number_user_second'))
        df_time_serie = df_time_serie.select(
            '*', lag('number_ads_second').over(w).alias('ar1_number_ads_second'))
    if self.ar_lags > 1:
        for l in range(2, self.ar_lags + 1, 1):
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_user_second').over(w).alias(
                    'ar' + str(l) + '_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('ar' + str(l - 1) + '_number_ads_second').over(w).alias(
                    'ar' + str(l) + '_number_ads_second'))

    # Create Moving Average
    if self.ma_ss_lag is not None:
        # Get hour from index
        func_index = udf(lambda x: auxiliar_func.num_to_time(x), StringType())
        df_time_serie = df_time_serie.withColumn('time', func_index(df_time_serie['value']))

        # minute MA terms (Average per second last xx seconds)
        for lag_val in self.ma_ss_lag:
            # range to take into account
            w = (Window.orderBy(df_time_serie['value']).rangeBetween(-lag_val, 0))
            # MA variables
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                avg('number_user_second').over(w))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                avg('number_ads_second').over(w))

            # Increasing ID
            df_time_serie = df_time_serie.withColumn('rn', monotonically_increasing_id())

            # Replace first values by Null
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_user_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) + '_number_user_second']))
            df_time_serie = df_time_serie.withColumn(
                'ma_seconds_' + str(lag_val) + '_number_ads_second',
                when(df_time_serie['rn'] < lag_val, None).otherwise(
                    df_time_serie['ma_seconds_' + str(lag_val) + '_number_ads_second']))

            # Get the average by Minute
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_user_second',
                df_time_serie['ma_seconds_' + str(lag_val) + '_number_user_second'] * 60)
            df_time_serie = df_time_serie.withColumn(
                'ma_minute_' + str(lag_val) + '_number_ads_second',
                df_time_serie['ma_seconds_' + str(lag_val) + '_number_ads_second'] * 60)
            df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()

    # join and remove lag Null values of the first minute
    df = df.join(
        df_time_serie.orderBy(['value']),
        how='left',
        on=df.hms_index == df_time_serie.value).drop('value').dropna()

    if self.train_file and not self.variable_analysis:
        df = df.select([
            'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
            'number_ads_second', 'number_ads_user_second', 'peak6am8am',
            'peak14pm16pm', 'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')]
          + [x for x in df.columns if x.startswith('ar')]
          + [x for x in df.columns if x.startswith('ma_')]
          + ['WinningBid'])

    if not self.train_file:
        df = df.filter(df['test'] == 1)
        df = df.select([
            'UserID', 'key_id', 'number_ads_user', 'hms_index',
            'number_user_second', 'number_ads_second', 'number_ads_user_second',
            'peak6am8am', 'peak14pm16pm', 'user_id_acumulative'
        ] + [x for x in df.columns if x.startswith('d_browser')]
          + [x for x in df.columns if x.startswith('ar')]
          + [x for x in df.columns if x.startswith('ma_')])

    df = df.orderBy(['hms_index', 'UserID'])
    df.show()
    return df
# Cast the string time column to timestamp
timeFormatUDF = F.udf(lambda ts: timeFormat(ts))
dataset = dataset.withColumn(
    "time", timeFormatUDF(F.col("time")).cast(TimestampType()))

# Split the time column into its components
dataset = dataset.withColumn("year", F.year(F.col("time")))
dataset = dataset.withColumn("month", F.month(F.col("time")))
dataset = dataset.withColumn("day", F.dayofmonth(F.col("time")))
dataset = dataset.withColumn("hour", F.hour(F.col("time")))
dataset = dataset.withColumn("minute", F.minute(F.col("time")))
dataset = dataset.withColumn("second", F.second(F.col("time")))

# Split MCC, MNC and MSIN out of the IMSI column
dataset = dataset.withColumn('mcc', dataset.imsi.substr(1, 3))
dataset = dataset.withColumn('mnc', dataset.imsi.substr(4, 2))
dataset = dataset.withColumn('msin', dataset.imsi.substr(6, 10))

# Split TAC and SNR out of the IMEI column
# IMEI format: TAC -- Serial_Number (14 digits)
dataset = dataset.withColumn('tac', dataset.imei.substr(1, 8))
dataset = dataset.withColumn('snr', dataset.imei.substr(9, 6))

# Scale the year column with MinMaxScaler to the [0, 1] range
def process_data(spark):
    """
    Read from S3 and process bike share data into dimensional tables.

    The bike share data (as CSVs) is read from a public S3 bucket into dataframes,
    the data is transformed using pyspark.sql functions, and finally the data is
    saved back to the same S3 bucket in parquet format.

    Parameters:
        spark: Spark session
    """
    # read from S3 to dataframes
    st_station_df = spark.read.csv('s3://omar-dend/station.csv', header=True)
    st_weather_df = spark.read.csv('s3://omar-dend/weather.csv', header=True)
    st_trip_df = spark.read.csv('s3://omar-dend/trip.csv', header=True)
    st_status_df = spark.read.csv('s3://omar-dend/status.csv', header=True)
    st_city_df = spark.read.csv('s3://omar-dend/city.csv', header=True)

    # save counts to ensure later that all rows are present
    station_count = st_station_df.count()
    weather_count = st_weather_df.count()

    # adding timestamp to all the dataframes to standardize datetime
    st_station_df = st_station_df.withColumn(
        'datetime', F.to_timestamp(st_station_df.installation_date, 'MM/dd/yyyy'))
    st_weather_df = st_weather_df.withColumn(
        'datetime', F.to_timestamp(st_weather_df.date, 'MM/dd/yyyy'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_start', F.to_timestamp(st_trip_df.start_date, 'MM/dd/yyyy HH:mm'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_end', F.to_timestamp(st_trip_df.end_date, 'MM/dd/yyyy HH:mm'))
    st_status_df = st_status_df.withColumn(
        'datetime', F.to_timestamp(st_status_df.time, 'yyyy/MM/dd HH:mm:ss'))

    # create dim_weather
    weather_df = st_weather_df.select('max_temperature_f', 'mean_temperature_f',
                                      'min_temperature_f', 'max_humidity',
                                      'mean_humidity', 'min_humidity',
                                      'max_wind_Speed_mph', 'mean_wind_speed_mph',
                                      'precipitation_inches', 'events',
                                      'zip_code', 'datetime')\
        .dropDuplicates()

    # create dim_station
    station_df = st_station_df.select(
        F.col('id').alias('station_id'),
        F.col('name').alias('station_name'),
        'lat', 'long', 'dock_count', 'city',
        F.col('datetime').alias('installation_datetime'))
    station_df = station_df.join(st_city_df, station_df.city == st_city_df.city, 'left')\
        .drop('city')\
        .dropDuplicates()

    # make sure none of the station or weather data was dropped by mistake
    station_dim_count = station_df.count()
    weather_dim_count = weather_df.count()
    if station_dim_count != station_count or weather_dim_count != weather_count:
        raise Exception('Some dimensional rows are missing')
    else:
        print('All is good')

    # load (save) dim_station to S3 in parquet format
    station_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/dim_station')

    # load (save) dim_weather to S3 in parquet format partitioned by zip_code
    weather_df.write.mode('overwrite')\
        .partitionBy('zip_code')\
        .parquet('s3://omar-dend/dim_weather')

    # create dim_time by combining the datetimes from every source
    datetime_df = st_station_df.select('datetime')\
        .union(st_weather_df.select('datetime'))\
        .union(st_trip_df.select(F.col('datetime_start').alias('datetime')))\
        .union(st_trip_df.select(F.col('datetime_end').alias('datetime')))\
        .union(st_status_df.select('datetime'))

    time_df = datetime_df\
        .withColumn('second', F.second('datetime'))\
        .withColumn('minute', F.minute('datetime'))\
        .withColumn('hour', F.hour('datetime'))\
        .withColumn('day', F.dayofmonth('datetime'))\
        .withColumn('week', F.weekofyear('datetime'))\
        .withColumn('month', F.month('datetime'))\
        .withColumn('year', F.year('datetime'))\
        .withColumn('weekday', F.dayofweek('datetime'))\
        .dropDuplicates()

    # load (save) dim_time to S3 in parquet format partitioned by year & month
    time_df.write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet('s3://omar-dend/dim_time')

    # create fact_trip
    trip_df = st_trip_df.select(F.col('id').alias('trip_id'), 'duration',
                                'bike_id', 'subscription_type',
                                'start_station_id', 'end_station_id',
                                'datetime_start', 'datetime_end')\
        .dropDuplicates()

    # load (save) fact_trip to S3 in parquet format
    trip_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/fact_trip')

    # create fact_status
    status_df = st_status_df.select('station_id', 'bikes_available',
                                    'docks_available', 'datetime')\
        .dropDuplicates()

    # load (save) fact_status to S3 in parquet format partitioned by station_id
    status_df.write.mode('overwrite')\
        .partitionBy('station_id')\
        .parquet('s3://omar-dend/fact_status')
    mapping.append((field.name, field.dataType.typeName(),
                    field.name, field.dataType.typeName()))

dyf = dyf.apply_mapping(mapping)

# Add partition columns
df = dyf.toDF()
if 'year' in partition_keys:
    df = df.withColumn('year', year(timestamp_column_name))
if 'month' in partition_keys:
    df = df.withColumn('month', month(timestamp_column_name))
if 'day' in partition_keys:
    df = df.withColumn('day', dayofmonth(timestamp_column_name))
if 'hour' in partition_keys:
    df = df.withColumn('hour', hour(timestamp_column_name))
if 'minute' in partition_keys:
    df = df.withColumn('minute', minute(timestamp_column_name))
df = df.drop(col(tmp_timestamp_column_name))
dyf = DynamicFrame.fromDF(df, glue_context, "add_partitions")

# Write DynamicFrame to S3 in glueparquet format
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_keys)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database,
                    catalogTableName=output_table)
sink.writeFrame(dyf)
job.commit()
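# Plain-PySpark sketch of the same partitioning idea without the Glue-specific
# sink (the output path and column names are illustrative assumptions): derive
# partition columns from a timestamp column and write partitioned parquet.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("2021-05-01 08:31:00", "click"), ("2021-05-02 09:02:00", "view")],
    ["event_time", "event_type"]).withColumn("event_time", F.col("event_time").cast("timestamp"))

partition_keys = ["year", "month", "day", "hour", "minute"]
events = (events
          .withColumn("year", F.year("event_time"))
          .withColumn("month", F.month("event_time"))
          .withColumn("day", F.dayofmonth("event_time"))
          .withColumn("hour", F.hour("event_time"))
          .withColumn("minute", F.minute("event_time")))
events.write.mode("overwrite").partitionBy(*partition_keys).parquet("/tmp/events_partitioned")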
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on a java list
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exports to pandas for visualization and export in CSV format;
# later on the pyspark dataframe is exported in CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()

test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()

plt.figure(figsize=(12, 4))
plt.plot(test_df_1s.timestamp, test_df_1s.ma, 'b')
plt.plot(test_df_60s.timestamp, test_df_60s.ma, 'r')
plt.plot(test_df_10m.timestamp, test_df_10m.ma, 'g')
plt.grid()
plt.title(the_title)
plt.legend(['1s', '60s', '10m'])
display(plt.gcf())

# COMMAND ----------

from itertools import chain
from pyspark.sql.functions import create_map, lit, round
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([
        StructField("symbol", StringType(), False),
        StructField("timestamp", TimestampType(), False),
        StructField("price", DoubleType(), False)
    ])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    query.awaitTermination()

    # avg_pricing = stocks \
    #     .groupBy(F.col("symbol")) \
    #     .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    # query2 = avg_pricing.writeStream \
    #     .outputMode('complete') \
    #     .format("console") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    # query2.awaitTermination()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)
    # query3.awaitTermination()

    ####################################
    # Writing to Postgres
    ####################################
    # Simple insert
    # query = stream_to_postgres(stocks)
    # query.awaitTermination()

    # Average Price Aggregation
    # query = stream_aggregation_to_postgres(stocks)
    # query.awaitTermination()

    # Final Average Price Aggregation with Timestamp columns
    # query = stream_aggregation_to_postgres_final(stocks)
    # query.awaitTermination()

    pass
dfinrixm = dfinrix.filter("Speed < 0.6 * Reference")

# COMMAND ----------

dftimestamp = dfinrixm.withColumn(
    'NT', substring('CentralTime', 1, 19).astype("Timestamp")).drop(
        "C-Value", "SegmentClosed", "Score", "Speed", "Average", "Reference",
        "Travel", "Time")

# COMMAND ----------

dfmsm = dftimestamp.withColumn(
    "msm", hour(dftimestamp.NT) * 60 + minute(dftimestamp.NT)).drop("CentralTime")

# COMMAND ----------

# COMMAND ----------

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, ArrayType, StructType

# COMMAND ----------

dfb = dfmsm.sort("Code", "msm")

# COMMAND ----------
def main():
    spark = SparkSession \
        .builder \
        .appName("spark_streaming_app") \
        .getOrCreate()

    df = (spark.readStream.format('kafka')
          .option('kafka.bootstrap.servers',
                  '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092')
          .option('subscribe', 'stream_data')
          .option('startingOffsets', 'earliest')
          .load())
    df = df.selectExpr('CAST(value as STRING)')
    df = df.select(from_json(col('value'), data_schema).alias('df'))

    func1 = udf(lambda x: states[x.upper()], StringType())
    df = df.filter(col('df.group.group_country') == 'us').select('df') \
        .withColumn('group_state', func1('df.group.group_state')) \
        .withColumn('time', from_unixtime(col('df.event.time') / 1000))

    df2 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'),
            col('df.group.group_city'),
            col('df.group.group_country'),
            col('df.group.group_id'),
            col('df.group.group_name'),
            col('group_state')).alias('value'))

    stream2 = df2.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-meetups") \
        .option("checkpointLocation", "US-metups-checkpoint")
    stream2 = stream2.start()

    df3 = df.withColumn('timestamp', to_timestamp('time')) \
        .withWatermark('timestamp', "1 minute") \
        .groupBy(window('timestamp', '1 minute')) \
        .agg(struct(month('window.end').alias('month'),
                    dayofmonth('window.end').alias('day_of_the_month'),
                    hour('window.end').alias('hour'),
                    minute('window.end').alias('minute'),
                    collect_set('df.group.group_city').alias('cities')).alias('value')) \
        .select('value')

    stream3 = df3.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "US-cities-every-minute") \
        .option("checkpointLocation", "US-cities-every-minute-checkpoint")
    stream3 = stream3.start()

    df4 = df.select(
        struct(
            struct(
                col('df.event.event_name'),
                col('df.event.event_id'),
                col('time'),
            ).alias('event'),
            col('df.group.group_topics.topic_name'),
            col('df.group.group_city'),
            col('df.group.group_country'),
            col('df.group.group_id'),
            col('df.group.group_name'),
            col('group_state')).alias('value')).filter(
                arrays_overlap(
                    'value.topic_name',
                    array(lit("Computer programming"), lit("Big Data"),
                          lit("Machine Learning"), lit("Python"),
                          lit("Java"), lit("Web Development"))))

    stream4 = df4.select(to_json('value').alias('value')).writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", '104.248.248.196:9092,134.122.78.61:9092,134.209.225.2:9092') \
        .option("topic", "Programming-meetups") \
        .option("checkpointLocation", "Programming-metups-checkpoint")
    stream4 = stream4.start()

    stream4.awaitTermination()
    spark.stop()
def process_log_data(spark, input_data_path):
    pl_start = time()
    print('Starting to process log data')

    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    # df =
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    log_df = spark.read.json(input_data_path, schema=log_schema)

    # filter by actions for song plays
    # Filter only column page with value "NextSong"
    # df =
    log_df = log_df.filter(log_df.page == 'NextSong').collect()

    # Convert the collected list back to a Spark dataframe
    log_df = spark.createDataFrame(log_df, schema=log_schema)

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()

    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('firstName', 'first_name')\
        .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()

    time_table = log_df.select(['ts_converted'])\
        .withColumnRenamed('ts_converted', 'start_time')
    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('hour', F.hour('start_time')) \
        .withColumn('minute', F.minute('start_time')) \
        .withColumn('second', F.second('start_time')) \
        .withColumn('week', F.weekofyear('start_time')) \
        .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))

    return log_df, users_table, time_table
bicimadc_ds = bicimadc_ds.withColumnRenamed("light", "occupation")
bicimad_filtered = bicimadc_ds.filter(sf.col("activate") == "1")
bicimad_coordinates = bicimad_filtered.withColumn("geometry", sf.substring("geometry", 35, 40))\
    .withColumn("longitud", sf.split("geometry", ",")[0]) \
    .withColumn("latitud", sf.split("geometry", ",")[1]) \
    .withColumn("latitud", sf.expr("substring(latitud, 2, length(latitud)-3)")) \
    .drop("geometry")
bicimad_partition = bicimad_coordinates\
    .withColumn("year", year("datetime")) \
    .withColumn("month", month("datetime")) \
    .withColumn("day", dayofmonth("datetime")) \
    .withColumn("hour", hour("datetime")) \
    .withColumn("minute", minute("datetime"))

# Send the result to Kafka in micro-batches
queryToKafka = bicimad_partition\
    .select(bicimad_partition["id"].cast('string').alias("key"),
            to_json(struct("*")).alias("value"))\
    .writeStream \
    .format("kafka") \
    .trigger(processingTime='3 minutes') \
    .option("kafka.bootstrap.servers", 'localhost:9092') \
    .option("topic", "bicimad-druid-stream") \
    .option("checkpointLocation", "/tmp/checkpoint/kafka/stream/bicimad/") \
    .outputMode("Append") \
    .start()
# old checkpoint path: /tmp/checkpoint/kafka/bicimad/