def main():
    spark = SparkSession(sc)
    tweet_schema = types.StructType([
        types.StructField('topic', types.StringType(), True),
        types.StructField('text', types.StringType(), True),
        types.StructField('created_at', types.StringType(), True),
    ])
    tweets = spark.read.json('kafka_data_2.json', schema=tweet_schema)
    tweets = tweets.select(
        col('topic').alias('topic-sentiment'),
        col('text').alias('text_original'),
        col('created_at'))
    tweets = tweets.withColumn(
        'text_clean',
        regexp_replace(
            tweets['text_original'],
            '(#\w+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)',
            ' '))
    TextBlob_udf = udf(sentiment_calc)
    tweets = tweets.withColumn("sentiment_score", TextBlob_udf(tweets.text_clean).cast('double'))
    tweets = tweets.withColumn("sentiment_score_rounded", func.round(tweets['sentiment_score'], 1))
    datefunc = udf(
        lambda x: datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y'),
        TimestampType())
    tweets = tweets.withColumn(
        'created_at_PST_onestep',
        func.from_utc_timestamp(datefunc(tweets['created_at']), "PST"))
def dim_user(self, tweets: DataFrame) -> DataFrame:
    dim_user = (
        tweets
        .withColumn(
            "user_created_at",
            F.from_utc_timestamp(
                F.to_timestamp(
                    F.regexp_replace(
                        F.regexp_replace("user_created_at", "^[A-Za-z]{3} ", ""),
                        "\+0000 ", ""),
                    "MMM dd HH:mm:ss yyyy"
                ),
                "GMT-3"
            )
        )
        .select(
            "user_id",
            "user_name",
            "user_description",
            F.col("user_verification").cast(BooleanType()),
            F.col("user_followers_count").cast(IntegerType()),
            F.col("user_friends_count").cast(IntegerType()),
            "user_created_at",
            "user_location",
        )
    )
    return dim_user
def adjust_timezone(df):
    """Adjust timezone to KST from UTC

    :param df: Input DataFrame
    :return: Output DataFrame
    """
    return (
        df
        .withColumn(
            'timestamp',
            from_utc_timestamp(
                unix_timestamp(df['timestamp'], "yyyy-MM-dd'T'HH:mm:ss.SSSXXX").cast('timestamp'),
                'KST')))
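A minimal usage sketch for the helper above; the one-row DataFrame and sample timestamp string are hypothetical, and the code assumes adjust_timezone is defined in the same module.

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_utc_timestamp, unix_timestamp

spark = SparkSession.builder.getOrCreate()

# Hypothetical one-row frame with an ISO-8601 UTC timestamp string.
df = spark.createDataFrame([('2020-01-01T00:00:00.000Z',)], ['timestamp'])

# Shift the parsed UTC instant to Korean local time (UTC+9); the exact displayed value
# depends on the session time zone and on how your Spark version resolves 'KST'.
adjust_timezone(df).show(truncate=False)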
def convert_timestamp_to_date(self, df):
    """
    TODO find out if it is necessary to convert to timestamp, i.e. what the server's timestamp is.
    CET : Central European Time : UTC + 1
    CEST: Central European Summer Time : UTC + 2
    """
    utc_to_time_udf = udf(utc_to_time)
    df.show(10)
    UTC = df.withColumn("sessionstarttime", utc_to_time_udf("sessionstarttime"))
    # print("SHOULD BE IN LOCAL TIME")
    UTC.show(10)
    return UTC.withColumn("sessionstarttime", from_utc_timestamp("sessionstarttime", "CET"))  # CEST?
def utc_to_local(self, localTimeZone, subset=None):
    if subset is not None:
        tsCol = (
            x for x in self.inputSchema
            if str(x.dataType) == "TimestampType" and x.name in subset)
    else:
        tsCol = (x for x in self.inputSchema if str(x.dataType) == "TimestampType")
    for x in tsCol:
        self.inputDf = self.inputDf.withColumn(
            x.name, from_utc_timestamp(col(x.name), localTimeZone))
    return self.inputDf
def ad_stream_translator(df):
    from pyspark.sql.types import StructType, StringType
    from pyspark.sql.functions import from_json, col, from_utc_timestamp, when

    payload_schema = (
        StructType()
        .add("clicked", StringType(), False)
        .add("auction_id", StringType(), False)
        .add("num_ads_bid", StringType(), False)
        .add("ad_id", StringType(), False)
        .add("ad_campaign_id", StringType(), False)
        .add("partner_domain_name", StringType(), False)
        .add("content_keyword", StringType(), False)
        .add("ad_content_id", StringType(), False)
        .add("ad_group_id", StringType(), False)
        .add("ad_display_placement", StringType(), False)
        .add("ad_destination_domain_id", StringType(), False)
        .add("partner_id", StringType(), False)
        .add("is_pwa", StringType(), False)
        .add("user_uuid", StringType(), False)
        .add("timestamp", StringType(), False)
        .add("datestr", StringType(), True)
    )

    return (
        df.selectExpr("cast (data as STRING) jsonData")
        .select(from_json("jsonData", payload_schema).alias("payload"))
        .select(
            col("payload.clicked").cast("long").alias("clicked"),
            col("payload.auction_id").alias("auction_id"),
            col("payload.num_ads_bid").cast("long").alias("num_ads_bid"),
            col("payload.ad_id").cast("long").alias("ad_id"),
            col("payload.ad_campaign_id").cast("long").alias("ad_campaign_id"),
            col("payload.partner_domain_name").alias("partner_domain_name"),
            col("payload.content_keyword").alias("content_keyword"),
            col("payload.ad_content_id").cast("long").alias("ad_content_id"),
            col("payload.ad_group_id").alias("ad_group_id"),
            col("payload.ad_display_placement").alias("ad_display_placement"),
            col("payload.ad_destination_domain_id").cast("long").alias("ad_destination_domain_id"),
            col("payload.partner_id").cast("long").alias("partner_id"),
            when(col("payload.is_pwa") == "True", True)
            .when(col("payload.is_pwa") == "False", False)
            .alias("is_pwa"),
            col("payload.user_uuid").alias("user_uuid"),
            from_utc_timestamp("payload.timestamp", "UTC").alias("timestamp"),
            col("payload.datestr").alias("datestr")
        )
    )
def changeTimezone(self, fromTimezone, toTimezone, subset=None):
    if subset is not None:
        tsCol = (
            x for x in self.inputSchema
            if str(x.dataType) == "TimestampType" and x.name in subset)
    else:
        tsCol = (x for x in self.inputSchema if str(x.dataType) == "TimestampType")
    for x in tsCol:
        self.inputDf = self.inputDf.withColumn(
            x.name, to_utc_timestamp(col(x.name), fromTimezone))
        self.inputDf = self.inputDf.withColumn(
            x.name, from_utc_timestamp(col(x.name), toTimezone))
    return self.inputDf
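The two-step pattern used by changeTimezone (normalize with to_utc_timestamp, then re-localize with from_utc_timestamp) can also be sketched standalone; the one-row DataFrame and the zones below are made up for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_utc_timestamp, from_utc_timestamp

spark = SparkSession.builder.getOrCreate()

# Hypothetical wall-clock reading recorded in New York local time.
df = spark.createDataFrame([('2020-06-01 09:00:00',)], ['ts']) \
    .withColumn('ts', col('ts').cast('timestamp'))

# Normalize to UTC, then re-express in Tokyo time
# (09:00 EDT -> 22:00 JST when the session time zone is UTC).
converted = df.withColumn('ts', to_utc_timestamp(col('ts'), 'America/New_York')) \
              .withColumn('ts', from_utc_timestamp(col('ts'), 'Asia/Tokyo'))
converted.show(truncate=False)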
def read_pcap_csv(filename, spark_session):
    schema = build_csv_schema()
    df = spark_session.read.csv(
        filename,
        header=True,
        sep=";",
        schema=schema
    )
    df = df \
        .withColumnRenamed("frame.time", "time") \
        .withColumnRenamed("frame.time_epoch", "time_epoch") \
        .withColumnRenamed("_ws.col.Source", "Source") \
        .withColumnRenamed("_ws.col.Destination", "Destination") \
        .withColumnRenamed("_ws.col.Protocol", "Protocol") \
        .withColumnRenamed("frame.len", "frame_len")
    df = df.withColumn('parsed_time', F.from_utc_timestamp(df['time_epoch'].cast(TimestampType()), 'UTC'))
    df = df.withColumn("id", F.monotonically_increasing_id())
    return df.orderBy("parsed_time")
def process_log_data(spark, input_data, output_data):
    '''
    This function reads log data from JSON files, extracts columns for the users, time and
    songplays tables, and writes their data into parquet files.

    The following steps are performed:
    1. Specify the path of the log data JSON files as "s3a://udacity-dend/log_data/2018/11/*.json".
       All JSON files located inside subdirectories of "s3a://udacity-dend/log_data/2018/11"
       are read into a dataframe.
    2. Filter the dataframe from step 1 to keep only rows with 'page' equal to 'NextSong', and
       drop records that have an NA value in any of the following columns: 'ts', 'song',
       'userId', 'lastName' or 'firstName'.
    3. Call process_users_table() to process the users table.
    4. Add new columns to the log dataframe, then call process_time_table() to process the time table.
    5. Call process_songplays_table() to process the songplays table.
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/2018/11/*.json"

    # read log data file
    df = read_log_data(spark, log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong').dropna(
        subset=['ts', 'song', 'userId', 'firstName', 'lastName'])

    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", F.from_unixtime(df.ts / 1000))

    # create datetime column from original timestamp column
    df = df.withColumn("datetime", F.from_utc_timestamp(df["timestamp"], 'PST'))

    # process users table
    process_users_table(df, output_data)

    # process time table
    process_time_table(df, output_data)

    # process songplays table
    process_songplays_table(df, output_data)
def fact_tweet(self, tweets: DataFrame) -> DataFrame:
    assert type(tweets) is DataFrame
    fact_tweet = (
        tweets
        .withColumn(
            "created_at",
            F.from_utc_timestamp(
                F.to_timestamp(
                    F.regexp_replace(
                        F.regexp_replace("created_at", "^[A-Za-z]{3} ", ""),
                        "\+0000 ", ""),
                    "MMM dd HH:mm:ss yyyy"
                ),
                "GMT-3"
            )
        )
        .withColumn(
            "hashtags",
            F.when(
                F.col("hashtags") != '',
                F.regexp_replace("hashtags", ",\s$", "")
            )
            .otherwise(None)
        )
        .withColumn("created_at_partition_year", F.date_format("created_at", "yyyy"))
        .withColumn("created_at_partition_month", F.date_format("created_at", "MM"))
        .withColumn("created_at_partition_day", F.date_format("created_at", "dd"))
        .withColumn("created_at_partition_hour", F.date_format("created_at", "HH"))
        .select(
            "tweet_id",
            "created_at",
            "text",
            "hashtags",
            F.col("retweet_count").cast(IntegerType()),
            F.col("possibly_sensitive").cast(BooleanType()),
            "lang",
            "user_id",
            "created_at_partition_year",
            "created_at_partition_month",
            "created_at_partition_day",
            "created_at_partition_hour",
        )
    )
    return fact_tweet
def add_cyclical_features(df, source_column, target_column):
    df = df.withColumn(target_column, F.from_utc_timestamp(F.col(source_column), "UTC"))
    hour = target_column + "_hour"
    weekday = target_column + "_weekday"
    month = target_column + "_month"
    year = target_column + "_year"
    df = df.withColumn(hour, F.hour(F.col(target_column)))
    df = df.withColumn(weekday, F.dayofweek(F.col(target_column)) - 1)
    df = df.withColumn(month, F.month(F.col(target_column)) - 1)
    df = df.withColumn("DerivedTime_cos_" + hour, F.cos(F.col(hour) * (2.0 * np.pi / 24)) + 1)
    df = df.withColumn("DerivedTime_sin_" + hour, F.sin(F.col(hour) * (2.0 * np.pi / 24)) + 1)
    df = df.withColumn("DerivedTime_cos_" + weekday, F.cos(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
    df = df.withColumn("DerivedTime_sin_" + weekday, F.sin(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
    df = df.withColumn("DerivedTime_cos_" + month, F.cos(F.col(month) * (2.0 * np.pi / 12)) + 1)
    df = df.withColumn("DerivedTime_sin_" + month, F.sin(F.col(month) * (2.0 * np.pi / 12)) + 1)
    df = df.withColumn("DerivedTime_" + year, F.year(F.col(target_column)))
    drop_cols = [source_column, hour, weekday, month]
    df = df.drop(*drop_cols)
    return df
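A small usage sketch for the cyclical-encoding helper above, assuming add_cyclical_features (and its np/F module-level imports) is in scope; the event DataFrame, column names and timestamp are hypothetical.

import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical input: one event with a UTC timestamp column.
events = spark.createDataFrame([('2021-03-15 18:30:00',)], ['event_time']) \
    .withColumn('event_time', F.col('event_time').cast('timestamp'))

# Derive sin/cos encodings of hour, weekday and month (plus the year) from event_time;
# the intermediate hour/weekday/month columns and the source column are dropped.
encoded = add_cyclical_features(events, source_column='event_time', target_column='event_local')
encoded.show(truncate=False)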
import pyspark.sql.functions as f

reader = spark.read

# ok we can get the table
label_editors = reader.parquet(
    "/user/nathante/ores_bias_data/ores_label_editors")

# get the edit history
mw_hist = reader.table("wmf.mediawiki_history")
mw_hist = mw_hist.filter(f.col("snapshot") == "2019-03")
mw_hist = mw_hist.filter(f.col("event_entity") == "revision")
mw_hist = mw_hist.select(
    ["revision_id", "event_timestamp", "event_user_id", "wiki_db"])
mw_hist = mw_hist.withColumn(
    "timestamp", f.from_utc_timestamp(f.col("event_timestamp"), tz="utc"))

# we want to identify newcomers and anons
# anons are just the folks without a userid
label_editors = label_editors.withColumn("is_anon", (f.col('userid') == 0))

# newcomers are not anons
non_anons = label_editors.filter(f.col("is_anon") == False)

# find the edits by the editors
edit_histories = non_anons.join(mw_hist, on=[
    non_anons.userid == mw_hist.event_user_id,
    non_anons.wiki == mw_hist.wiki_db,
    non_anons.revid >= mw_hist.revision_id
])
def split(intersections, csv_report):
    logging.info('Splitting started')

    if csv_report:
        if cfg.reporting.use_uuid:
            uuid = uuid4()
            reporter = CSVReport(cfg.reporting.csv_dir, uuid)
        else:
            reporter = CSVReport(cfg.reporting.csv_dir, None)
    else:
        reporter = ExcelReport(cfg.reporting.file)

    logging.info('Spark initialization')
    sc = SparkContext(cfg.spark.master, 'map_test: split')
    sqlContext = SQLContext(sc)

    logging.info('Source file reading')
    df = sqlContext.read.json(cfg.splitting.source_file)
    df = df.withColumn("Date", F.from_utc_timestamp("eventTime", "UTC"))
    users_with_event_count = df.groupBy(F.col("entityId").alias("user")).count()

    logging.info('Filter users with small number of events')
    min_events = 10
    users_with_few_events = (users_with_event_count
                             .filter("count < %d" % (min_events))
                             .select(F.col("user").alias("user_with_few_events")))
    ndf = df.join(users_with_few_events,
                  F.col("entityId") == F.col("user_with_few_events"),
                  how="left_outer")
    df1 = ndf.filter("user_with_few_events is NULL").drop("user_with_few_events")

    logging.info('Split data into train and test')
    train_df, test_df = split_data(df)
    train_df.write.json(cfg.splitting.train_file, mode="overwrite")
    test_df.write.json(cfg.splitting.test_file, mode="overwrite")
    train_df = train_df.select("entityId", "event", "targetEntityId").cache()
    test_df = test_df.select("entityId", "event", "targetEntityId").cache()

    logging.info('Calculation of different stat metrics of datasets')
    events_by_type = (df
                      .groupBy("event")
                      .count()
                      .select(F.col("event"), F.col("count").alias("count_total"))
                      .toPandas())
    events_by_type_test = (test_df
                           .groupBy("event")
                           .count()
                           .select(F.col("event"), F.col("count").alias("count_test"))
                           .toPandas()
                           .set_index("event"))
    events_by_type_train = (train_df
                            .groupBy("event")
                            .count()
                            .select(F.col("event"), F.col("count").alias("count_train"))
                            .toPandas()
                            .set_index("event"))
    unique_users_by_event = (df
                             .select(F.col("entityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_users_total"))
                             .toPandas()
                             .set_index("event"))
    unique_users_by_event_train = (train_df
                                   .select(F.col("entityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_users_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_users_by_event_test = (test_df
                                  .select(F.col("entityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_users_test"))
                                  .toPandas()
                                  .set_index("event"))
    unique_items_by_event = (df
                             .select(F.col("targetEntityId"), F.col("event"))
                             .distinct()
                             .groupBy("event")
                             .count()
                             .select(F.col("event"), F.col("count").alias("unique_items_total"))
                             .toPandas()
                             .set_index("event"))
    unique_items_by_event_train = (train_df
                                   .select(F.col("targetEntityId"), F.col("event"))
                                   .distinct()
                                   .groupBy("event")
                                   .count()
                                   .select(F.col("event"), F.col("count").alias("unique_items_train"))
                                   .toPandas()
                                   .set_index("event"))
    unique_items_by_event_test = (test_df
                                  .select(F.col("targetEntityId"), F.col("event"))
                                  .distinct()
                                  .groupBy("event")
                                  .count()
                                  .select(F.col("event"), F.col("count").alias("unique_items_test"))
                                  .toPandas()
                                  .set_index("event"))

    logging.info('Calculate total counts')
    events = df.count()
    events_train = train_df.count()
    events_test = test_df.count()
    unique_users = df.select("entityId").distinct().count()
    unique_users_train = train_df.select("entityId").distinct().count()
    unique_users_test = test_df.select("entityId").distinct().count()
    unique_items = df.select(F.col("targetEntityId")).distinct().count()
    unique_items_train = train_df.select(F.col("targetEntityId")).distinct().count()
    unique_items_test = test_df.select(F.col("targetEntityId")).distinct().count()

    info_df = events_by_type
    dfs = [unique_users_by_event, unique_items_by_event,
           events_by_type_train, events_by_type_test,
           unique_users_by_event_train, unique_users_by_event_test,
           unique_items_by_event_train, unique_items_by_event_test]
    for data_frame in dfs:
        info_df = info_df.join(data_frame, on="event")
    n_rows, n_cols = info_df.shape

    # totals
    info_df.loc[n_rows] = ['ANY EVENT', events, unique_users, unique_items,
                           events_train, events_test,
                           unique_users_train, unique_users_test,
                           unique_items_train, unique_items_test]
    info_df.insert(4, 'events per user', info_df.ix[:, 1] / info_df.ix[:, 2])
    info_df.insert(5, 'events per item', info_df.ix[:, 1] / info_df.ix[:, 3])

    logging.info('Create event stat worksheet')
    reporter.start_new_sheet('Events stat')
    reporter.report(
        ['event', 'event count', 'unique users', 'unique items',
         'events per user', 'events per item',
         'event count train', 'event count test',
         'unique users train', 'unique users test',
         'unique items train', 'unique items test'],
        [column.tolist() for _, column in info_df.iteritems()],
        selected_rows=[next(info_df.iteritems())[1].tolist().index(cfg.testing.primary_event)],
        cfg=cfg)
    reporter.finish_sheet()

    if intersections:
        logging.info('Start intersections calculation')
        reporter.start_new_sheet('Intersections')
        columns_for_matrix = cfg.testing.events

        logging.info('Process train / train user intersection')
        train_train_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("entityId").alias("user"),
                                  F.col("event").alias("event_right")).distinct(),
                  on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtru = mk_intersection_matrix(train_train_users, columns_for_matrix)
        reporter.report(
            [''] + list(trtru.columns.values),
            [trtru.index.tolist()] + [column for _, column in trtru.iteritems()],
            title='Train / train user intersection')

        logging.info('Process train / test user intersection')
        train_test_users = (
            train_df
            .select(F.col("entityId").alias("user"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("entityId").alias("user"),
                                 F.col("event").alias("event_right")).distinct(),
                  on="user", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtsu = mk_intersection_matrix(train_test_users, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsu.columns.values),
            [trtsu.index.tolist()] + [column for _, column in trtsu.iteritems()],
            title='Train / test user intersection')

        logging.info('Process train / train item intersection')
        train_train_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(train_df.select(F.col("targetEntityId").alias("item"),
                                  F.col("event").alias("event_right")).distinct(),
                  on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtri = mk_intersection_matrix(train_train_items, columns_for_matrix)
        reporter.report(
            [''] + list(trtri.columns.values),
            [trtri.index.tolist()] + [column for _, column in trtri.iteritems()],
            title='Train / train item intersection')

        logging.info('Process train / test item intersection')
        train_test_items = (
            train_df
            .select(F.col("targetEntityId").alias("item"), F.col("event").alias("event_left"))
            .distinct()
            .join(test_df.select(F.col("targetEntityId").alias("item"),
                                 F.col("event").alias("event_right")).distinct(),
                  on="item", how="inner")
            .groupBy(["event_left", "event_right"])
            .count()
            .collect())
        trtsi = mk_intersection_matrix(train_test_items, columns_for_matrix,
                                       horizontal_suffix=" train", vertical_suffix=" test")
        reporter.report(
            [''] + list(trtsi.columns.values),
            [trtsi.index.tolist()] + [column for _, column in trtsi.iteritems()],
            title='Train / test item intersection')

    reporter.report_config(cfg)
    reporter.finish_document()
    logging.info('Splitting finished successfully')
 .show(5))

# COMMAND ----------

withDate = filtered.withColumn('date', func.date_format('timestamp', 'MM/dd/yyyy'))

withDate.printSchema()
withDate.select('title', 'timestamp', 'date').show(3)

# COMMAND ----------

# MAGIC %md
# MAGIC It seems like we want a different function for time zone manipulation and to store the object as a timestamp rather than a string. Let's use `from_utc_timestamp` to get a timestamp object back with the correct time zone.

# COMMAND ----------

withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))

withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase. We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))
def process_sdf(sdf_drive, sdf_vehicle):
    sdf_join_drive_vehicle = sdf_drive.alias("drive").join(sdf_vehicle.alias("vehicle"), ["vehicle_id"])
    sdf_join_drive_vehicle_fillna = sdf_join_drive_vehicle.fillna(0)

    sdf_drive_start_of_week = sdf_join_drive_vehicle_fillna.withColumn(
        "week_start_date",
        F.date_sub(F.next_day(F.from_utc_timestamp(F.col("datetime"), "America/New_York"), 'monday'), 7))

    sdf_Active_horsepower = sdf_drive_start_of_week.withColumn(
        "Active_horsepower",
        (F.col("eng_load") / 255) * (F.col("max_torque") * F.col("rpm")) / 5252)

    # Horsepower utilization – Active horsepower / Max Horsepower
    sdf_Horsepower_utilization = sdf_Active_horsepower.withColumn(
        "Horsepower_utilization", F.col("Active_horsepower") / F.col("max_horsepower"))

    # Torque Utilization - calculated as Engine load / 255
    sdf_Torque_Utilization = sdf_Horsepower_utilization.withColumn(
        "Torque_Utilization", F.col("eng_load") / 255)

    # RPM Utilization – RPM / Maximum horsepower rpm
    sdf_RPM_Utilization = sdf_Torque_Utilization.withColumn(
        "RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm"))

    sdf_engine_features = sdf_RPM_Utilization \
        .withColumn("ft_torque_util_60pct_s",
                    F.when((F.col("Torque_Utilization") >= 0.6) & (F.col("Torque_Utilization") < 0.7),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_torque_util_70pct_s",
                    F.when((F.col("Torque_Utilization") >= 0.7) & (F.col("Torque_Utilization") < 0.8),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_torque_util_80pct_s",
                    F.when((F.col("Torque_Utilization") >= 0.8) & (F.col("Torque_Utilization") < 0.9),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_torque_util_90pct_s",
                    F.when((F.col("Torque_Utilization") >= 0.9) & (F.col("Torque_Utilization") < 1),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_horsepower_util_50pct_s",
                    F.when((F.col("Horsepower_utilization") >= 0.5) & (F.col("Horsepower_utilization") < 0.6),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_horsepower_util_60pct_s",
                    F.when((F.col("Horsepower_utilization") >= 0.6) & (F.col("Horsepower_utilization") < 0.7),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_horsepower_util_70pct_s",
                    F.when((F.col("Horsepower_utilization") >= 0.7) & (F.col("Horsepower_utilization") < 0.8),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_horsepower_util_80pct_s",
                    F.when((F.col("Horsepower_utilization") >= 0.8) & (F.col("Horsepower_utilization") < 0.9),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_rpm_util_50pct_s",
                    F.when((F.col("RPM_Utilization") >= 0.5) & (F.col("RPM_Utilization") < 0.6),
                           F.lit(1)).otherwise(F.lit(0))) \
        .withColumn("ft_rpm_util_60pct_s",
                    F.when((F.col("RPM_Utilization") >= 0.6) & (F.col("RPM_Utilization") < 0.7),
                           F.lit(1)).otherwise(F.lit(0)))

    sdf_engine_features_total = sdf_engine_features.select(
        "vehicle_id", "week_start_date", "datetime",
        "ft_torque_util_60pct_s", "ft_torque_util_70pct_s", "ft_torque_util_80pct_s", "ft_torque_util_90pct_s",
        "ft_horsepower_util_50pct_s", "ft_horsepower_util_60pct_s", "ft_horsepower_util_70pct_s", "ft_horsepower_util_80pct_s",
        "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")

    sdf_sdf_engine_features_agg = sdf_engine_features_total.groupBy("vehicle_id", "week_start_date") \
        .agg(F.sum("ft_torque_util_60pct_s").alias("ft_torque_util_60pct_s"),
             F.sum("ft_torque_util_70pct_s").alias("ft_torque_util_70pct_s"),
             F.sum("ft_torque_util_80pct_s").alias("ft_torque_util_80pct_s"),
             F.sum("ft_torque_util_90pct_s").alias("ft_torque_util_90pct_s"),
             F.sum("ft_horsepower_util_50pct_s").alias("ft_horsepower_util_50pct_s"),
             F.min("ft_horsepower_util_60pct_s").alias("ft_horsepower_util_60pct_s"),
             F.min("ft_horsepower_util_70pct_s").alias("ft_horsepower_util_70pct_s"),
             F.min("ft_horsepower_util_80pct_s").alias("ft_horsepower_util_80pct_s"),
             F.min("ft_rpm_util_50pct_s").alias("ft_rpm_util_50pct_s"),
             F.min("ft_rpm_util_60pct_s").alias("ft_rpm_util_60pct_s"))

    sdf_sdf_engine_features_final = sdf_sdf_engine_features_agg.select(
        "vehicle_id", "week_start_date",
        "ft_torque_util_60pct_s", "ft_torque_util_70pct_s", "ft_torque_util_80pct_s", "ft_torque_util_90pct_s",
        "ft_horsepower_util_50pct_s", "ft_horsepower_util_60pct_s", "ft_horsepower_util_70pct_s", "ft_horsepower_util_80pct_s",
        "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s")
    sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.sort(F.col("vehicle_id"), F.col("week_start_date"))
    sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.withColumn(
        "week_start_date", F.date_format(F.col("week_start_date"), "yyyy-MM-dd"))
    sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.fillna(0)

    return sdf_sdf_engine_features_final
                      aws_secret_access_key=secret_key,
                      region_name="us-west-2")

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day + ""
tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "), "US/Pacific"))

df = dfPT.withColumn("year", sf.split("createdDate", "\-")[0]) \
    .withColumn("month", sf.split("createdDate", "\-")[1]) \
    .withColumn("day", sf.split((sf.split((sf.split("createdDate", "\-")[2]), "T")[0]), " ")[0])

dfbaseData = df.select(
    [col for col in df.columns if not col.startswith("xmlns")])
# dfbaseData.show(10,False)

dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
    .where(sf.col("row_num") == 1) \
    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"), sf.col("month"), sf.col("day")) \
park_data = spark.read.format("csv").option("inferSchema", 'true').load("dbfs:/FileStore/tables/AllFiltParkPred.txt")
park_data = park_data.selectExpr("_c0 as name", "_c1 as free", "_c2 as weekday", "_c3 as month", "_c4 as day", "_c5 as year", "_c6 as hour")
park_data = park_data.withColumn("hour", substring("hour", 0, 2))
park_data = park_data.withColumn("hour", park_data["hour"].cast(IntegerType()))
park_data.createOrReplaceTempView("dparking")
# display(park_data)
# park_data = park_data.withColumn("")

# COMMAND ----------

import pandas as pd
from pyspark.sql.functions import from_utc_timestamp

weather = spark.read.format("csv").option("header", "true").option("multiline", 'true').option("inferSchema", 'true').load("dbfs:/FileStore/tables/weather.csv")
weather = weather.withColumn('date', weather.date.cast('timestamp'))
weather = weather.withColumn('date', from_utc_timestamp(weather.date, "CET"))
# weather = weather.withColumn("date",substring("date",0,2))
# weather = weather.withColumn("date", unix_timestamp(weather["date"], "yyyy-MM-dd hh"))
display(weather)
weather.createOrReplaceTempView("weather")

weather_pd = weather.toPandas()
weather_pd["date"] = pd.to_datetime(weather_pd["date"], format="%Y-%m-%d-%H")
weather_pd = weather_pd.set_index('date')
# weather_pd["temperature"]

# COMMAND ----------

# PAY CLOSE ATTENTION: SOME DATA IS MISSING FROM THE WEATHER FILE! When joining with the other
# data, make sure that rows with no matching weather data are discarded.
check_weather = spark.sql("select * from weather where month(date) = 08")
display(check_weather)  # data from August 5 to 10 is clearly missing, but also elsewhere
client = boto3.client('glue', region_name='us-west-2')

######################################
####       CONNECTION BLOCK       ####
######################################

## tire connection
tire_DS = glueContext.create_dynamic_frame.from_catalog(
    database="staging_combined",
    table_name="tire",
    transformation_ctx="tire_DS")
tire_regDF = tire_DS.toDF()
tire_regDF = tire_regDF.withColumn(
    'sourcesystemtimestamp',
    from_utc_timestamp('rovertimestamp', 'America/New_York')).withColumn(
        'sourcesystemdate',
        to_date(from_utc_timestamp('rovertimestamp', 'America/New_York')))
tire_noDel = tire_regDF.filter(tire_regDF['dboperationtype'] != 'D')
tire_drop = tire_noDel.drop('dboperationtype').drop('audtdateadded')
tire_distDF = tire_drop.distinct()
tire_dynDF = DynamicFrame.fromDF(tire_distDF, glueContext, "nested")

## tiredetail connection
tiredet_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_combined",
    table_name="tiredetail",
    transformation_ctx="tiredet_ds")
tiredet_regDF = tiredet_ds.toDF()
tiredet_regDF = tiredet_regDF.withColumn(
    'sourcesystemtimestamp',
def main():
    """Main function"""

    # Get args
    args = get_args()

    # container
    container_in = args.container_in
    container_out = args.container_out

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage

    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    oauth_login = args.oauth_login
    oauth_client_id = args.oauth_client_id
    oauth_client_secret = args.oauth_client_secret

    # requires hadoop 3.2+
    # azure_oauth = {
    #     "endpoint": oauth_login,
    #     "client-id": oauth_client_id,
    #     "client-secret": oauth_client_secret
    # }
    azure_oauth = False

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Date, state
    country = args.country
    state = args.state

    # process config
    roam_dist_stops = args.roam_dist_stops
    roam_dist_events = args.roam_dist_events

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/stoplocation-v8_r70-s5-a70-h6/country={country}/year=2020/"
    if azure_oauth:
        # we can leverage abfss
        blob_in = f"abfss://{container_in}@{storage_account_name}.dfs.core.windows.net/stoplocation-v8_r70-s5-a70-h6/country={country}/year=2020/"
    timezones_in = "abfss://[email protected]/utils_states_timezones/"
    path_out_distinct = f"distinct_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m/country={country}"
    path_out_all = f"all_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m/country={country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts, azure_oauth)

    # set prop for handling partition columns as strings (fixes prefixes as int)
    conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    # build keys, date is mandatory, prefix opt
    partition_key = f"state={state}"

    print("process " + partition_key)
    start_time = time.time()
    local_dir = LOCAL_PATH + partition_key
    print("write temp to " + local_dir)

    # cleanup local if exists
    if (os.path.isdir(local_dir)):
        map(os.unlink, (os.path.join(local_dir, f) for f in os.listdir(local_dir)))

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    dfs = spark.read.format("parquet").load(blob_in)
    dfs_timezones = spark.read.format("parquet").load(timezones_in)

    # apply partition filter
    dfs_state = dfs.where(f"state = '{state}'")

    print("processing with spark")
    spark_time = time.time()

    w = Window().partitionBy('userId').orderBy('begin')
    dfs_state = add_distance_column(dfs_state, order_column='begin')
    dfs_state = dfs_state.fillna(0, subset=['next_travelled_distance'])
    dfs_state = dfs_state.withColumn('lag_next_travelled_distance',
                                     F.lag(col('next_travelled_distance')).over(w))
    dfs_state = dfs_state.withColumn('lag_end', F.lag('end').over(w))
    dfs_state = dfs_state.withColumn('rn', F.when(
        ((col('lag_next_travelled_distance') != col('prev_travelled_distance')) |
         (col('prev_travelled_distance') > 0) |
         (col('lag_next_travelled_distance') > 0) |
         (col('distance_prev') > roam_dist_events) |
         ((F.dayofyear(col('begin')) - F.dayofyear(col('lag_end')) == 1) & (F.hour(col('begin')) < 6))
         ) &
        ((col('lag_end').isNull()) | (col('lag_end') < col('begin'))), 1).otherwise(0))

    # Remove prev_travelled_distance when rn == 0 (it happens when lag_end and begin overlap)
    dfs_state = dfs_state.withColumn('prev_travelled_distance', F.when(
        col('rn') == 0, 0).otherwise(col('prev_travelled_distance')))

    w = Window().partitionBy('userId').orderBy('begin').rangeBetween(Window.unboundedPreceding, 0)
    dfs_state = dfs_state.withColumn('group', F.sum('rn').over(w))

    dfs_state = dfs_state.groupBy('userId', 'group', 'state').agg(
        F.mean('latitude').alias('latitude'),
        F.mean('longitude').alias('longitude'),
        F.min('begin').alias('begin'),
        F.max('end').alias('end')).drop('group')

    dfs_destinations = get_destinations(dfs_state, roam_dist=roam_dist_stops)
    dfs_destinations = dfs_destinations.withColumn('prefix', dfs_destinations.userId.substr(1, 2))
    dfs_destinations = dfs_destinations.withColumn('dayofyear', F.dayofyear('begin'))
    dfs_destinations = dfs_destinations.withColumn('year', F.year('begin'))
    # dfs_destinations = dfs_destinations.withColumn('state', F.lit(state))

    # Local time
    dfs_destinations.createOrReplaceTempView("dfs_destinations")
    dfs_destinations = spark.sql("""
        SELECT dfs_destinations.*, geohash(clusterLatitude, clusterLongitude, 7) as geohash7
        from dfs_destinations
        """)
    dfs_destinations = dfs_destinations.withColumn('geohash5', F.substring(col('geohash7'), 1, 5))
    dfs_destinations = dfs_destinations.join(F.broadcast(dfs_timezones), on='geohash5').drop('geohash5')
    dfs_destinations = dfs_destinations.withColumn('local_begin', F.from_utc_timestamp(col('begin'), col('tzid')))
    dfs_destinations = dfs_destinations.withColumn('offset', (
        (col('local_begin').cast('long') - col('begin').cast('long')) / 3600).cast('int')).drop('local_begin')

    dfs_destinations.persist(StorageLevel.DISK_ONLY)

    # Write
    local_dir_all = local_dir + "/all/"
    dfs_destinations_all = dfs_destinations.select(
        'prefix', 'userId', 'clusterId', 'begin', 'end', 'offset', 'year', 'dayofyear')
    dfs_destinations_all.repartition(256, "prefix", "year", "dayofyear").write.partitionBy(
        "prefix", "year", "dayofyear").format('parquet').mode('overwrite').save(local_dir_all)

    local_dir_distinct = local_dir + "/distinct/"
    dfs_destinations_distinct = dfs_destinations.select(
        'prefix', 'userId', 'clusterId', 'clusterLatitude', 'clusterLongitude', 'geohash7', 'year').distinct()
    dfs_destinations_distinct.repartition(256, "prefix", "year").write.partitionBy(
        "prefix", "year").format('parquet').mode('overwrite').save(local_dir_distinct)

    dfs_destinations.unpersist()

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts 1 "prefix/year"
    print(f"upload files for distinct")

    # upload with threads
    dfutures = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        years = [2020]
        s_key = f"state={state}"
        for fprefix in enumerate_prefixes():
            print(f"upload files for distinct: {fprefix}")
            prefix_dir = local_dir_distinct + "prefix=" + fprefix
            prefix_key = f"prefix={fprefix}"

            for fyear in years:
                f_dir = prefix_dir + "/year=" + str(fyear)
                f_key = prefix_key + "/year=" + str(fyear)

                # print(f"read files for distinct from {f_dir}")
                if (os.path.isdir(f_dir)):
                    files = [filename for filename in os.listdir(f_dir) if filename.startswith("part-")]

                    if len(files) > 0:
                        for file_local in files:
                            file_path = f_dir + "/" + file_local
                            part_num = int(file_local.split('-')[1])
                            part_key = '{:05d}'.format(part_num)
                            # fix name as static hash to be reproducible
                            filename_hash = hashlib.sha1(str.encode(f_key + s_key + part_key)).hexdigest()

                            blob_key = "{}/{}/{}/part-{}-{}.snappy.parquet".format(
                                path_out_distinct, f_key, s_key, part_key, filename_hash)

                            # print("upload " + file_path + " to " + container_out + ":" + blob_key)
                            # upload_blob(blob_service_client, container_out, blob_key, file_path)
                            future = executor.submit(
                                upload_blob, blob_service_client, container_out, blob_key, file_path)
                            dfutures.append(future)

                    # else:
                    #     print(f"no files to upload for {f_key}")
                # else:
                #     print(f"missing partition for {f_key}")

        # end of loop, wait for futures
        for future in dfutures:
            bkey = future.result()

    # ensure we wait all tasks
    # TODO check if all done
    ddone = concurrent.futures.wait(dfutures)

    # upload parts 2 "prefix/year/dayofyear"
    print(f"upload files for all")

    years = [2020]
    s_key = f"state={state}"
    # upload with threads
    afutures = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        for fprefix in enumerate_prefixes():
            print(f"upload files for all: {fprefix}")
            prefix_dir = local_dir_all + "prefix=" + fprefix
            prefix_key = f"prefix={fprefix}"

            for fyear in years:
                f_dir = prefix_dir + "/year=" + str(fyear)
                f_key = prefix_key + "/year=" + str(fyear)

                # print(f"read files for all from {f_dir}")
                for fday in range(1, 367):
                    d_dir = f_dir + "/dayofyear=" + str(fday)
                    d_key = f_key + "/dayofyear=" + str(fday)

                    # print(f"read files for all from {d_dir}")
                    if (os.path.isdir(d_dir)):
                        files = [filename for filename in os.listdir(d_dir) if filename.startswith("part-")]

                        if len(files) > 0:
                            for file_local in files:
                                file_path = d_dir + "/" + file_local
                                part_num = int(file_local.split('-')[1])
                                part_key = '{:05d}'.format(part_num)
                                # fix name as static hash to be reproducible
                                filename_hash = hashlib.sha1(str.encode(d_key + s_key + part_key)).hexdigest()

                                blob_key = "{}/{}/{}/part-{}-{}.snappy.parquet".format(
                                    path_out_all, d_key, s_key, part_key, filename_hash)

                                # print("upload " + file_path + " to " + container_out + ":" + blob_key)
                                # upload_blob(blob_service_client, container_out, blob_key, file_path)
                                future = executor.submit(
                                    upload_blob, blob_service_client, container_out, blob_key, file_path)
                                afutures.append(future)

                        # else:
                        #     print(f"no files to upload for {d_key}")
                    # else:
                    #     print(f"missing partition for {d_key}")

        # end of loop, wait for futures
        for future in afutures:
            bkey = future.result()

    # ensure we wait all tasks
    # TODO check if all done
    adone = concurrent.futures.wait(afutures)

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    shutdown_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format(
        int(end_time - start_time),
        int(spark_time - read_time),
        int(upload_time - spark_time),
        int(shutdown_time - upload_time),
        int(end_time - shutdown_time)
    ))
    print('Done.')
def process_data(data_path, database, table):
    '''
    Purpose: To process the API results using Spark and store them in HDFS
    '''
    try:
        logger('INFO', "LOADING the Data in Spark for Processing ")
        logger('INFO',
               "In case of java.lang.OutOfMemoryError Tune Spark Parameters in conf/proprties.yml")
        df = myspark.read.format("json").options(
            inferSchema=True,
            dateFormat="yyyy-MM-dd",
            timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSSZZ",
            ignoreLeadingWhiteSpace=True,
            ignoreTrailingWhiteSpace=True,
            path="/tmp/tempfiles/").load()

        logger('INFO', "Changing the Data Type to Timestamp for few Columns")
        df = df.withColumn(
            "created",
            F.from_utc_timestamp(df.created, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType())
        ).withColumn(
            "lastModified",
            F.from_utc_timestamp(df.lastModified, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType())
        ).withColumn(
            "lastAccessed",
            F.from_utc_timestamp(df.lastAccessed, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast(T.TimestampType()))
        df = df.withColumn(
            "created", F.date_format(df.created, "yyyy-MM-dd HH:mm:ss")
        ).withColumn(
            "lastModified", F.date_format(df.lastModified, "yyyy-MM-dd HH:mm:ss")
        ).withColumn(
            "lastAccessed", F.date_format(df.lastAccessed, "yyyy-MM-dd HH:mm:ss"))

        logger('INFO', "Choosing the 11 required Columns ")
        finaldf = df.select("owner", "parentPath", "originalName", "created",
                            "lastModified", "lastAccessed", "size", "sourceType",
                            "permissions", "group", "identity")
        logger('INFO',
               "%s Records will be LOADED into %s.%s Table " % (finaldf.count(), database, table))

        logger('INFO', "Create Table Command Prepared:")
        cmd = "CREATE TABLE IF NOT EXISTS %s.%s (owner STRING,parentPath STRING,originalName STRING,created timestamp,lastModified timestamp,lastAccessed timestamp,size BIGINT,sourceType STRING,permissions STRING,group STRING,identity BIGINT)STORED AS PARQUET" % (
            database, table)
        logger('INFO', cmd)
        logger('INFO', "Creating the Table %s.%s if NOT EXISTS" % (database, table))
        myspark.sql(cmd)

        finaldf.createOrReplaceTempView("mytable")
        logger('INFO', "INSERT INTO TABLE Table Command Prepared:")
        cmd = """INSERT INTO TABLE {database}.{table} SELECT owner,parentPath,originalName,created,lastModified,lastAccessed,size,sourceType,permissions,group,identity FROM mytable""".format(
            database=database, table=table)
        logger('INFO', cmd)
        logger('INFO', "Inserting the Data in %s.%s Table" % (database, table))
        myspark.sql(cmd)
        logger('INFO',
               "%s Records Inserted successfully in %s.%s" % (finaldf.count(), database, table))
    except Exception as e:
        logger('ERROR', "Job Failed with below Details !!")
        os.system("rm -f " + data_path + "/../pids/*.pid")
        logger('ERROR', "%s" % e)
        logger('ERROR', "Exiting !!")
        sys.exit()
def process_log_data(spark, input_data, output_data, song_df):
    """
    Description: This function can be used to process log-data files from the given input path,
    transform the data from JSON files into users, time and songplays spark tables, and write
    these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object.
        input_data: Path to the input JSON files.
        output_data: Path to the output directory that stores output parquet tables.
        song_df: Song data dataframe.

    Returns:
        None.
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/2018/11'

    # define schema for log data file
    log_schema = t.StructType([
        t.StructField("artist", t.StringType(), True),
        t.StructField("auth", t.StringType(), True),
        t.StructField("firstName", t.StringType(), True),
        t.StructField("gender", t.StringType(), True),
        t.StructField("itemInSession", t.IntegerType(), True),
        t.StructField("lastName", t.StringType(), True),
        t.StructField("length", t.DecimalType(12, 7), True),
        t.StructField("level", t.StringType(), True),
        t.StructField("location", t.StringType(), True),
        t.StructField("method", t.StringType(), True),
        t.StructField("page", t.StringType(), True),
        t.StructField("registration", t.DecimalType(16, 2), True),
        t.StructField("sessionId", t.IntegerType(), True),
        t.StructField("song", t.StringType(), True),
        t.StructField("status", t.IntegerType(), True),
        t.StructField("ts", t.LongType(), True),
        t.StructField("userAgent", t.StringType(), True),
        t.StructField("userId", t.StringType(), True)
    ])

    # read log data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(log_schema) \
        .load(log_data)

    # filter by actions for song plays
    df = df \
        .filter('page = "NextSong"')

    # group by userId for unique users
    users_list = df \
        .groupBy('userId') \
        .agg(f.max('ts').alias('ts'))

    # extract columns to create users table
    users_table = df \
        .join(users_list, ['userId', 'ts'], 'inner') \
        .select([df.userId.cast(t.IntegerType()).alias('user_id'),
                 col('firstName').alias('first_name'),
                 col('lastName').alias('last_name'),
                 'gender',
                 'level']) \
        .dropDuplicates()

    # write users table to parquet files
    users_output = output_data + 'users'
    users_table \
        .write \
        .option("path", users_output) \
        .saveAsTable('users', format='parquet')

    # create timestamp column from original timestamp column
    df = df \
        .withColumn('timestamp', f.from_utc_timestamp((df.ts / 1000.0).cast('timestamp'), 'UTC'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), t.TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    time_table = df \
        .select([col('datetime').alias('start_time'),
                 dayofmonth(col('datetime')).alias('day'),
                 weekofyear(col('datetime')).alias('week'),
                 month(col('datetime')).alias('month'),
                 year(col('datetime')).alias('year'),
                 dayofweek(col('datetime')).alias('weekday')]) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_output = output_data + 'time'
    time_table \
        .write \
        .partitionBy('year', 'month') \
        .option("path", time_output) \
        .saveAsTable('time', format='parquet')

    # join and extract columns from song and log datasets to create songplays table
    cond = [
        df.artist == song_df.artist_name,
        df.song == song_df.title,
        df.length == song_df.duration
    ]
    songplays_df = df.join(song_df, cond, 'left')

    songplays_df = songplays_df \
        .select(df.datetime.alias('start_time'),
                df.userId.alias('user_id'),
                df.level.alias('level'),
                song_df.song_id.alias('song_id'),
                song_df.artist_id.alias('artist_id'),
                df.sessionId.alias('session_id'),
                df.location.alias('location'),
                df.userAgent.alias('user_agent'),
                year(df.datetime).alias('year'),
                month(df.datetime).alias('month'))

    w = Window().orderBy(f.lit('A'))
    songplays_table = songplays_df.withColumn('songplay_id', f.row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_output = output_data + 'songplays'
    songplays_table \
        .select(['songplay_id', 'start_time', 'user_id', 'level', 'song_id',
                 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month']) \
        .write \
        .partitionBy('year', 'month') \
        .option("path", songplays_output) \
        .saveAsTable('songplays', format='parquet')
seed = 21
# To draw a random 1% sample of the data
fraction = 0.01
tweets_sample = tweets.sample(fraction, seed)

print("Number of sampled tweets: {0}".format(tweets_sample.count()))

hiveContext.sql('DROP TABLE IF EXISTS tweets_sample')
hiveContext.registerDataFrameAsTable(tweets_sample, "tweets_sample")

# Look at the number of tweets at each hour of the day
tweets_timestamp = hiveContext.sql("SELECT created_at FROM tweets_sample")

tweets_timestamp = tweets_timestamp.withColumn("hour", lit(hour(from_utc_timestamp(tweets_timestamp.created_at, 'GMT+1'))))\
    .withColumn("day", lit(date_format(tweets_timestamp.created_at, 'MM-dd-YY')))

tweets_hour_day = tweets_timestamp.groupBy("hour", "day")\
    .agg({"created_at": "count"})\
    .orderBy("count(created_at)", ascending=False)\
    .withColumnRenamed('count(created_at)', 'count')

print("Number of tweets in each hour of each day:\n")
tweets_hour_day.limit(20).show()

tweets_hour = tweets_hour_day.drop("day")\
    .groupBy("hour")\
    .agg({"count": "sum"})\
    .withColumnRenamed('sum(count)', 'tweets')\
    .orderBy('tweets', ascending=False)\
# MAGIC <img alt="Opens in new tab" src="https://files.training.databricks.com/static/images/external-link-icon-16x16.png"/> Watch full-screen.</a>
# MAGIC </div>

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Take a look at the server-side errors by hour to confirm the data meets your expectations. Visualize it by selecting the bar graph icon once the table is displayed. <br><br>
# MAGIC <div><img src="https://files.training.databricks.com/images/eLearning/ETL-Part-1/visualization.png" style="height: 400px" style="margin-bottom: 20px; height: 150px; border: 1px solid #aaa; border-radius: 10px 10px 10px 10px; box-shadow: 5px 5px 5px #aaa"/></div>

# COMMAND ----------

from pyspark.sql.functions import from_utc_timestamp, hour, col

countsDF = (serverErrorDF
            .select(hour(from_utc_timestamp(col("time"), "GMT")).alias("hour"))
            .groupBy("hour")
            .count()
            .orderBy("hour"))

display(countsDF)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC The distribution of errors by hour meets the expectations. There is an uptick in errors around midnight, possibly due to server maintenance at this time.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC ### Saving Back to DBFS
# MAGIC
time_df = spark.createDataFrame([(1428476400, )], ['unix_time'])
# Expected:
# +-------------------+
# |                 ts|
# +-------------------+
# |2015-04-08 00:00:00|
# +-------------------+

# Answer
time_df.select(F.from_unixtime('unix_time').alias('ts')).show()

spark.conf.unset("spark.sql.session.timeZone")

# COMMAND ----------

# You have this dataframe, with one column for the datetime (given in UTC) and a second
# for the time zone. Convert this datetime to local time, once for PST (Pacific Standard Time)
# and in the second case using the time zone column tz.
df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz'])
# Expected:
# +-------------------+-------------------+
# |           PST_time|         local_time|
# +-------------------+-------------------+
# |1997-02-28 02:30:00|1997-02-28 19:30:00|
# +-------------------+-------------------+

# Answer
df.select(
    F.from_utc_timestamp(df.ts, "PST").alias('PST_time'),
    F.from_utc_timestamp(df.ts, df.tz).alias('local_time')).show()

# COMMAND ----------
.config("spark.sql.sources.partitionOverwriteMode","dynamic") .getOrCreate()) client = boto3.client('s3',aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name="us-west-2") srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day +"/tradeline_*" tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Stage/Parquet/tradeLine/" dfjson = sparkSession.read.format("json").option("multiline", "true").option("inferSchema", "true").load(srcfilePath) #data = dfjson.select(explode("DATA").alias("data")) data = dfjson.withColumn("data", explode("DATA")).select("data.*") # dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss")) dfPT = data.withColumn("createdDatePT",sf.from_utc_timestamp(sf.regexp_replace(data.createdDate,"T"," "),"US/Pacific")) df = dfPT.withColumn("year",sf.split("createdDate","\-")[0]) \ .withColumn("month",sf.split("createdDate","\-")[1]) \ .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0]) dfbaseData = df.select([col for col in df.columns if not col.startswith("xmlns")]) #dfbaseData.show(10,False) # dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \ # .where(sf.col("row_num") == 1) \ # .select(dfbaseData["*"]) dfbaseData.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \ .write.format("parquet") \
def get_timestamp_in_tz(unixtime_timestamp, ts_format, tz):
    return F.from_utc_timestamp(F.from_unixtime(unixtime_timestamp, ts_format), tz)
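A brief usage sketch for the helper above, assuming get_timestamp_in_tz is in scope; the epoch value, format and target zone are made up for illustration.

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical frame with one epoch-seconds value (about 2015-04-08 07:00 UTC).
df = spark.createDataFrame([(1428476400,)], ['unix_time'])

# Render the epoch as a formatted string, then shift the parsed timestamp from UTC to Tokyo time;
# from_unixtime renders in the session time zone, so the result depends on that setting.
df.select(
    get_timestamp_in_tz(F.col('unix_time'), 'yyyy-MM-dd HH:mm:ss', 'Asia/Tokyo').alias('ts_tokyo')
).show(truncate=False)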
serverErrorDF = (logDF
                 .filter((col("code") >= 500) & (col("code") < 600))
                 .select("date", "time", "extention", "code")
                 )

# COMMAND ----------

display(serverErrorDF)
serverErrorDF.count()

# COMMAND ----------

from pyspark.sql.functions import from_utc_timestamp, hour, col

countsDF = (serverErrorDF
            .select(hour(from_utc_timestamp(col("time"), "GMT")).alias("hour"))
            .groupBy("hour")
            .count()
            .orderBy("hour"))

display(countsDF)

# COMMAND ----------

(serverErrorDF
 .write
 .mode("overwrite")
 .parquet("/tmp" + username + "/log20170329/serverErrorDF.parquet")
)

# COMMAND ----------
# save columns of entity
columns = series_list[0].keys()

# COMMAND ----------

# step through the day
# create dataframe from data, convert to spark DF for saving and type conversion
df = pd.DataFrame(series_list, columns=columns)
dfclean = df.drop_duplicates()
sparkDF = spark.createDataFrame(dfclean)

# COMMAND ----------

sparkDF = sparkDF.withColumn('updatedAt', F.from_utc_timestamp('updatedAt', 'CST')).withColumn(
    'createdAt', F.from_utc_timestamp('createdAt', 'CST'))

display(sparkDF.select("id", "updatedAt", "createdAt").orderBy(F.col("id")))

num_files = sparkDF.count()

# COMMAND ----------

print(num_files)

# COMMAND ----------

from pyspark.sql.types import *
@sf.udf(st.ArrayType(st.StringType()))
def flatten(hotels):
    return reduce(lambda x, y: x + y, hotels)

# COMMAND ----------

_searches = hotels_e_search \
    .filter((sf.col('day') >= startDate) & (sf.col('day') <= endDate)) \
    .filter(sf.col('vcid_key') == vcid) \
    .filter(sf.col('inputData.trackingId').isNotNull()) \
    .filter(sf.size(sf.col('outputData.rankList.ranks')) > 0) \
    .withColumn('sort', get_sort(sf.col('inputData.params'))) \
    .withColumn('filters', get_filters(sf.col('inputData.filterInput'))) \
    .withColumn('algo', get_algo(sf.col('outputData.augurConfigID'))) \
    .select(sf.col('day').alias('searchDay'),
            sf.from_utc_timestamp(sf.col('eventTime').cast(st.TimestampType()), 'IST').alias('searchTime'),
            sf.col('currentTime').alias('currentSearchTime'),
            sf.col('vcid_key').alias('vcid'),
            sf.col('inputData.flavour').alias('flavour'),
            sf.col('inputData.checkIn').alias('checkIn'),
            sf.col('inputData.checkOut').alias('checkOut'),
            sf.col('inputData.pax').alias('pax'),
            sf.col('inputData.userId').alias('userId'),
            sf.col('inputData.email').alias('email'),
            sf.col('inputData.trackingId').alias('trackingId'),
            sf.col('sort'),
            sf.col('filters'),
            sf.col('inputData.filterInput').alias('filterInput'),
            sf.col('algo'),
            sf.col('pid'),
            sf.col('outputData.rankList.ranks').alias('ranks'))
def ingest(self,
           src_resource=None, src_path=None, src_provider=None,
           dest_resource=None, dest_path=None, dest_provider=None,
           eventsourcing=False):
    logger = logging.getLogger()

    #### constants:
    now = datetime.now()
    reserved_cols = ['_ingested', '_date', '_state']

    #### Source metadata:
    md_src = data.metadata(src_resource, src_path, src_provider)
    if not md_src:
        logger.error("No metadata")
        return

    # filter settings from src (provider and resource)
    filter_params = utils.merge(
        md_src['provider'].get('read', {}).get('filter', {}),
        md_src['resource'].get('read', {}).get('filter', {}))

    #### Target metadata:
    # default path for destination is src path
    if (not dest_resource) and (not dest_path) and dest_provider:
        dest_path = md_src['resource']['path']

    md_dest = data.metadata(dest_resource, dest_path, dest_provider)
    if not md_dest:
        return

    if 'read' not in md_dest['resource']:
        md_dest['resource']['read'] = {}

    # match filter with the one from source resource
    md_dest['resource']['read']['filter'] = filter_params

    #### Read source resource
    try:
        df_src = self._read(md_src)
    except Exception as e:
        logger.exception(e)
        return

    #### Read destination schema info
    try:
        schema_path = '{}/schema'.format(md_dest['resource']['path'])
        md = data.metadata(path=schema_path, provider=dest_provider)
        df_schema = self._read(md)
        schema_date_str = df_schema.sort(desc("date")).limit(1).collect()[0]['id']
    except Exception as e:
        # logger.warning('source schema does not exist yet.')
        schema_date_str = now.strftime('%Y%m%dT%H%M%S')

    # destination path - append schema date
    dest_path = '{}/{}'.format(md_dest['resource']['path'], schema_date_str)
    md_dest['resource']['path'] = dest_path
    md_dest['url'] = data._url(md_dest)

    # if schema not present or schema change detected
    schema_changed = True
    try:
        df_dest = self._read(md_dest)

        # compare schemas
        df_src_cols = [x for x in df_src.columns if x not in reserved_cols]
        df_dest_cols = [x for x in df_dest.columns if x not in reserved_cols]
        schema_changed = df_src[df_src_cols].schema.json() != df_dest[df_dest_cols].schema.json()
    except Exception as e:
        # logger.warning('schema does not exist yet.')
        df_dest = df_src.filter("False")

    if schema_changed:
        # Different schema, update schema table with new entry
        schema_entry = (schema_date_str, now, df_src.schema.json())
        df_schema = self.context().createDataFrame([schema_entry], ['id', 'date', 'schema'])

        # write the schema to destination provider
        md = data.metadata(path=schema_path, provider=md_dest['resource']['provider'])
        self._write(df_schema, md, mode='append')

    # partitions
    partition_cols = ['_ingested']

    # init df_diff to empty dest dataframe
    df_diff = df_dest.filter("False")

    if not eventsourcing:
        if filter_params.get('policy') == 'date' and filter_params.get('column'):
            df_diff = dataframe_update(df_src, df_dest,
                                       updated_col='_ingested',
                                       eventsourcing=eventsourcing)
            df_diff = df_diff.withColumn(
                '_date',
                date_format(from_utc_timestamp(filter_params['column'], 'GMT+7'), 'yyyy-MM-dd'))
            partition_cols += ['_date']
            ingest_mode = 'append'
            options = {'mode': ingest_mode, 'partitionBy': partition_cols}
        else:
            df_diff = dataframe_update(df_src, df_dest.filter("False"),
                                       updated_col='_ingested',
                                       eventsourcing=eventsourcing)
            ingest_mode = 'overwrite'
            options = {'mode': ingest_mode, 'partitionBy': partition_cols}
    else:
        # to do
        logger.fatal('event sourcing not implemented yet')

    records_add = df_diff.filter("_state = 0").count()
    records_del = df_diff.filter("_state = 1").count()

    if records_add or records_del or schema_changed:
        md = data.metadata(path=dest_path, provider=md_dest['resource']['provider'])
        self._write(df_diff, md, **options)

    end = datetime.now()
    time_diff = end - now

    logdata = {
        'src_url': md_src['url'],
        'src_table': md_src['resource']['path'],
        'source_option': filter_params,
        'schema_change': schema_changed,
        'target': dest_path,
        'upserts': records_add,
        'deletes': records_del,
        'diff_time': time_diff.total_seconds()
    }
    logtype = {'dlf_type': '{}.{}'.format(self.__class__.__name__, func_name())}
    logger.info(logdata, extra=logtype)