from pyspark.sql.functions import element_at, json_tuple, split


def select_type2(df, *type2):
    """Select fields for log format; 155138

    :param df: Input DataFrame
    :param type2: A list of shopping_sites_id
    :return: Output DataFrame
    """
    # 'view' logs keep the product code inside the og:url value, so split the
    # URL on '/' and later take the last path segment as the code.
    stage_df = (
        df.filter(df.logtype.isin('view') & df.info.siteseq.isin(*type2))
        .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                json_tuple(df.custom, 'og:url', 'og:title')
                .alias('productCode', 'productName'))
        .withColumnRenamed('info.siteseq', 'siteseq'))
    stage_df = stage_df.withColumn('productCode',
                                   split(stage_df['productCode'], '/'))
    return (
        df.filter(df.logtype.isin('login', 'purchase', 'cart')
                  & df.info.siteseq.isin(*type2))
        .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                json_tuple(df.custom, 'productCode', 'productName')
                .alias('productCode', 'productName'))
        .withColumnRenamed('info.siteseq', 'siteseq')
        .unionAll(stage_df.select(
            'maid', 'siteseq', 'userid', 'timestamp', 'logtype',
            element_at(stage_df.productCode, -1).alias('productCode'),
            'productName')))
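# A tiny illustration (assuming an active SparkSession `spark`) of the
# split + element_at step above; the URL is hypothetical.
demo = spark.createDataFrame([('https://shop.example/goods/ABC123',)], ['url'])
demo.select(element_at(split('url', '/'), -1).alias('productCode')).show()
# productCode -> 'ABC123'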
def select_type1(df, *type1):
    """Select fields for log format; -48

    :param df: Input DataFrame
    :param type1: A list of shopping_sites_id
    :return: Output DataFrame
    """
    return (
        df.filter(df.logtype.isin('login', 'purchase')
                  & df.info.siteseq.isin(*type1))
        .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                json_tuple(df.custom, 'goodsCode', 'goodsName')
                .alias('productCode', 'productName'))
        .withColumnRenamed('info.siteseq', 'siteseq')
        .unionAll(
            df.filter(df.logtype.isin('cart') & df.info.siteseq.isin(*type1))
            .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                    json_tuple(df.custom, 'goodsCode', 'name')
                    .alias('productCode', 'productName'))
            .withColumnRenamed('info.siteseq', 'siteseq'))
        .unionAll(
            df.filter(df.logtype.isin('view') & df.info.siteseq.isin(*type1))
            .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                    json_tuple(df.custom, 'tas:productCode', 'og:title')
                    .alias('productCode', 'productName'))
            .withColumnRenamed('info.siteseq', 'siteseq')))
from pyspark.sql import DataFrame, functions
from pyspark.sql.functions import col, lit


def users_set_price_filter_in_euro(data: DataFrame):
    # Unpack event_content from the params JSON, then price/currency from
    # event_content, keeping only EUR-priced events.
    return data \
        .select("*", functions.json_tuple("params", "event_content", "page")
                .alias("event_content", "page")) \
        .select(col("session_id"), col("event_content")) \
        .filter(col("event_content") != lit("{}")) \
        .select("*", functions.json_tuple("event_content", "price", "currency")
                .alias("price", "currency")) \
        .drop(col("event_content")) \
        .filter(col("currency") == lit("EUR"))
def user_session_contains_request_id(data: DataFrame):
    # The first count() aggregates rows per session; the second returns the
    # number of sessions whose page payload carried a request_id.
    return data \
        .select("*", functions.json_tuple("params", "event_content", "page")
                .alias("event_content", "page")) \
        .select(col("session_id"), col("page")) \
        .filter(col("page") != lit("{}")) \
        .select("*", functions.json_tuple("page", "request_id")
                .alias("request_id")) \
        .drop(col("page")) \
        .filter(col("request_id") != lit("{}")) \
        .groupBy("session_id") \
        .count() \
        .count()
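# A minimal usage sketch for the two helpers above (assuming an active
# SparkSession `spark`); the session ids and params payloads are hypothetical.
events = spark.createDataFrame(
    [("s1", '{"event_content": "{\\"price\\": 10, \\"currency\\": \\"EUR\\"}", "page": "{}"}'),
     ("s2", '{"event_content": "{}", "page": "{\\"request_id\\": \\"r-42\\"}"}')],
    ["session_id", "params"])
users_set_price_filter_in_euro(events).show()
print(user_session_contains_request_id(events))  # -> 1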
import pyspark.sql.functions as fn


def process_row(df, epochId):
    df = df.na.drop()
    df = df.selectExpr("CAST(value AS STRING)")
    # Explode the microbatch into rows
    df = df.select(fn.explode(fn.split(df.value, "\n")).alias("value"))
    # Explode JSON into columns
    ds = df.select(
        fn.json_tuple('value', 'uuid', 'document_id', 'timestamp', 'platform',
                      'geo_location', 'traffic_source')
        .alias('uuid', 'document_id', 'timestamp', 'platform', 'geo_location',
               'traffic_source'))
    # Do processing
    ds = ds.na.drop()
    # Get top 3 websites with the highest user traffic
    ds1 = ds.groupby(fn.col('document_id')) \
        .agg(fn.count(fn.col('uuid')),
             fn.max(fn.col('timestamp').cast('string'))) \
        .withColumnRenamed("COUNT(uuid)", "users") \
        .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
        .orderBy(fn.col("users"), ascending=False).limit(3)
    # Get platform distribution
    ds2 = ds.groupby(fn.col('platform')) \
        .agg(fn.count(fn.col('uuid')),
             fn.max(fn.col('timestamp').cast('string'))) \
        .withColumnRenamed("COUNT(uuid)", "users") \
        .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
        .orderBy(fn.col("users"), ascending=False)
    # Get traffic distribution
    ds3 = ds.groupby(fn.col('traffic_source')) \
        .agg(fn.count(fn.col('uuid')),
             fn.max(fn.col('timestamp').cast('string'))) \
        .withColumnRenamed("COUNT(uuid)", "users") \
        .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
        .orderBy(fn.col("users"), ascending=False)
    ds.printSchema()
    # Write to db, skipping empty microbatches (db_properties is defined elsewhere)
    if not df.rdd.isEmpty():
        ds1.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views", mode="overwrite",
                       properties=db_properties)
        ds2.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views_platform", mode="overwrite",
                       properties=db_properties)
        ds3.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views_traffic", mode="overwrite",
                       properties=db_properties)
def process_row(df, epochId):
    df = df.na.drop()
    df = df.selectExpr("CAST(value AS STRING)")
    # Explode the microbatch into rows
    df = df.select(fn.explode(fn.split(df.value, "\n")).alias("value"))
    # Explode JSON into columns
    ds = df.select(
        fn.json_tuple('value', 'display_id', 'ad_id', 'clicked', 'uuid',
                      'timestamp', 'document_id', 'platform', 'geo_location')
        .alias('display_id', 'ad_id', 'clicked', 'uuid', 'timestamp',
               'document_id', 'platform', 'geo_location'))
    # Do processing: per-ad clicks, views and latest timestamp
    ds = ds.groupby(fn.col('ad_id')) \
        .agg(fn.max(fn.col('timestamp').cast('string')),
             fn.sum(fn.col('clicked')),
             fn.count(fn.col('clicked'))) \
        .withColumnRenamed("SUM(clicked)", "clicks") \
        .withColumnRenamed("COUNT(clicked)", "views") \
        .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp")
    ds.printSchema()
    # Write to db, skipping empty microbatches
    if not df.rdd.isEmpty():
        ds.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                      table="mytopic", mode="append",
                      properties=db_properties)
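# A minimal wiring sketch for either process_row handler above, assuming a
# Kafka source; the broker address and topic name are hypothetical, and
# db_properties must already be defined.
stream_df = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "clicks") \
    .load()
query = stream_df.writeStream \
    .foreachBatch(process_row) \
    .trigger(processingTime="10 seconds") \
    .start()
query.awaitTermination()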
from pyspark.sql.functions import (concat, json_tuple, lit, regexp_replace,
                                   struct, to_json)


def main(spark, kafka_conf):
    bootstrap_servers = ','.join(kafka_conf['bootstrap_servers'])
    tweet_stream = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", bootstrap_servers) \
        .option("subscribe", ','.join(kafka_conf['topics_request'])) \
        .load()
    # watermark so we don't keep track of data too long (we don't need it anyway)
    # tweet_stream = tweet_stream.withWatermark('timestamp', '20 seconds')
    tweet_stream = tweet_stream.select(
        concat(tweet_stream.topic, lit('-sentiment')).alias('topic'),
        json_tuple(tweet_stream.value.cast('string'), 'tweet', 'created_at'),
        'timestamp'
    ).withColumnRenamed('c0', 'tweet').withColumnRenamed('c1', 'created_at')
    # strip hashtags, mentions, URLs and non-alphanumerics before scoring;
    # sentiment_udf is defined elsewhere
    tweet_stream = tweet_stream.withColumn(
        "sentiment_score",
        sentiment_udf(
            regexp_replace(tweet_stream.tweet,
                           r'(#\w+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)',
                           ' ')))
    tweet_stream = tweet_stream.select(
        'topic',
        to_json(struct('sentiment_score', 'created_at', 'tweet')).alias('value'),
        'timestamp')
    # the Kafka sink requires a real checkpoint directory; the path below is a
    # placeholder (the original passed False, which is not a valid location)
    query = tweet_stream.writeStream.outputMode('append') \
        .format('kafka') \
        .option('checkpointLocation', '/tmp/tweet-sentiment-checkpoint') \
        .option('kafka.bootstrap.servers', bootstrap_servers) \
        .start()
    # run forever until dead
    query.awaitTermination()
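# A minimal invocation sketch; the broker and topic names are hypothetical,
# and sentiment_udf must be registered before main() runs.
if __name__ == '__main__':
    from pyspark.sql import SparkSession
    main(SparkSession.builder.appName('tweet-sentiment').getOrCreate(),
         {'bootstrap_servers': ['localhost:9092'],
          'topics_request': ['tweets-raw']})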
from pyspark.sql import SparkSession


def m2(brokers, topic):
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
    df = spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .option("startingOffsets", "earliest") \
        .load() \
        .withColumn("value", col("value").cast("string").alias("value")) \
        .select(col("value")) \
        .select(json_tuple(col("value"), "event_time", "clnt").alias("a", "b"))

    def transform(batch_df, batch_id):
        batch_df.show()

    df \
        .writeStream \
        .outputMode("update") \
        .format("console") \
        .foreachBatch(transform) \
        .trigger(processingTime="2 seconds") \
        .option("checkpointLocation", "/tmp/sb") \
        .option("truncate", "true") \
        .start() \
        .awaitTermination()
def select_default(df, *default):
    """Select fields for default log format; 154992

    :param df: Input DataFrame
    :param default: A list of shopping_sites_id
    :return: Output DataFrame
    """
    return (
        df
        .filter(df.logtype.isin('login', 'purchase', 'cart')
                & df.info.siteseq.isin(*default))
        .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                json_tuple(df.custom, 'productCode', 'productName')
                .alias('productCode', 'productName'))
        .withColumnRenamed('info.siteseq', 'siteseq')
        .unionAll(
            df
            .filter(df.logtype.isin('view') & df.info.siteseq.isin(*default))
            .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                    json_tuple(df.custom, 'rb:itemId', 'rb:itemName')
                    .alias('productCode', 'productName'))
            .withColumnRenamed('info.siteseq', 'siteseq')))
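# A minimal usage sketch combining the three selectors above (assuming an
# active SparkSession `spark`); the site-id lists and input path are
# hypothetical.
TYPE1_SITES = [101, 102]
TYPE2_SITES = [155138]
DEFAULT_SITES = [154992]
raw = spark.read.json("/data/shopping_logs")
unified = (select_type1(raw, *TYPE1_SITES)
           .unionAll(select_type2(raw, *TYPE2_SITES))
           .unionAll(select_default(raw, *DEFAULT_SITES)))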
from pyspark.sql.functions import col, explode, json_tuple, regexp_replace, split, when
from pyspark.sql.types import StringType

# spark, KAFKA_HOST and KAFKA_PORT are defined earlier in the script
spark.sparkContext.setLogLevel("ERROR")

# load the streaming dataframe
kafkaStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", f"{KAFKA_HOST}:{KAFKA_PORT}") \
    .option("subscribe", "twitter-stream-input") \
    .option("failOnDataLoss", "false") \
    .load()

# parse value column to json and extract id and entities columns
kafkaStream = kafkaStream.withColumn("value", col("value").cast(StringType())) \
    .withColumn("value", regexp_replace(col("value"), r"\"source\".*\"truncated\"", "\"truncated\"")) \
    .select(json_tuple(col("value"), "created_at", "id", "entities")) \
    .withColumnRenamed("c0", "created_at") \
    .withColumnRenamed("c1", "id") \
    .withColumnRenamed("c2", "entities")

# filter the null rows (those are the deleted tweets)
filteredKafkaStream = kafkaStream.where(col("id").isNotNull())

# extract hashtags from entities and explode arrays
hashtagsStream = filteredKafkaStream.withColumn("hashtags", json_tuple(col("entities"), "hashtags")) \
    .withColumn("hashtags", regexp_replace(col("hashtags"), r"^\[", "")) \
    .withColumn("hashtags", regexp_replace(col("hashtags"), r"\]$", "")) \
    .withColumn("hashtags", split(col("hashtags"), "},")) \
    .withColumn("hashtag", explode("hashtags")) \
    .withColumn("hashtag", regexp_replace(col("hashtag"), r"^(\{\"text\"):(.*)(,\"indices\".*)", "$2")) \
    .withColumn("hashtag", when(col("hashtag") == "", None).otherwise(col("hashtag")))
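# An alternative sketch (not part of the original job): parsing entities with
# an explicit schema via from_json avoids the regex splitting above.
from pyspark.sql.functions import from_json
from pyspark.sql.types import ArrayType, StructField, StructType

hashtag_schema = StructType([
    StructField("hashtags",
                ArrayType(StructType([StructField("text", StringType())])))])
altHashtagsStream = filteredKafkaStream \
    .withColumn("entities", from_json(col("entities"), hashtag_schema)) \
    .withColumn("hashtag", explode(col("entities.hashtags.text")))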
.selectExpr("explode(complex_map)").show(2) # COMMAND ---------- jsonDF = spark.range(1).selectExpr(""" '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""") # COMMAND ---------- from pyspark.sql.functions import get_json_object, json_tuple jsonDF.select( get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]") as "column", json_tuple(col("jsonString"), "myJSONKey")).show(2) # COMMAND ---------- from pyspark.sql.functions import to_json df.selectExpr("(InvoiceNo, Description) as myStruct")\ .select(to_json(col("myStruct"))) # COMMAND ---------- from pyspark.sql.functions import from_json from pyspark.sql.types import * parseSchema = StructType(( StructField("InvoiceNo",StringType(),True),
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, get_json_object, json_tuple

spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

jsonDF = spark.range(1).selectExpr("""
  '{"key": {"value":[1,2,3]}}' as jsonString
""")

jsonDF.select(
    get_json_object(col("jsonString"), "$.key.value[0]").alias("column"),
    json_tuple(col("jsonString"), "key")).show(2)
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, struct, split, explode, expr,
                                   create_map, get_json_object, json_tuple)

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    df = spark.range(1).selectExpr("""
      '{"myJsonKey": {"myJsonValue": [1, 2, 3]}}' as jsonString
    """)
    df.select(get_json_object(col('jsonString'),
                              "$.myJsonKey.myJsonValue[0]")).show()
    df.select(json_tuple(col('jsonString'), "myJsonKey")).show()
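# A small sketch of json_tuple's default column naming (assuming the session
# and imports above): without an alias the extracted fields come back as
# string columns named c0, c1, ...; a multi-name alias renames them.
pairs = spark.createDataFrame([('{"a": 1, "b": "x"}',)], ['js'])
pairs.select(json_tuple(col('js'), 'a', 'b')).printSchema()
# root
#  |-- c0: string (nullable = true)
#  |-- c1: string (nullable = true)
pairs.select(json_tuple(col('js'), 'a', 'b').alias('a', 'b')).show()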
save_file = "/results/httptraffic/%s" % date hdfs_host = "hdfs://localhost:9000" file1 = "%s%s_src_traffic.csv" % (hdfs_host, save_file) file2 = "%s%s_agent_url_traffic.csv" % (hdfs_host, save_file) print("save src traffic file: %s" % file1) print("save agent url traffic file: %s" % file2) sc = SparkContext(appName="HttpTraffic") sqlContext = SQLContext(sc) df = sqlContext.read.text(hdfs_host + log_file) df = df.select(F.split(df.value, ' - ').alias('infos')) df = df.select(df.infos[0].alias('date_str'), df.infos[1].alias('json_str')) df = df.select( F.json_tuple(df.json_str, 'src_name', 'flow_size', 'user_agent', 'url')) # flow_size bytes df = df.select(df.c0.alias('src_name'), \ F.udf(cal_flow_size, LongType())(df.c1).alias('flow_size'), \ df.c2.alias('user_agent'), \ F.udf(get_url_host, StringType())(df.c3).alias('url')) df1 = df.groupBy('src_name').sum('flow_size').withColumnRenamed( 'sum(flow_size)', 'flow_size') df1 = df1.sort(df1.flow_size.desc()) df1.write.csv(path=file1, mode="overwrite") df2 = df.groupBy('user_agent', 'url').sum('flow_size').withColumnRenamed( 'sum(flow_size)', 'flow_size') df2 = df2.sort(df2.flow_size.desc())