Example #1
from pyspark.sql.functions import element_at, json_tuple, split


def select_type2(df, *type2):
    """Select fields for log format; 155138

    :param df: Input DataFrame
    :param type2: A list of shopping_sites_id
    :return: Output DataFrame
    """
    stage_df = (
        df.filter(df.logtype.isin('view')
                  & df.info.siteseq.isin(*type2)).select(
                      'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                      json_tuple(df.custom, 'og:url', 'og:title').alias(
                          'productCode', 'productName')).withColumnRenamed(
                              'info.siteseq', 'siteseq'))
    stage_df = stage_df.withColumn('productCode',
                                   split(stage_df['productCode'], '/'))

    return (df.filter(
        df.logtype.isin('login', 'purchase', 'cart')
        & df.info.siteseq.isin(*type2)).select(
            'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
            json_tuple(df.custom, 'productCode', 'productName').alias(
                'productCode', 'productName')).withColumnRenamed(
                    'info.siteseq', 'siteseq').unionAll((stage_df.select(
                        'maid', 'siteseq', 'userid', 'timestamp', 'logtype',
                        element_at(stage_df.productCode,
                                   -1).alias('productCode'), 'productName'))))
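
A minimal, hypothetical call site for the function above (the SparkSession, the input path, and using the docstring's 155138 as a site id are assumptions, not taken from the original project):

# Hypothetical driver code; the path is a placeholder, 155138 is the id mentioned in the docstring.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("select_type2_demo").getOrCreate()
logs = spark.read.json("/path/to/shopping_logs")   # expects maid, info.siteseq, userid, timestamp, logtype, custom
type2_events = select_type2(logs, 155138)
type2_events.printSchema()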
Example #2
from pyspark.sql.functions import json_tuple


def select_type1(df, *type1):
    """Select fields for log format; -48

    :param df: Input DataFrame
    :param type1: A list of shopping_sites_id
    :return: Output DataFrame
    """
    return (df.filter(
        df.logtype.isin('login', 'purchase')
        & df.info.siteseq.isin(*type1)).select(
            'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
            json_tuple(df.custom, 'goodsCode',
                       'goodsName').alias('productCode', 'productName')
        ).withColumnRenamed('info.siteseq', 'siteseq').unionAll(
            (df.filter(df.logtype.isin('cart')
                       & df.info.siteseq.isin(*type1)).select(
                           'maid', 'info.siteseq', 'userid', 'timestamp',
                           'logtype',
                           json_tuple(df.custom, 'goodsCode', 'name').alias(
                               'productCode',
                               'productName')).withColumnRenamed(
                                   'info.siteseq', 'siteseq'))
        ).unionAll(
            (df.filter(df.logtype.isin('view')
                       & df.info.siteseq.isin(*type1)).select(
                           'maid', 'info.siteseq', 'userid', 'timestamp',
                           'logtype',
                           json_tuple(df.custom, 'tas:productCode',
                                      'og:title').alias(
                                          'productCode',
                                          'productName')).withColumnRenamed(
                                              'info.siteseq', 'siteseq'))))
from pyspark.sql import DataFrame, functions
from pyspark.sql.functions import col, lit


def users_set_price_filter_in_euro(data: DataFrame):
    return data \
        .select("*", functions.json_tuple("params", "event_content", "page").alias("event_content", "page")) \
        .select(col("session_id"), col("event_content")) \
        .filter(col("event_content") != lit("{}")) \
        .select("*", functions.json_tuple("event_content", "price", "currency").alias("price", "currency")) \
        .drop(col("event_content")) \
        .filter(col("currency") == lit("EUR"))
def user_session_contains_request_id(data: DataFrame):
    return data \
        .select("*", functions.json_tuple("params", "event_content", "page").alias("event_content", "page")) \
        .select(col("session_id"), col("page")) \
        .filter(col("page") != lit("{}")) \
        .select("*", functions.json_tuple("page", "request_id").alias("request_id")) \
        .drop(col("page")) \
        .filter(col("request_id") != lit("{}")) \
        .groupBy("session_id") \
        .count().count()
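
A small self-contained sketch of the nested json_tuple pattern the two helpers above rely on; the sample row and field values are invented for illustration:

# Illustrative only: a one-row DataFrame shaped like the `params` column the helpers expect.
import json

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
payload = json.dumps({
    "event_content": json.dumps({"price": "10", "currency": "EUR"}),
    "page": json.dumps({"request_id": "r-1"}),
})
sample = spark.createDataFrame([("s1", payload)], ["session_id", "params"])

# The first json_tuple peels `event_content` out of `params`; the second parses
# the JSON string it returned to reach `price` and `currency`.
users_set_price_filter_in_euro(sample).show(truncate=False)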
Example #5
from pyspark.sql import functions as fn


# db_properties (the JDBC user/password/driver dict) is assumed to be defined elsewhere in the original script.
def process_row(df, epochId):
    df = df.na.drop()
    df = df.selectExpr("CAST(value AS STRING)")

    # Explode the microbatch into rows
    df = df.select(fn.explode(fn.split(df.value, "\n")).alias("value"))

    # Explode JSON into columns
    ds = df.select(fn.json_tuple('value', 'uuid', 'document_id','timestamp','platform','geo_location','traffic_source') \
            .alias('uuid', 'document_id','timestamp','platform','geo_location','traffic_source'))

    # Do processing
    ds = ds.na.drop()
    # Get Top 3 Websites with high user traffic
    ds1 = ds.groupby(fn.col('document_id')) \
            .agg(
                fn.count(fn.col('uuid')),
                fn.max(fn.col('timestamp').cast('string'))) \
            .withColumnRenamed("COUNT(uuid)", "users") \
            .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
            .orderBy(fn.col("users"), ascending=False).limit(3)

    # Get platform distribution
    ds2 = ds.groupby(fn.col('platform')) \
            .agg(
                fn.count(fn.col('uuid')),
                fn.max(fn.col('timestamp').cast('string'))) \
            .withColumnRenamed("COUNT(uuid)", "users") \
            .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
            .orderBy(fn.col("users"), ascending=False)

    # Get traffic distribution
    ds3 = ds.groupby(fn.col('traffic_source')) \
            .agg(
                fn.count(fn.col('uuid')),
                fn.max(fn.col('timestamp').cast('string'))) \
            .withColumnRenamed("COUNT(uuid)", "users") \
            .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp") \
            .orderBy(fn.col("users"), ascending=False)

    ds.printSchema()

    # Write to db
    if not df.rdd.isEmpty():
        ds1.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views",
                       mode="overwrite",
                       properties=db_properties)
        ds2.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views_platform",
                       mode="overwrite",
                       properties=db_properties)
        ds3.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                       table="page_views_traffic",
                       mode="overwrite",
                       properties=db_properties)
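
process_row has the (DataFrame, epochId) signature that foreachBatch expects; a hypothetical wiring sketch (broker address, topic, and checkpoint path are placeholders):

# Hypothetical streaming driver for process_row; adjust the Kafka and checkpoint settings.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("page_views_stream").getOrCreate()
raw_stream = (spark.readStream.format("kafka")
              .option("kafka.bootstrap.servers", "localhost:9092")
              .option("subscribe", "page_views")
              .load())

query = (raw_stream.writeStream
         .foreachBatch(process_row)                      # Spark passes each micro-batch and its epoch id
         .option("checkpointLocation", "/tmp/page_views_ckpt")
         .start())
query.awaitTermination()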
Example #6
from pyspark.sql import functions as fn


def process_row(df, epochId):
    df = df.na.drop()
    df = df.selectExpr("CAST(value AS STRING)")

    # Explode the microbatch into rows
    df = df.select(fn.explode(fn.split(df.value, "\n")).alias("value"))

    # Explode JSON into columns
    ds = df.select(fn.json_tuple('value', 'display_id', 'ad_id','clicked','uuid','timestamp','document_id','platform','geo_location') \
            .alias('display_id', 'ad_id','clicked','uuid','timestamp','document_id','platform','geo_location'))

    # Do processing
    ds = ds.groupby(fn.col('ad_id')) \
            .agg(
                fn.max(fn.col('timestamp').cast('string')),
                fn.sum(fn.col('clicked')),
                fn.count(fn.col('clicked'))) \
            .withColumnRenamed("SUM(clicked)", "clicks") \
            .withColumnRenamed("COUNT(clicked)", "views") \
            .withColumnRenamed("max(CAST(timestamp AS STRING))", "timestamp")

    ds.printSchema()

    # Write to db
    if not df.rdd.isEmpty():
        ds.write.jdbc(url="jdbc:mysql://54.193.71.186:3306/Project",
                      table="mytopic",
                      mode="append",
                      properties=db_properties)
Example #7
from pyspark.sql.functions import concat, json_tuple, lit, regexp_replace, struct, to_json


def main(spark, kafka_conf):
	bootstrap_servers = ','.join(kafka_conf['bootstrap_servers'])
	tweet_stream = spark \
		.readStream \
		.format("kafka") \
		.option("kafka.bootstrap.servers", bootstrap_servers) \
		.option("subscribe", ','.join(kafka_conf['topics_request'])) \
		.load()
	
	# watermark so we don't keep track of data too long (we don't need it anyway)
	# tweet_stream = tweet_stream.withWatermark('timestamp', '20 seconds')
	tweet_stream = tweet_stream.select(
		concat(tweet_stream.topic, lit('-sentiment')).alias('topic'),
		json_tuple(tweet_stream.value.cast('string'), 'tweet', 'created_at'),
		'timestamp'
	).withColumnRenamed('c0', 'tweet').withColumnRenamed('c1', 'created_at')

	tweet_stream = tweet_stream.withColumn(
		"sentiment_score", sentiment_udf(
			regexp_replace(tweet_stream.tweet, r'(#\w+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ')
		)
	)
	tweet_stream = tweet_stream.select(
		'topic',
		to_json(struct('sentiment_score', 'created_at', 'tweet')).alias('value'),
		'timestamp'
	)
	query = tweet_stream.writeStream.outputMode('append')\
		.format('kafka')\
		.option('checkpointLocation', False)\
		.option('kafka.bootstrap.servers', bootstrap_servers)\
		.start()
	# run forever until dead
	query.awaitTermination()
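
A hypothetical call site for main(); the broker list, topic name, and the stand-in sentiment_udf are assumptions (the original UDF is defined elsewhere in the project):

# Placeholder wiring; sentiment_udf must live in the same module as main() for the lookup to resolve.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Stand-in scorer, not the original sentiment model.
sentiment_udf = udf(lambda text: 0.0 if not text else min(len(text) / 280.0, 1.0), FloatType())

kafka_conf = {
    "bootstrap_servers": ["localhost:9092"],
    "topics_request": ["tweets"],
}

spark = SparkSession.builder.appName("tweet_sentiment").getOrCreate()
main(spark, kafka_conf)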
Example #8
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, json_tuple


def m2(brokers, topic):
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    df = spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .option("startingOffsets", "earliest") \
        .load() \
        .withColumn("value", col("value").cast("string").alias("value")) \
        .select(col("value")) \
        .select(json_tuple(col("value"), "event_time", "clnt").alias("a", "b"))

    def transform(batch_df, batch_id):
        batch_df.show()

    df \
        .writeStream \
        .outputMode("update") \
        .format("console") \
        .foreachBatch(transform) \
        .trigger(processingTime="2 seconds") \
        .option("checkpointLocation", "/tmp/sb") \
        .option("truncate", "true") \
        .start() \
        .awaitTermination()
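
For a quick sanity check of the projection inside m2(), the same json_tuple call can be run on a static DataFrame; the sample payload below is invented:

# Batch equivalent of the streaming projection above, handy for verifying the output shape.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, json_tuple

spark = SparkSession.builder.getOrCreate()
static = spark.createDataFrame(
    [('{"event_time": "2021-01-01 00:00:00", "clnt": "web"}',)], ["value"])
static.select(json_tuple(col("value"), "event_time", "clnt").alias("a", "b")).show(truncate=False)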
Example #9
from pyspark.sql.functions import json_tuple


def select_default(df, *default):
    """Select fields for default log format; 154992

    :param df: Input DataFrame
    :param default: A list of shopping_sites_id
    :return: Output DataFrame
    """
    return (
        df
        .filter(df.logtype.isin('login', 'purchase', 'cart') & df.info.siteseq.isin(*default))
        .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                json_tuple(df.custom, 'productCode', 'productName').alias('productCode', 'productName'))
        .withColumnRenamed('info.siteseq', 'siteseq')
        .unionAll((
            df
            .filter(df.logtype.isin('view') & df.info.siteseq.isin(*default))
            .select('maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                    json_tuple(df.custom, 'rb:itemId', 'rb:itemName').alias('productCode', 'productName'))
            .withColumnRenamed('info.siteseq', 'siteseq'))))
Example #10
spark.sparkContext.setLogLevel("ERROR")

# load the streaming dataframe
kafkaStream = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", f"{KAFKA_HOST}:{KAFKA_PORT}") \
  .option("subscribe", "twitter-stream-input") \
  .option("failOnDataLoss", "false")\
  .load()

# parse value column to json and extract id and entities columns
kafkaStream = kafkaStream.withColumn("value", col("value").cast(StringType()))\
                         .withColumn("value", regexp_replace(col("value"), r"\"source\".*\"truncated\"", "\"truncated\""))\
                         .select(json_tuple(col("value"), "created_at", "id", "entities"))\
                         .withColumnRenamed("c0", "created_at")\
                         .withColumnRenamed("c1", "id")\
                         .withColumnRenamed("c2", "entities")

# filter the null rows (those are the deleted tweets)
filteredKafkaStream = kafkaStream.where(col("id").isNotNull())

# extract hashtags from entities and explode arrays
hashtagsStream = filteredKafkaStream.withColumn("hashtags", json_tuple(col("entities"), "hashtags"))\
                                    .withColumn("hashtags", regexp_replace(col("hashtags"), r"^\[", ""))\
                                    .withColumn("hashtags", regexp_replace(col("hashtags"), r"\]$", ""))\
                                    .withColumn("hashtags", split(col("hashtags"), "},"))\
                                    .withColumn("hashtag", explode("hashtags"))\
                                    .withColumn("hashtag", regexp_replace(col("hashtag"), r"^(\{\"text\"):(.*)(,\"indices\".*)", "$2"))\
                                    .withColumn("hashtag", when(col("hashtag") == "", None).otherwise(col("hashtag")))\
  .selectExpr("explode(complex_map)").show(2)


Example #11

# COMMAND ----------

jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")


# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
  StructField("InvoiceNo",StringType(),True),
Example #12
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,get_json_object,json_tuple
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

jsonDF = spark.range(1).selectExpr(
	"""'{"key": {"value": [1, 2, 3]}}' as jsonString""")

jsonDF.select(
	get_json_object(col("jsonString"),"$.key.value[0]").alias("column"),
	json_tuple(col("jsonString"),"key")).show(2)
Example #13
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, split, explode, expr, create_map, get_json_object, json_tuple

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.range(1).selectExpr("""
        '{"myJsonKey": {"myJsonValue": [1, 2, 3]}}' as jsonString
    """)

    df.select(get_json_object(col('jsonString'),
                              "$.myJsonKey.myJsonValue[0]")).show()
    df.select(json_tuple(col('jsonString'), "myJsonKey")).show()
Example #14
save_file = "/results/httptraffic/%s" % date

hdfs_host = "hdfs://localhost:9000"
file1 = "%s%s_src_traffic.csv" % (hdfs_host, save_file)
file2 = "%s%s_agent_url_traffic.csv" % (hdfs_host, save_file)
print("save src traffic file: %s" % file1)
print("save agent url traffic file: %s" % file2)

sc = SparkContext(appName="HttpTraffic")
sqlContext = SQLContext(sc)

df = sqlContext.read.text(hdfs_host + log_file)
df = df.select(F.split(df.value, ' - ').alias('infos'))
df = df.select(df.infos[0].alias('date_str'), df.infos[1].alias('json_str'))
df = df.select(
    F.json_tuple(df.json_str, 'src_name', 'flow_size', 'user_agent', 'url'))

# flow_size bytes
df = df.select(df.c0.alias('src_name'), \
               F.udf(cal_flow_size, LongType())(df.c1).alias('flow_size'), \
               df.c2.alias('user_agent'), \
               F.udf(get_url_host, StringType())(df.c3).alias('url'))

df1 = df.groupBy('src_name').sum('flow_size').withColumnRenamed(
    'sum(flow_size)', 'flow_size')
df1 = df1.sort(df1.flow_size.desc())
df1.write.csv(path=file1, mode="overwrite")

df2 = df.groupBy('user_agent', 'url').sum('flow_size').withColumnRenamed(
    'sum(flow_size)', 'flow_size')
df2 = df2.sort(df2.flow_size.desc())