def _read_stream_kafka(self, stream, message_format, schema, include_metadata):
    # Kafka metadata columns that can optionally be carried through to the result.
    kafka_cols = [
        col("key"),
        col("topic"),
        col("partition"),
        col("offset"),
        col("timestamp"),
        col("timestampType"),
    ]

    if message_format == "avro" and schema is not None:
        # Check that the supplied schema is a valid Avro schema before using it.
        avro.schema.parse(schema)
        df = stream.load()
        if include_metadata is True:
            return df.select(
                *kafka_cols,
                from_avro(df.value, schema).alias("value")).select(
                    *kafka_cols, col("value.*"))
        return df.select(
            from_avro(df.value, schema).alias("value")).select(col("value.*"))
    elif message_format == "json" and schema is not None:
        df = stream.load()
        if include_metadata is True:
            return df.select(
                *kafka_cols,
                from_json(df.value.cast("string"), schema).alias("value")).select(
                    *kafka_cols, col("value.*"))
        return df.select(
            from_json(df.value.cast("string"),
                      schema).alias("value")).select(col("value.*"))

    # No recognized format/schema combination: return the raw stream.
    if include_metadata is True:
        return stream.load()
    return stream.load().select("key", "value")
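# A minimal usage sketch for the reader above (hedged: `connector` is a
# hypothetical instance of the enclosing class, and the broker, topic, and schema
# values are assumptions, not part of the original source).
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.avro.functions import from_avro
import avro.schema

spark = SparkSession.builder.appName("read-stream-kafka-sketch").getOrCreate()

# _read_stream_kafka expects an unloaded reader; it calls stream.load() itself.
raw_stream = (spark.readStream.format("kafka")
              .option("kafka.bootstrap.servers", "localhost:9092")  # hypothetical broker
              .option("subscribe", "events"))                       # hypothetical topic

# For message_format="json" the schema is a DDL string; for "avro" it would be
# the JSON text of an .avsc file instead.
events_df = connector._read_stream_kafka(
    stream=raw_stream,
    message_format="json",
    schema="id STRING, amount DOUBLE",
    include_metadata=True)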
{"name":"fare_amount","type":["double","null"]}, {"name":"extra","type":["double","null"]}, {"name":"mta_tax","type":["double","null"]}, {"name":"tip_amount","type":["double","null"]}, {"name":"tolls_amount","type":["double","null"]}, {"name":"ehail_fee","type":["double","null"]}, {"name":"improvement_surcharge","type":["double","null"]}, {"name":"total_amount","type":["double","null"]}, {"name":"payment_type","type":["int","null"]}, {"name":"trip_type","type":["int","null"]}, {"name":"congestion_surcharge","type":["double","null"]} ] } """ test_read_avro_df = eh_green_avro_df.select( from_avro(col("body"), green_avro_schema).alias("body")) display(test_read_avro_df) # COMMAND ---------- # MAGIC %md # MAGIC #### Spark EventHubs Connector PySpark doc # MAGIC https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md # COMMAND ---------- import datetime eh_write_conf = { 'eventhubs.connectionString': eh_connection_encrypted, 'eventhubs.operationTimeout':
green_ehub_df = (spark.readStream.format("kafka")
                 .option("subscribe", kafka_topic)
                 .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
                 .option("kafka.sasl.mechanism", "PLAIN")
                 .option("kafka.security.protocol", "SASL_SSL")
                 .option("kafka.sasl.jaas.config", kafka_sasl_jaas_config)
                 .option("kafka.session.timeout.ms", "60000")
                 .option("kafka.request.timeout.ms", "30000")
                 .option("kafka.group.id", "$Default")
                 .option("failOnDataLoss", "false")
                 .option("startingOffsets", "earliest")
                 .load()
                 .withColumn("value", from_avro(col("value"), green_avro_schema))
                 .select("value.*"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## EventHubs stream into TripData Bronze Delta sink

# COMMAND ----------

(green_ehub_df
 .withColumn("color", lit("green"))
 .withColumnRenamed("lpep_pickup_datetime", "pep_pickup_datetime")
 .withColumnRenamed("lpep_dropoff_datetime", "pep_dropoff_datetime")
 .writeStream.format("delta")
 .option("checkpointLocation",
         f"abfss://lake@{lake_name}/bronze/taxidemo/tripdata/green.checkpoint")
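 .trigger(once=True)
 .start(f"abfss://lake@{lake_name}/bronze/taxidemo/tripdata"))

# The .trigger/.start lines above are a hedged completion: the original cell is
# truncated after the checkpoint option. The run-once trigger and the sink path
# are assumptions (the matching EventHubs cell later in this section likewise
# continues with .trigger).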
.config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-avro_2.12:3.0.1") \ .getOrCreate() logger = Log4j(spark) kafka_source_df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9092") \ .option("subscribe", "invoice-items") \ .option("startingOffsets", "earliest") \ .load() avroSchema = open('schema/invoice-items', mode='r').read() value_df = kafka_source_df.select(from_avro(col("value"), avroSchema).alias("value")) rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \ .groupBy("value.CustomerCardNo") \ .agg(sum("value.TotalValue").alias("TotalPurchase"), sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards")) kafka_target_df = rewards_df.select(expr("CustomerCardNo as key"), to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value")) # kafka_target_df.show(truncate=False) rewards_writer_query = kafka_target_df \ .writeStream \ .queryName("Rewards Writer") \ .format("kafka") \
.option("includeHeaders", "true") \ .load() \ .select(col("topic"), col("partition"), col("offset"), find_header_value(col("headers"), lit("specversion")).cast("string").alias("specversion"), col("headers")[1]["value"].cast("string").alias("type"), col("headers")[2]["value"].cast("string").alias("source"), col("headers")[3]["value"].cast("string").alias("id"), col("headers")[4]["value"].cast("string").cast("timestamp").alias("time"), col("headers")[5]["value"].cast("string").alias("messageversion"), col("headers")[6]["value"].cast("string").alias("eventversion"), col("headers")[7]["value"].cast("string").alias("transactionid"), col("headers")[8]["value"].cast("string").alias("correlationid"), col("headers")[9]["value"].cast("string").alias("datacontenttype"), from_avro(expr("substring(value, 6)"), str(latest_schema)).alias("payload")) \ .withColumn("date", to_date(col("time"))) \ .withWatermark("time", "2 minutes") \ .dropDuplicates(subset=['id']) \ .writeStream \ .partitionBy("date") \ .format("parquet") \ .outputMode("append") \ .option("path","D:\\s3\\bkt-staging-data") \ .option("checkpointLocation", "D:\\s3\\bkt-checkpoint-data\\capturar-eventos-job") \ .trigger(once=True) \ .start() \ .awaitTermination() # .format("console") \ # .outputMode("update") \
fromAvroOptions = {"mode": "PERMISSIVE"}

AvroDF = (spark.readStream.format("kafka")
          .option("kafka.bootstrap.servers", confluentBootstrapServers)
          .option("kafka.security.protocol", "SASL_SSL")
          .option(
              "kafka.sasl.jaas.config",
              "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule "
              "required username='{}' password='{}';"
              .format(confluentApiKey, confluentSecret))
          .option("kafka.ssl.endpoint.identification.algorithm", "https")
          .option("kafka.sasl.mechanism", "PLAIN")
          .option("subscribe", confluentTopicName)
          .option("startingOffsets", "earliest")
          .load()
          # Strip the 5-byte Confluent wire-format prefix (magic byte + schema id)
          # so from_avro can decode the plain Avro body.
          .withColumn('fixedKey', fn.expr("substring(key, 6, length(key)-5)"))
          .withColumn('fixedValue', fn.expr("substring(value, 6, length(value)-5)"))
          .select(
              from_avro('fixedKey', confluentKeySchema, fromAvroOptions).alias('parsedKey'),
              from_avro('fixedValue', confluentValueSchema, fromAvroOptions).alias('parsedValue')))

# COMMAND ----------

display(AvroDF)

# COMMAND ----------

# Create a DataFrame that blows out the parsedValue into the three Skechers columns
AvroDFCurated = AvroDF.select(
    "parsedValue.SOSAresultTime",
    "parsedValue.SOSASensors.UAID",
    "parsedValue.SOSASensors.SOSAhasResult.numericValue")

# COMMAND ----------
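# Hedged context sketch: one common way to obtain confluentKeySchema and
# confluentValueSchema for the cell above is Confluent's Schema Registry client.
# The variable names below (schemaRegistryUrl, srApiKey, srSecret) are
# assumptions, not variables from the original notebook.
from confluent_kafka.schema_registry import SchemaRegistryClient

schema_registry_client = SchemaRegistryClient({
    "url": schemaRegistryUrl,
    "basic.auth.user.info": f"{srApiKey}:{srSecret}",
})
confluentKeySchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-key").schema.schema_str
confluentValueSchema = schema_registry_client.get_latest_version(
    confluentTopicName + "-value").schema.schema_str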
{"name":"mta_tax","type":["double","null"]}, {"name":"tip_amount","type":["double","null"]}, {"name":"tolls_amount","type":["double","null"]}, {"name":"ehail_fee","type":["double","null"]}, {"name":"improvement_surcharge","type":["double","null"]}, {"name":"total_amount","type":["double","null"]}, {"name":"payment_type","type":["int","null"]}, {"name":"trip_type","type":["int","null"]}, {"name":"congestion_surcharge","type":["double","null"]} ] } """ green_ehub_df = (spark.readStream.format("eventhubs").options( **eh_stream_conf).load().withColumn( "body", from_avro(col("body"), green_avro_schema)).select("body.*")) # COMMAND ---------- # MAGIC %md # MAGIC ## EventHubs stream into TripData Bronze Delta sink # COMMAND ---------- (green_ehub_df.withColumn("color", lit("green")).withColumnRenamed( "lpep_pickup_datetime", "pep_pickup_datetime").withColumnRenamed( "lpep_dropoff_datetime", "pep_dropoff_datetime").writeStream.format("delta"). option("checkpointLocation", f"abfss://lake@{lake_name}/bronze/taxidemo/tripdata/green.checkpoint"). trigger(
# stock_schema = avro.schema.parse(open("stock.avsc", "rb").read())
stock_schema = open("stock.avsc", "r").read()
# stock_struct = StructType.fromJson(stock_schema.to_json())

# Connect to Spark
# sc = SparkContext(master='local[*]', appName='test')
from initspark import initspark
sc, spark, config = initspark()

df: DataFrame = (spark.readStream.format("kafka")
                 .option("kafka.bootstrap.servers", brokers)
                 .option("subscribe", kafka_topic)
                 .option("startingOffsets", "earliest")
                 .load())

# Extract the binary value of the message and convert it to the schema read from the .avsc file
df1 = df.withColumn('value', from_avro("value", stock_schema))

# Flatten out the value struct and remove it
df2 = df1.select(*df.columns, col("value.*")).drop("value")

df3 = df2.selectExpr("key as kafka_key", "timestamp as kafka_timestamp",
                     "event_time", "symbol", "price")

# df2.createOrReplaceTempView('trades')
# df3 = spark.sql("""
#     SELECT key as kafka_key, timestamp as kafka_timestamp, event_time, symbol, price
#     FROM trades
# """)

mysql_url = "jdbc:mysql://localhost:3306/stocks"  # mysql_table
mysql_login = {"user": "******", "password": "******"}
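# The snippet stops before a sink is attached. A minimal foreachBatch sketch for
# landing df3 in MySQL (hedged: the table name, checkpoint path, and trigger are
# assumptions, and the MySQL JDBC driver must be on the classpath):
def write_to_mysql(batch_df, batch_id):
    batch_df.write.jdbc(url=mysql_url, table="trades", mode="append",
                        properties=mysql_login)

query = (df3.writeStream
         .foreachBatch(write_to_mysql)
         .option("checkpointLocation", "/tmp/stocks-checkpoint")  # hypothetical path
         .start())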
#Read from the Kafka Avro source
kafka_source_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "invoice-items") \
    .option("startingOffsets", "earliest") \
    .load()

#Open the Avro schema
avroSchema = open('schema/invoice-items', mode='r').read()

#Deserialize using the Avro schema and the from_avro function
value_df = kafka_source_df.select(
    from_avro(col("value"), avroSchema).alias("value"))

#Check the schema
value_df.printSchema()

#Select PRIME customers and calculate total transactions and earned points
rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
    .groupBy("value.CustomerCardNo") \
    .agg(sum("value.TotalValue").alias("TotalPurchase"),
         sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

#Rename column
rewards_df = rewards_df.withColumn("CustomerCardNo", expr("`value.CustomerCardNo`")) \
    .drop("value.CustomerCardNo")

#Serialize to JSON format
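#Hedged sketch of the serialization step the comment above introduces. It mirrors
#the to_json/struct pattern used by the other rewards example in this section;
#the exact code that followed is not shown in the original:
kafka_target_df = rewards_df.select(
    expr("CustomerCardNo as key"),
    to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))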
print("Printing Schema of orders_df: ") orders_df.printSchema() orders_df1 = orders_df.select("value", "timestamp") # Define a schema for the orders data # order_id,order_product_name,order_card_type,order_amount,order_datetime,order_country_name,order_city_name,order_ecommerce_website_name #orders_schema_avro = open('orders.avsc', mode='r').read() orders_schema_avro = open( '/home/datamaking/PycharmProjects/pyspark_structured_streaming-main/part4.3/kafka_streaming_avro/orders.avsc', mode='r').read() # 8,Wrist Band,MasterCard,137.13,2020-10-21 18:37:02,United Kingdom,London,www.datamaking.com orders_df2 = orders_df1\ .select(from_avro(col("value"), orders_schema_avro)\ .alias("orders"), "timestamp") orders_df3 = orders_df2.select("orders.*", "timestamp") orders_df3.printSchema() # Simple aggregate - find total_order_amount by grouping country, city orders_df4 = orders_df3.groupBy("order_country_name", "order_city_name") \ .agg({'order_amount': 'sum'}) \ .select("order_country_name", "order_city_name", col("sum(order_amount)") \ .alias("total_order_amount")) print("Printing Schema of orders_df4: ") orders_df4.printSchema() # Write final result into console for debugging purpose orders_agg_write_stream = orders_df4 \
""" # COMMAND ---------- eh_green_kafka_df = green_2019_df.select( to_avro(struct(col("lpep_pickup_datetime"))).alias("key"), to_avro(struct(col("*"))).alias("value") ) # COMMAND ---------- display(eh_green_kafka_df) # COMMAND ---------- test_read_kafka_df = eh_green_kafka_df.select(from_avro(col("value"),green_avro_schema).alias("value")) display(test_read_kafka_df) # COMMAND ---------- ( eh_green_kafka_df .write .format("kafka") .option("topic", kafka_topic) .option("kafka.bootstrap.servers", kafka_bootstrap_servers) .option("kafka.sasl.mechanism", "PLAIN") .option("kafka.security.protocol", "SASL_SSL") .option("kafka.sasl.jaas.config", kafka_sasl_jaas_config) .option("kafka.session.timeout.ms", "60000") .option("kafka.request.timeout.ms", "30000")