Example #1
    def _read_stream_kafka(self, stream, message_format, schema,
                           include_metadata):
        kafka_cols = [
            col("key"),
            col("topic"),
            col("partition"),
            col("offset"),
            col("timestamp"),
            col("timestampType"),
        ]

        if message_format == "avro" and schema is not None:
            # check that the Avro schema is valid
            avro.schema.parse(schema)
            df = stream.load()
            if include_metadata is True:
                return df.select(*kafka_cols,
                                 from_avro(df.value,
                                           schema).alias("value")).select(
                                               *kafka_cols, col("value.*"))
            return df.select(from_avro(df.value,
                                       schema).alias("value")).select(
                                           col("value.*"))
        elif message_format == "json" and schema is not None:
            df = stream.load()
            if include_metadata is True:
                return df.select(
                    *kafka_cols,
                    from_json(df.value.cast("string"),
                              schema).alias("value")).select(
                                  *kafka_cols, col("value.*"))
            return df.select(
                from_json(df.value.cast("string"),
                          schema).alias("value")).select(col("value.*"))

        if include_metadata is True:
            return stream.load()
        return stream.load().select("key", "value")
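
    # A hedged usage sketch (hypothetical broker, topic and DDL schema; the class
    # that owns this helper is not shown in the snippet):
    #
    #   stream = (spark.readStream.format("kafka")
    #             .option("kafka.bootstrap.servers", "localhost:9092")
    #             .option("subscribe", "trip-events"))
    #   df = self._read_stream_kafka(stream, message_format="json",
    #                                schema="trip_id INT, fare DOUBLE",
    #                                include_metadata=True)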
       {"name":"fare_amount","type":["double","null"]},
       {"name":"extra","type":["double","null"]},
       {"name":"mta_tax","type":["double","null"]},
       {"name":"tip_amount","type":["double","null"]},
       {"name":"tolls_amount","type":["double","null"]},
       {"name":"ehail_fee","type":["double","null"]},
       {"name":"improvement_surcharge","type":["double","null"]},
       {"name":"total_amount","type":["double","null"]},
       {"name":"payment_type","type":["int","null"]},
       {"name":"trip_type","type":["int","null"]},
       {"name":"congestion_surcharge","type":["double","null"]}
     ]
}
"""
test_read_avro_df = eh_green_avro_df.select(
    from_avro(col("body"), green_avro_schema).alias("body"))
display(test_read_avro_df)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Spark EventHubs Connector PySpark doc
# MAGIC https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/PySpark/structured-streaming-pyspark.md

# COMMAND ----------
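
# The connector expects an encrypted connection string; a minimal sketch of a
# read-side configuration per the doc above (the raw connection string and the
# consumer group value are assumed placeholders):
eh_read_conf_example = {
    "eventhubs.connectionString":
        sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
            eh_namespace_connection_string),
    "eventhubs.consumerGroup": "$Default",
}

# COMMAND ----------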

import datetime

eh_write_conf = {
    'eventhubs.connectionString': eh_connection_encrypted,
    'eventhubs.operationTimeout':
green_ehub_df = (
    spark.readStream.format("kafka")
    .option("subscribe", kafka_topic)
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", kafka_sasl_jaas_config)
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.request.timeout.ms", "30000")
    .option("kafka.group.id", "$Default")
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "earliest")
    .load()
    .withColumn("value", from_avro(col("value"), green_avro_schema))
    .select("value.*"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## EventHubs stream into TripData Bronze Delta sink

# COMMAND ----------

(green_ehub_df.withColumn("color", lit("green")).withColumnRenamed(
    "lpep_pickup_datetime", "pep_pickup_datetime").withColumnRenamed(
        "lpep_dropoff_datetime",
        "pep_dropoff_datetime").writeStream.format("delta").
 option("checkpointLocation",
        f"abfss://lake@{lake_name}/bronze/taxidemo/tripdata/green.checkpoint").
Example #4
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.spark:spark-avro_2.12:3.0.1") \
        .getOrCreate()

    logger = Log4j(spark)

    kafka_source_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "invoice-items") \
        .option("startingOffsets", "earliest") \
        .load()

    avroSchema = open('schema/invoice-items', mode='r').read()

    value_df = kafka_source_df.select(from_avro(col("value"), avroSchema).alias("value"))

    rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
        .groupBy("value.CustomerCardNo") \
        .agg(sum("value.TotalValue").alias("TotalPurchase"),
             sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

    kafka_target_df = rewards_df.select(expr("CustomerCardNo as key"),
                                        to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))

    # kafka_target_df.show(truncate=False)

    rewards_writer_query = kafka_target_df \
        .writeStream \
        .queryName("Rewards Writer") \
        .format("kafka") \
    .option("includeHeaders", "true") \
    .load() \
    .select(col("topic"),
            col("partition"),
            col("offset"),
            find_header_value(col("headers"), lit("specversion")).cast("string").alias("specversion"),
            col("headers")[1]["value"].cast("string").alias("type"),
            col("headers")[2]["value"].cast("string").alias("source"),
            col("headers")[3]["value"].cast("string").alias("id"),
            col("headers")[4]["value"].cast("string").cast("timestamp").alias("time"),
            col("headers")[5]["value"].cast("string").alias("messageversion"),
            col("headers")[6]["value"].cast("string").alias("eventversion"),
            col("headers")[7]["value"].cast("string").alias("transactionid"),
            col("headers")[8]["value"].cast("string").alias("correlationid"),
            col("headers")[9]["value"].cast("string").alias("datacontenttype"),
            from_avro(expr("substring(value, 6)"), str(latest_schema)).alias("payload")) \
    .withColumn("date", to_date(col("time"))) \
    .withWatermark("time", "2 minutes") \
    .dropDuplicates(subset=['id']) \
    .writeStream \
    .partitionBy("date") \
    .format("parquet") \
    .outputMode("append") \
    .option("path","D:\\s3\\bkt-staging-data") \
    .option("checkpointLocation", "D:\\s3\\bkt-checkpoint-data\\capturar-eventos-job") \
    .trigger(once=True) \
    .start() \
    .awaitTermination()

# .format("console") \
# .outputMode("update") \
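
# `find_header_value` (used in the header-parsing chain above) is assumed to be a
# helper defined elsewhere in that job; a minimal hypothetical sketch of such a UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType

@udf(returnType=BinaryType())
def find_header_value(headers, header_key):
    # with "includeHeaders" = "true" the headers column arrives as an array of
    # struct<key: string, value: binary> entries
    for header in headers or []:
        if header.key == header_key:
            return header.value
    return None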
Example #6
fromAvroOptions = {"mode": "PERMISSIVE"}
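
# Confluent's wire format prepends one magic byte and a 4-byte schema id to each
# Avro-encoded key/value, so the first 5 bytes are stripped below before from_avro.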

AvroDF = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", confluentBootstrapServers)
    .option("kafka.security.protocol", "SASL_SSL")
    .option(
        "kafka.sasl.jaas.config",
        "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';"
        .format(confluentApiKey, confluentSecret))
    .option("kafka.ssl.endpoint.identification.algorithm", "https")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("subscribe", confluentTopicName)
    .option("startingOffsets", "earliest")
    .load()
    .withColumn("fixedKey", fn.expr("substring(key, 6, length(key)-5)"))
    .withColumn("fixedValue", fn.expr("substring(value, 6, length(value)-5)"))
    .select(
        from_avro("fixedKey", confluentKeySchema, fromAvroOptions).alias("parsedKey"),
        from_avro("fixedValue", confluentValueSchema, fromAvroOptions).alias("parsedValue")))

# COMMAND ----------

display(AvroDF)

# COMMAND ----------

# Create a DataFrame that flattens parsedValue into the three Skechers columns
AvroDFCurated = AvroDF.select(
    "parsedValue.SOSAresultTime", "parsedValue.SOSASensors.UAID",
    "parsedValue.SOSASensors.SOSAhasResult.numericValue")

# COMMAND ----------
       {"name":"mta_tax","type":["double","null"]},
       {"name":"tip_amount","type":["double","null"]},
       {"name":"tolls_amount","type":["double","null"]},
       {"name":"ehail_fee","type":["double","null"]},
       {"name":"improvement_surcharge","type":["double","null"]},
       {"name":"total_amount","type":["double","null"]},
       {"name":"payment_type","type":["int","null"]},
       {"name":"trip_type","type":["int","null"]},
       {"name":"congestion_surcharge","type":["double","null"]}
     ]
}
"""

green_ehub_df = (spark.readStream.format("eventhubs").options(
    **eh_stream_conf).load().withColumn(
        "body", from_avro(col("body"), green_avro_schema)).select("body.*"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## EventHubs stream into TripData Bronze Delta sink

# COMMAND ----------

(green_ehub_df.withColumn("color", lit("green")).withColumnRenamed(
    "lpep_pickup_datetime", "pep_pickup_datetime").withColumnRenamed(
        "lpep_dropoff_datetime",
        "pep_dropoff_datetime").writeStream.format("delta").
 option("checkpointLocation",
        f"abfss://lake@{lake_name}/bronze/taxidemo/tripdata/green.checkpoint").
 trigger(
Example #8
#stock_schema = avro.schema.parse(open("stock.avsc", "rb").read())
stock_schema = open("stock.avsc", "r").read()

#stock_struct = StructType.fromJson(stock_schema.to_json())
# Connect to Spark
# sc = SparkContext(master = 'local[*]', appName = 'test')
from initspark import initspark
sc, spark, config = initspark()

df: DataFrame = (spark.readStream.format("kafka")
                 .option("kafka.bootstrap.servers", brokers)
                 .option("subscribe", kafka_topic)
                 .option("startingOffsets", "earliest")
                 .load())

# extract the binary value of the message and convert it to the schema read from the avsc file
df1 = df.withColumn('value', from_avro("value", stock_schema))
# flatten out the value struct and remove it
df2 = df1.select(*df.columns, col("value.*")).drop("value")

df3 = df2.selectExpr("key as kafka_key", "timestamp as kafka_timestamp",
                     "event_time", "symbol", "price")

# df2.createOrReplaceTempView('trades')
# df3 = spark.sql("""
# SELECT key as kafka_key, timestamp as kafka_timestamp, event_time, symbol, price
# FROM trades
# """)

mysql_url = "jdbc:mysql://localhost:3306/stocks"
# mysql_table
mysql_login = {"user": "******", "password": "******"}
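
# A hedged sketch (not part of this snippet) of landing df3 in MySQL: streaming
# DataFrames cannot call .write.jdbc() directly, so the usual route is foreachBatch.
# The table name and checkpoint path are assumptions.
def write_trades_to_mysql(batch_df, batch_id):
    (batch_df.write
        .format("jdbc")
        .option("url", mysql_url)
        .option("dbtable", "trades")                    # assumed table name
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .options(**mysql_login)
        .mode("append")
        .save())

# df3.writeStream.foreachBatch(write_trades_to_mysql).option(
#     "checkpointLocation", "/tmp/stocks_checkpoint").start()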
    # Read from Kafka Avro source
    kafka_source_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "invoice-items") \
        .option("startingOffsets", "earliest") \
        .load()

    #Open avro schema
    avroSchema = open('schema/invoice-items', mode='r').read()

    #Deserialization using avro schema and from_avro function
    value_df = kafka_source_df.select(
        from_avro(col("value"), avroSchema).alias("value"))

    #Checking schema
    value_df.printSchema()

    # Choose PRIME customers and calculate their total purchases and earned reward points
    rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
        .groupBy("value.CustomerCardNo") \
        .agg(sum("value.TotalValue").alias("TotalPurchase"),
             sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

    #Rename column
    rewards_df = rewards_df.withColumn("CustomerCardNo", expr("`value.CustomerCardNo`")) \
                           .drop("value.CustomerCardNo")

    # Serialization to JSON format
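    # A hedged sketch of this step, mirroring the equivalent line in Example #4 above:
    #
    #   kafka_target_df = rewards_df.select(
    #       expr("CustomerCardNo as key"),
    #       to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))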
Example #10
    print("Printing Schema of orders_df: ")
    orders_df.printSchema()

    orders_df1 = orders_df.select("value", "timestamp")

    # Define a schema for the orders data
    # order_id,order_product_name,order_card_type,order_amount,order_datetime,order_country_name,order_city_name,order_ecommerce_website_name
    #orders_schema_avro = open('orders.avsc', mode='r').read()
    orders_schema_avro = open(
        '/home/datamaking/PycharmProjects/pyspark_structured_streaming-main/part4.3/kafka_streaming_avro/orders.avsc',
        mode='r').read()

    # 8,Wrist Band,MasterCard,137.13,2020-10-21 18:37:02,United Kingdom,London,www.datamaking.com
    orders_df2 = orders_df1\
        .select(from_avro(col("value"), orders_schema_avro)\
        .alias("orders"), "timestamp")

    orders_df3 = orders_df2.select("orders.*", "timestamp")
    orders_df3.printSchema()

    # Simple aggregate - find total_order_amount by grouping country, city
    orders_df4 = orders_df3.groupBy("order_country_name", "order_city_name") \
        .agg({'order_amount': 'sum'}) \
        .select("order_country_name", "order_city_name", col("sum(order_amount)") \
        .alias("total_order_amount"))

    print("Printing Schema of orders_df4: ")
    orders_df4.printSchema()

    # Write final result into console for debugging purpose
    orders_agg_write_stream = orders_df4 \
Example #11
"""

# COMMAND ----------

eh_green_kafka_df = green_2019_df.select(
  to_avro(struct(col("lpep_pickup_datetime"))).alias("key"), 
  to_avro(struct(col("*"))).alias("value")
)

# COMMAND ----------

display(eh_green_kafka_df)

# COMMAND ----------

test_read_kafka_df = eh_green_kafka_df.select(
    from_avro(col("value"), green_avro_schema).alias("value"))
display(test_read_kafka_df)

# COMMAND ----------

 (
   eh_green_kafka_df
    .write
    .format("kafka") 
    .option("topic", kafka_topic)
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) 
    .option("kafka.sasl.mechanism", "PLAIN") 
    .option("kafka.security.protocol", "SASL_SSL") 
    .option("kafka.sasl.jaas.config", kafka_sasl_jaas_config) 
    .option("kafka.session.timeout.ms", "60000") 
    .option("kafka.request.timeout.ms", "30000")