def _online_fg_to_avro(self, feature_group, dataframe):
    """Packs all features into a named struct to be serialized to a single
    avro/binary column, and packs the primary key into an array to be
    serialized for partitioning."""
    return dataframe.select(
        [
            # be aware: primary_key array should always be sorted
            to_avro(
                concat(
                    *[
                        col(f).cast("string")
                        for f in sorted(feature_group.primary_key)
                    ]
                )
            ).alias("key"),
            to_avro(
                struct(
                    [
                        field["name"]
                        for field in json.loads(feature_group.avro_schema)["fields"]
                    ]
                ),
                feature_group._get_encoded_avro_schema(),
            ).alias("value"),
        ]
    )
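# The decode side is not shown in this snippet; a minimal sketch, assuming the
# same feature_group object, would unpack the binary "value" column back into
# named columns with from_avro and the same encoded schema. The helper name
# _online_fg_from_avro is hypothetical.
from pyspark.sql.avro.functions import from_avro

def _online_fg_from_avro(feature_group, encoded_dataframe):
    # decode the avro/binary "value" column into a struct, then flatten it
    return encoded_dataframe.select(
        from_avro("value", feature_group._get_encoded_avro_schema()).alias("record")
    ).select("record.*")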
def _encode_complex_features(self, feature_group, dataframe):
    """Encodes all complex type features to binary using their avro type as schema."""
    return dataframe.select(
        [
            field["name"]
            if field["name"] not in feature_group.get_complex_features()
            else to_avro(
                field["name"],
                feature_group._get_feature_avro_schema(field["name"]),
            ).alias(field["name"])
            for field in json.loads(feature_group.avro_schema)["fields"]
        ]
    )
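# For completeness, a hedged sketch of the inverse (not part of the original
# source): complex columns can be decoded back with from_avro using the same
# per-feature schema. The helper name _decode_complex_features is hypothetical
# and simply mirrors the encoder above.
from pyspark.sql.avro.functions import from_avro

def _decode_complex_features(feature_group, dataframe):
    return dataframe.select([
        field["name"]
        if field["name"] not in feature_group.get_complex_features()
        else from_avro(
            field["name"], feature_group._get_feature_avro_schema(field["name"])
        ).alias(field["name"])
        for field in json.loads(feature_group.avro_schema)["fields"]
    ])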
"header", True).load("dbfs:/databricks-datasets/nyctaxi/tripdata/green/*2019*.csv.gz" ).orderBy(col("lpep_pickup_datetime").asc())) # COMMAND ---------- green_2019_df.count() # COMMAND ---------- display(green_2019_df) # COMMAND ---------- eh_green_avro_df = green_2019_df.select( to_avro(struct(col("*"))).alias("body")) # COMMAND ---------- display(eh_green_avro_df) # COMMAND ---------- green_avro_schema = """ { "type":"record", "name":"topLevelRecord", "fields": [ {"name":"VendorID","type":["int","null"]}, {"name":"lpep_pickup_datetime","type":["string","null"]},
explode_df = value_df.selectExpr(
    "value.InvoiceNumber", "value.CreatedTime", "value.StoreID",
    "value.PosID", "value.CustomerType", "value.CustomerCardNo",
    "value.DeliveryType", "value.DeliveryAddress.City",
    "value.DeliveryAddress.State", "value.DeliveryAddress.PinCode",
    "explode(value.InvoiceLineItems) as LineItem")

flattened_df = explode_df \
    .withColumn("ItemCode", expr("LineItem.ItemCode")) \
    .withColumn("ItemDescription", expr("LineItem.ItemDescription")) \
    .withColumn("ItemPrice", expr("LineItem.ItemPrice")) \
    .withColumn("ItemQty", expr("LineItem.ItemQty")) \
    .withColumn("TotalValue", expr("LineItem.TotalValue")) \
    .drop("LineItem")

kafka_target_df = flattened_df.select(
    expr("InvoiceNumber as key"),
    to_avro(struct("*")).alias("value"))

invoice_writer_query = kafka_target_df \
    .writeStream \
    .queryName("Flattened Invoice Writer") \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("topic", "invoice-items") \
    .outputMode("append") \
    .option("checkpointLocation", "chk-point-dir") \
    .start()

logger.info("Start Writer Query")
invoice_writer_query.awaitTermination()
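# Read-back sketch (an addition, not from the original source): consuming the
# "invoice-items" topic and decoding the Avro value requires the writer schema
# as a JSON string; invoice_schema below is a placeholder you would load from a
# schema file or registry.
from pyspark.sql.avro.functions import from_avro

invoice_reader_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "invoice-items") \
    .load() \
    .select(from_avro("value", invoice_schema).alias("invoice")) \
    .select("invoice.*")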
# flatten the struct to a normal DataFrame
df3 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df3', df3)
# df3 = df2.withColumn('value', to_avro("value", stock_schema))
# print('df2', df2)

df3.createOrReplaceTempView("data")
# keep "value" as a struct: casting it to string would no longer match the
# Avro record schema passed to to_avro below
df4 = spark.sql("""
    SELECT key,
           NAMED_STRUCT('event_time', event_time, 'symbol', symbol,
                        'price', price, 'quantity', quantity) AS value
    FROM data
""")
print('df4', df4)

# selectExpr only accepts SQL expression strings, so the Column returned by
# to_avro has to go through select instead
df5 = df4.select("key", to_avro("value", stock_schema).alias("value"))
print('df5', df5)

# query = (df1.writeStream
#     .outputMode("append")
#     .format("console")
#     .option("truncate", False)
#     .start()
# )
# query.awaitTermination()

query = (df5.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("topic", "stocks-avro")
    .option("checkpointLocation", "/tmp")
    .start())
sqlContext = SQLContext(spark.sparkContext)
spark.sparkContext.setLogLevel('WARN')

staging_data = spark \
    .readStream \
    .format("parquet") \
    .schema(spark.read.parquet("D:\\s3\\bkt-staging-data").schema) \
    .option("path", "D:\\s3\\bkt-staging-data") \
    .load()

staging_data.createOrReplaceTempView("evento")

aggregated_data = sqlContext.sql("""
    SELECT payload.data.codigo_produto_operacional,
           COUNT(*) AS quantidade_eventos_transmitidos,
           COUNT(CASE WHEN payload.data.codigo_empresa = 341 THEN 1 ELSE NULL END) AS quantidade_eventos_transmitidos_sucesso,
           COUNT(CASE WHEN payload.data.codigo_empresa = 350 THEN 1 ELSE NULL END) AS quantidade_eventos_transmitidos_erro
    FROM evento
    GROUP BY payload.data.codigo_produto_operacional""") \
    .withColumn("data", struct("*")) \
    .withColumn("value", concat(lit(magic_byte), lit(id_bytes), to_avro(struct("data"), str(latest_schema)))) \
    .withColumn("headers", array(  # column name must be exactly "headers", with no trailing space
        struct(lit("specversion").alias("key"), lit("1").cast("binary").alias("value")),
        struct(lit("type").alias("key"), lit("").cast("binary").alias("value")),
        struct(lit("source").alias("key"), lit("urn:sigla:gerar-relatorio-transmissao-job").cast("binary").alias("value")),
        struct(lit("id").alias("key"), expr("uuid()").cast("binary").alias("value")),
        struct(lit("time").alias("key"), date_format(current_timestamp(), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast("binary").alias("value")),
        struct(lit("messageversion").alias("key"), lit("1").cast("binary").alias("value")),
        struct(lit("transactionid").alias("key"), lit("").cast("binary").alias("value")),
        struct(lit("correlationid").alias("key"), lit("").cast("binary").alias("value")),
        struct(lit("datacontenttype").alias("key"), lit("application/avro").cast("binary").alias("value"))
    )) \
    .writeStream \
    .format("console") \
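# Companion decode sketch (an assumption, not in the original): the value built
# above follows the Confluent wire format (1 magic byte + 4-byte schema id +
# Avro payload), so a reader has to strip the first 5 bytes before calling
# from_avro; kafka_df stands in for a DataFrame read from the target topic.
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import expr

decoded = kafka_df.select(
    from_avro(
        expr("substring(value, 6, length(value) - 5)"),
        str(latest_schema),
    ).alias("data")
).select("data.*")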
{"name":"tip_amount","type":["double","null"]}, {"name":"tolls_amount","type":["double","null"]}, {"name":"ehail_fee","type":["double","null"]}, {"name":"improvement_surcharge","type":["double","null"]}, {"name":"total_amount","type":["double","null"]}, {"name":"payment_type","type":["int","null"]}, {"name":"trip_type","type":["int","null"]}, {"name":"congestion_surcharge","type":["double","null"]} ] } """ # COMMAND ---------- eh_green_kafka_df = green_2019_df.select( to_avro(struct(col("lpep_pickup_datetime"))).alias("key"), to_avro(struct(col("*"))).alias("value") ) # COMMAND ---------- display(eh_green_kafka_df) # COMMAND ---------- test_read_kafka_df = eh_green_kafka_df.select(from_avro(col("value"),green_avro_schema).alias("value")) display(test_read_kafka_df) # COMMAND ---------- (