Example #1
 def _online_fg_to_avro(self, feature_group, dataframe):
     """Packs all features into named struct to be serialized to single avro/binary
     column. And packs primary key into arry to be serialized for partitioning.
     """
     return dataframe.select([
         # be aware: primary_key array should always be sorted
         to_avro(
             concat(*[
                 col(f).cast("string")
                 for f in sorted(feature_group.primary_key)
             ])).alias("key"),
         to_avro(
             struct([
                 field["name"] for field in json.loads(
                     feature_group.avro_schema)["fields"]
             ]),
             feature_group._get_encoded_avro_schema(),
         ).alias("value"),
     ])
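
The frame returned above carries exactly the binary key/value pair that Spark's Kafka sink expects. A hedged usage sketch of pushing it to a topic (the broker address and topic name are placeholders for illustration, not taken from the original code):

# Hedged sketch, not part of the original helper: write the avro-encoded
# key/value pair to Kafka. Broker and topic below are placeholder assumptions.
encoded_df = self._online_fg_to_avro(feature_group, dataframe)
(encoded_df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")   # assumed broker
    .option("topic", "online_feature_group_topic")          # assumed topic name
    .save())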
Example #2
 def _encode_complex_features(self, feature_group, dataframe):
     """Encodes all complex type features to binary using their avro type as schema."""
     return dataframe.select([
         to_avro(
             field["name"],
             feature_group._get_feature_avro_schema(field["name"]),
         ).alias(field["name"])
         if field["name"] in feature_group.get_complex_features()
         else field["name"]
         for field in json.loads(feature_group.avro_schema)["fields"]
     ])
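
The inverse operation uses the same schema lookup; a hedged round-trip sketch (assuming from_avro from pyspark.sql.avro.functions; this decoding step is not part of the original helper):

from pyspark.sql.avro.functions import from_avro

# Hedged sketch: decode the binary columns produced above back into their
# complex types, reusing the per-feature Avro schemas.
encoded_df = self._encode_complex_features(feature_group, dataframe)
decoded_df = encoded_df.select([
    from_avro(field["name"],
              feature_group._get_feature_avro_schema(field["name"])
              ).alias(field["name"])
    if field["name"] in feature_group.get_complex_features()
    else field["name"]
    for field in json.loads(feature_group.avro_schema)["fields"]
])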
    "header",
    True).load("dbfs:/databricks-datasets/nyctaxi/tripdata/green/*2019*.csv.gz"
               ).orderBy(col("lpep_pickup_datetime").asc()))

# COMMAND ----------

green_2019_df.count()

# COMMAND ----------

display(green_2019_df)

# COMMAND ----------

eh_green_avro_df = green_2019_df.select(
    to_avro(struct(col("*"))).alias("body"))

# COMMAND ----------

display(eh_green_avro_df)

# COMMAND ----------

green_avro_schema = """
 {
   "type":"record",
   "name":"topLevelRecord",
   "fields":
     [
       {"name":"VendorID","type":["int","null"]},
       {"name":"lpep_pickup_datetime","type":["string","null"]},
    explode_df = value_df.selectExpr("value.InvoiceNumber", "value.CreatedTime", "value.StoreID",
                                     "value.PosID", "value.CustomerType", "value.CustomerCardNo", "value.DeliveryType",
                                     "value.DeliveryAddress.City",
                                     "value.DeliveryAddress.State", "value.DeliveryAddress.PinCode",
                                     "explode(value.InvoiceLineItems) as LineItem")

    flattened_df = explode_df \
        .withColumn("ItemCode", expr("LineItem.ItemCode")) \
        .withColumn("ItemDescription", expr("LineItem.ItemDescription")) \
        .withColumn("ItemPrice", expr("LineItem.ItemPrice")) \
        .withColumn("ItemQty", expr("LineItem.ItemQty")) \
        .withColumn("TotalValue", expr("LineItem.TotalValue")) \
        .drop("LineItem")

    kafka_target_df = flattened_df.select(expr("InvoiceNumber as key"),
                                          to_avro(struct("*")).alias("value"))

    invoice_writer_query = kafka_target_df \
        .writeStream \
        .queryName("Flattened Invoice Writer") \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("topic", "invoice-items") \
        .outputMode("append") \
        .option("checkpointLocation", "chk-point-dir") \
        .start()

    logger.info("Start Writer Query")
    invoice_writer_query.awaitTermination()
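
To sanity-check the sink, the topic can be read back and decoded with from_avro. A hedged sketch (the broker address and the "invoice-items.avsc" schema file are assumptions, not taken from the source):

from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import col

# Hedged read-back sketch: consume the avro-encoded invoice items and decode them.
# "invoice-items.avsc" is an assumed file holding the flattened record schema.
invoice_item_schema = open("invoice-items.avsc").read()

kafka_source_df = (spark.readStream
                   .format("kafka")
                   .option("kafka.bootstrap.servers", "localhost:9092")
                   .option("subscribe", "invoice-items")
                   .option("startingOffsets", "earliest")
                   .load())

decoded_df = kafka_source_df.select(
    col("key").cast("string").alias("InvoiceNumber"),
    from_avro(col("value"), invoice_item_schema).alias("value"))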
Example #5
# flatten the struct to a normal DataFrame
df3 = df2.select(*(df2.columns), col("value2.*")).drop('value2')
print('df3', df3)

# df3 = df2.withColumn('value', to_avro("value", stock_schema))
# print('df2', df2)

df3.createOrReplaceTempView("data")
df4 = spark.sql("""
SELECT key, cast(NAMED_STRUCT('event_time', event_time, 'symbol', symbol, 'price', price, 'quantity', quantity) as string) AS value 
FROM data
""")
print('df4', df4)

# to_avro returns a Column, so it must go through select rather than selectExpr
df5 = df4.select(col("key"), to_avro("value", stock_schema).alias("value"))
print('df5', df5)

# query = (df1.writeStream
#     .outputMode("append")
#     .format("console")
#     .option("truncate", False)
#     .start()
#     )

# query.awaitTermination()

query = (df5.writeStream
         .format("kafka")
         .option("kafka.bootstrap.servers", brokers)
         .option("topic", "stocks-avro")
         .option("checkpointLocation", "/tmp")
         .start())
sqlContext = SQLContext(spark.sparkContext)

spark.sparkContext.setLogLevel('WARN')

staging_data = spark \
    .readStream \
    .format("parquet") \
    .schema(spark.read.parquet("D:\\s3\\bkt-staging-data").schema) \
    .option("path", "D:\\s3\\bkt-staging-data") \
    .load()

staging_data.createOrReplaceTempView("evento")
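
The magic_byte and id_bytes values concatenated into "value" below are not defined in this fragment; they presumably implement the Confluent Schema Registry wire format (a 0x00 magic byte followed by the 4-byte big-endian schema id). A hedged sketch of how they could be built:

import struct as binstruct  # renamed so it does not clash with pyspark.sql.functions.struct

# Hedged assumption: Confluent wire-format framing for the Avro payload.
magic_byte = bytes([0])               # single 0x00 magic byte
id_bytes = binstruct.pack(">I", 42)   # 4-byte big-endian schema id; 42 is a placeholder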

aggregated_data = sqlContext.sql("""
    SELECT payload.data.codigo_produto_operacional,
           COUNT(*) AS quantidade_eventos_transmitidos,
           COUNT(CASE WHEN payload.data.codigo_empresa = 341 THEN 1 ELSE NULL END) AS quantidade_eventos_transmitidos_sucesso,
           COUNT(CASE WHEN payload.data.codigo_empresa = 350 THEN 1 ELSE NULL END) AS quantidade_eventos_transmitidos_erro
    FROM evento
    GROUP BY payload.data.codigo_produto_operacional
    """) \
    .withColumn("data", struct("*")) \
    .withColumn("value", concat(lit(magic_byte), lit(id_bytes), to_avro(struct("data"), str(latest_schema)))) \
    .withColumn("headers ",
        array(
            struct(lit("specversion").alias("key"), lit("1").cast("binary").alias("value")),
            struct(lit("type").alias("key"), lit("").cast("binary").alias("value")),
            struct(lit("source").alias("key"), lit("urn:sigla:gerar-relatorio-transmissao-job").cast("binary").alias("value")),
            struct(lit("id").alias("key"), expr("uuid()").cast("binary").alias("value")),
            struct(lit("time").alias("key"), date_format(current_timestamp(), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").cast("binary").alias("value")),
            struct(lit("messageversion").alias("key"), lit("1").cast("binary").alias("value")),
            struct(lit("transactionid").alias("key"), lit("").cast("binary").alias("value")),
            struct(lit("correlationid").alias("key"), lit("").cast("binary").alias("value")),
            struct(lit("datacontenttype").alias("key"), lit("application/avro").cast("binary").alias("value"))
        )
    ) \
    .writeStream \
    .format("console") \
Example #7
       {"name":"tip_amount","type":["double","null"]},
       {"name":"tolls_amount","type":["double","null"]},
       {"name":"ehail_fee","type":["double","null"]},
       {"name":"improvement_surcharge","type":["double","null"]},
       {"name":"total_amount","type":["double","null"]},
       {"name":"payment_type","type":["int","null"]},
       {"name":"trip_type","type":["int","null"]},
       {"name":"congestion_surcharge","type":["double","null"]}
     ]
}
"""

# COMMAND ----------

eh_green_kafka_df = green_2019_df.select(
  to_avro(struct(col("lpep_pickup_datetime"))).alias("key"), 
  to_avro(struct(col("*"))).alias("value")
)
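
With both columns avro-encoded, the frame is ready for a Kafka-protocol sink. A hedged batch-write sketch (endpoint and topic are placeholders, not part of the notebook):

# Hedged sketch: batch-write the avro-encoded key/value pairs to a Kafka endpoint.
(eh_green_kafka_df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")   # placeholder endpoint
    .option("topic", "green-trips-avro")                    # placeholder topic
    .save())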

# COMMAND ----------

display(eh_green_kafka_df)

# COMMAND ----------

test_read_kafka_df = eh_green_kafka_df.select(from_avro(col("value"), green_avro_schema).alias("value"))
display(test_read_kafka_df)

# COMMAND ----------
