def read_input(self):
    config = self.config
    spark = self.spark

    kafka_server = config["kafka_server"]
    kafka_topic = config["kafka_topic"]
    schema_input = config["schema_input"]

    # Subscribe to the Kafka topic from the earliest offset and keep only the
    # message value, cast to a string.
    startInput = (spark
                  .readStream
                  .format("kafka")
                  .option("kafka.bootstrap.servers", kafka_server)
                  .option("subscribe", kafka_topic)
                  .option("startingOffsets", "earliest")
                  .load()
                  .selectExpr("CAST(value AS STRING) as value")
                  )

    # Derive the payload schema from the sample JSON document in the config.
    inputAnomalySchema = schema_of_json(F.lit(json.dumps(schema_input)))

    # Parse the JSON payload, flatten it, and normalize the timestamp column.
    return (
        startInput.withColumn("data", from_json("value", inputAnomalySchema))
        .select("data.*")
        .withColumn("timestamp", to_timestamp("@timestamp"))
        .withColumn(
            "@timestamp",
            date_format("timestamp", constants.TIMESTAMP_FORMAT)
        )
    )
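# Hedged usage sketch (not from the original source): one way the streaming
# DataFrame returned by read_input() could be consumed for debugging, using a
# console sink. The `pipeline` instance name and the sink options are
# assumptions, not part of the original pipeline.
parsed_stream = pipeline.read_input()
debug_query = (
    parsed_stream.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", "false")
    .start()
)
debug_query.awaitTermination()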
def get_schema_from_json(col_name):
    # get the json string and remove the header values
    json_typed = json.loads(pivotted_df.select(col_name).first()[0])
    del json_typed['header']

    # create the schema
    return schema_of_json(json.dumps(json_typed))
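# Hedged usage sketch (assumption): apply the schema derived above to parse a
# JSON column of pivotted_df into a struct and flatten it. The column name
# "payload" is illustrative, and from_json/col are assumed to be imported from
# pyspark.sql.functions.
payload_schema = get_schema_from_json("payload")
parsed_df = (
    pivotted_df
    .withColumn("parsed", from_json(col("payload"), payload_schema))
    .select("parsed.*")
)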
kafka_to_delta.py \
    >log/kafka_to_delta.log 2>&1 &
"""

spark = get_spark()

# Read the stream from Kafka
kafka_reader = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", topic) \
    .load()

# Derive the payload schema from a sample JSON-formatted record
schema = f.schema_of_json(
    """{"userId":"44303","movieId":"3338","rating":"3.5","timestamp":"2020-05-06T09:40:14.603+08:00"}"""
)

# Parse the Kafka value column and project the rating fields
rating_df = kafka_reader \
    .withColumn("value", col("value").cast("string")) \
    .withColumn("value", f.from_json("value", schema)) \
    .selectExpr("value.movieId",
                "cast(value.rating as float)",
                "value.userId",
                "cast(value.timestamp as timestamp)",
                "timestamp as kafka_timestamp",
                "to_date(value.timestamp) as dt"
                )

query = rating_df.writeStream \
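# Hedged sketch (assumption): the original snippet is cut off after
# `rating_df.writeStream`. A typical continuation would append the parsed
# ratings to a Delta table; the output path, checkpoint location, and
# partitioning below are illustrative, not taken from the source.
query = rating_df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .partitionBy("dt") \
    .option("checkpointLocation", "/tmp/checkpoints/kafka_to_delta") \
    .start("/tmp/delta/ratings")
query.awaitTermination()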
# COMMAND ----------

df = spark.readStream.format('delta').load(untappd_raw_delta_path)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### extract venues

# COMMAND ----------

from pyspark.sql.functions import col, json_tuple, from_json, schema_of_json

schema = schema_of_json(
    '''{"venue_id":9917985,"venue_name":"Untappd at Home","venue_slug":"untappd-at-home","primary_category_key":"Residence","primary_category":"Residence","parent_category_id":"4e67e38e036454776db1fb3a","categories":{"count":1,"items":[{"category_key":"home_private","category_name":"Home (private)","category_id":"4bf58dd8d48988d103941735","is_primary":true}]},"location":{"venue_address":"","venue_city":"","venue_state":"Everywhere","venue_country":"United States","lat":34.2347,"lng":-77.9482},"contact":{"twitter":"","venue_url":""},"foursquare":{"foursquare_id":"5e7b4d99c91df60008e8b168","foursquare_url":"https://4sq.com/3bDWYuq"},"venue_icon":{"sm":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_64.png","md":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_88.png","lg":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_176.png?v=1"},"is_verified":true}'''
)

df = df.withColumn("venue", from_json(df.venue, schema))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Badges

# COMMAND ----------

from pyspark.sql.functions import explode

df_badges = df.select(df.checkin_id,
                      df.badges.count.alias('badge_count'),
                      df.badges.retro_status.alias('retro'),
                      explode(df.badges.items).alias('items'))
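# COMMAND ----------

# MAGIC %md
# MAGIC ##### flatten venue fields (hedged sketch)

# COMMAND ----------

# Illustrative sketch, not from the original notebook: flatten the parsed
# `venue` struct into its own DataFrame, which would naturally follow the
# venue-parsing cell above. The selected fields come from the sample JSON used
# for schema_of_json; the df_venues name is an assumption.
df_venues = df.select(df.checkin_id,
                      col('venue.venue_id').alias('venue_id'),
                      col('venue.venue_name').alias('venue_name'),
                      col('venue.location.venue_city').alias('venue_city'),
                      col('venue.location.venue_country').alias('venue_country'),
                      col('venue.location.lat').alias('lat'),
                      col('venue.location.lng').alias('lng'))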