# Show a single column by name.
df_json.select("First_Name").show()

# Derive a new column: dob minus one, cast to integer, exposed as "changed_dob".
adjusted_dob = (df_json["dob"] - 1).cast(IntegerType())
df_json.select(df_json["First_Name"], df_json["dob"], adjusted_dob.alias("changed_dob")).show()
#+----------+----+-----------+
#|First_Name| dob|changed_dob|
#+----------+----+-----------+
#|     DAVID|2013|       2012|
#|    JAYDEN|2013|       2012|
#|      RUBY|2014|       2013|
#|     MOSHE|2012|       2011|
#|     ETHAN|2015|       2014|
#|     EDDIE|2012|       2011|
#|    RACHEL|2014|       2013|
#|     ELENA|2014|       2013|
#|    MIGUEL|2013|       2012|
#|      ROSY|2015|       2014|
#+----------+----+-----------+

# Row filter: keep rows whose FATHER_INCOME exceeds 3750.
high_income = df_json.filter(df_json["FATHER_INCOME"] > 3750)
high_income.show()

# Aggregate: row count per distinct DOB value.
dob_counts = df_json.groupBy("DOB").count()
dob_counts.show()

# Register a GLOBAL temp view — visible across sessions under the
# reserved "global_temp" database, unlike a plain temp view.
df_json.createOrReplaceGlobalTempView("baby_gv")

# Query the global view from the current session.
spark.sql("select race from global_temp.baby_gv").show()

# Query the same global view from a brand-new session to demonstrate
# that global temp views are shared across sessions.
spark.newSession().sql("select state from global_temp.baby_gv").show()
.option("kafka.bootstrap.servers", "192.168.1.100:9092")\ .option("subscribe", "json_topic")\ .option("startingOffsets", "earliest")\ .load() df.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value")\ .writeStream\ .format("kafka")\ .outputMode("append") \ .option("kafka.bootstrap.servers", "192.168.1.100:9092")\ .option("topic", "josn_data_topic")\ .start()\ .awaitTermination() #newSession() spark_session = SparkSession.newSession()