def test_explode(self):
    """Exercise explode / explode_outer / posexplode_outer on array and map columns."""
    from pyspark.sql.functions import explode, explode_outer, posexplode_outer

    rows = [
        Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
        Row(a=1, intlist=[], mapfield={}),
        Row(a=1, intlist=None, mapfield=None),
    ]
    data = self.spark.createDataFrame(self.sc.parallelize(rows))

    # explode on an array column produces one row per element.
    exploded = data.select(explode(data.intlist).alias("a")).select("a").collect()
    for idx, expected in enumerate([1, 2, 3]):
        self.assertEqual(exploded[idx][0], expected)

    # explode on a map column produces (key, value) pairs.
    kv = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
    self.assertEqual(kv[0][0], "a")
    self.assertEqual(kv[0][1], "b")

    # posexplode_outer emits (position, element) and keeps rows whose
    # collection is empty or None, filling with nulls.
    self.assertEqual(
        [tuple(r) for r in data.select(posexplode_outer("intlist")).collect()],
        [(0, 1), (1, 2), (2, 3), (None, None), (None, None)],
    )
    self.assertEqual(
        [tuple(r) for r in data.select(posexplode_outer("mapfield")).collect()],
        [(0, 'a', 'b'), (None, None, None), (None, None, None)],
    )

    # explode_outer likewise preserves empty/None rows as nulls.
    self.assertEqual(
        [r[0] for r in data.select(explode_outer("intlist")).collect()],
        [1, 2, 3, None, None],
    )
    self.assertEqual(
        [tuple(r) for r in data.select(explode_outer("mapfield")).collect()],
        [('a', 'b'), (None, None), (None, None)],
    )
# databaseDF.show(truncate=False)
# Languages changes
# Persist the database slice as headered CSV, then derive the per-language table.
databaseDF.write.mode("overwrite").option("header", "true") \
    .csv("s3a://nice-uat-data-warehouse/stackoverflow/database/")

# Split the ';'-delimited language columns into arrays, then pair each
# worked-with language with the same-position desired language:
# posexplode_outer yields (pos, element) per array entry — pos is reused to
# index into the parallel LanguageDesireNextYear array (assumes the two
# arrays are positionally aligned — TODO confirm upstream).
languageDF = (
    initialDF
    .select(["Respondent", "LanguageWorkedWith", "LanguageDesireNextYear"])
    .withColumn("LanguageWorkedWith", f.split("LanguageWorkedWith", ";"))
    .withColumn("LanguageDesireNextYear", f.split("LanguageDesireNextYear", ";"))
    .select("*", f.posexplode_outer("LanguageWorkedWith").alias("pos", "LanguageWorkedWith2"))
    .withColumn("LanguageDesireNextYear", f.expr("LanguageDesireNextYear[pos]"))
    .selectExpr("Respondent", "LanguageDesireNextYear",
                "LanguageWorkedWith2 as LanguageWorkedWith")
)

languageDF.write.mode("overwrite").option("header", "true") \
    .csv("s3a://nice-uat-data-warehouse/stackoverflow/language/")
languageDF.show(n=30, truncate=False)
session.stop()
# explode on the array column: one output row per known language.
df2 = df.select(df.name, explode(df.knownLanguages))
df2.printSchema()
df2.show()

from pyspark.sql.functions import explode
# explode on the map column: one output row per (key, value) property.
df3 = df.select(df.name, explode(df.properties))
df3.printSchema()
df3.show()

from pyspark.sql.functions import explode_outer
# explode_outer keeps rows whose collection is empty or None.
df.select(df.name, explode_outer(df.knownLanguages)).show()   # with array
df.select(df.name, explode_outer(df.properties)).show()       # with map

from pyspark.sql.functions import posexplode
# posexplode adds the element position alongside the value.
df.select(df.name, posexplode(df.knownLanguages)).show()      # with array
df.select(df.name, posexplode(df.properties)).show()          # with map

from pyspark.sql.functions import posexplode_outer
# posexplode_outer: position output plus null-preserving semantics.
df.select("name", posexplode_outer("knownLanguages")).show()  # with array
df.select(df.name, posexplode_outer(df.properties)).show()    # with map
# END
# explode on the array column: one output row per known language.
df2 = df.select(df.name, explode(df.knownLanguages))
df2.printSchema()
df2.show()

from pyspark.sql.functions import explode
# explode on the map column: one output row per (key, value) property.
df3 = df.select(df.name, explode(df.properties))
df3.printSchema()
df3.show()

from pyspark.sql.functions import explode_outer
# explode_outer keeps rows whose collection is empty or None.
df.select(df.name, explode_outer(df.knownLanguages)).show()   # with array
df.select(df.name, explode_outer(df.properties)).show()       # with map

from pyspark.sql.functions import posexplode
# posexplode adds the element position alongside the value.
df.select(df.name, posexplode(df.knownLanguages)).show()      # with array
df.select(df.name, posexplode(df.properties)).show()          # with map

from pyspark.sql.functions import posexplode_outer
# FIX: the original used Scala column syntax ($"name", $"knownLanguages"),
# which is a SyntaxError in Python; plain column-name strings are the
# PySpark equivalent (same form the matching example elsewhere in this
# file uses).
df.select("name", posexplode_outer("knownLanguages")).show()  # with array
df.select(df.name, posexplode_outer(df.properties)).show()    # with map
# END