from pyspark.sql import functions as F


def test_map_to_columns(spark):
    data = [("jose", {"a": "aaa", "b": "bbb"}),
            ("li", {"b": "some_letter", "z": "zed"})]
    df = spark.createDataFrame(data, ["first_name", "some_data"])
    # df.withColumn("some_data_a", F.col("some_data")["a"]).show()
    # df.show(truncate=False)
    # df.printSchema()

    # Pull individual map keys into their own columns.
    df \
        .withColumn("some_data_a", F.col("some_data").getItem("a")) \
        .withColumn("some_data_b", F.col("some_data").getItem("b")) \
        .withColumn("some_data_z", F.col("some_data").getItem("z"))
    # .show(truncate=False)

    # Same result with a hard-coded list of keys.
    cols = [F.col("first_name")] + list(
        map(lambda f: F.col("some_data").getItem(f).alias(str(f)),
            ["a", "b", "z"]))
    # df.select(cols).show()

    # Collect the distinct map keys, then build the columns dynamically.
    keys_df = df.select(F.explode(F.map_keys(F.col("some_data")))).distinct()
    # keys_df.show()
    keys = list(map(lambda row: row[0], keys_df.collect()))
    # print(keys)
    key_cols = list(
        map(lambda f: F.col("some_data").getItem(f).alias(str(f)), keys))
    # print(key_cols)
    final_cols = [F.col("first_name")] + key_cols
from pyspark.sql.functions import explode, map_keys


def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """Return a list of the distinct keys found in a map column.

    Set is_col_arr_map to True if the column is an array of maps.
    Otherwise the column is assumed to be a map.
    """
    if is_col_arr_map:
        # Flatten the array so each row holds a single map.
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect()
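# Minimal usage sketch (an illustration, not part of the original snippet):
# assumes an active SparkSession named `spark`; the sample rows are made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Plain map column: Map[String, String].
map_df = spark.createDataFrame(
    [("a1", {"x": "1", "y": "2"}), ("a2", {"y": "3", "z": "4"})],
    ["id", "attrs"])
print(get_distinct_keys(map_df, "attrs"))  # e.g. ['x', 'y', 'z'] (order not guaranteed)

# Array-of-maps column: pass is_col_arr_map=True so the array is exploded first.
arr_df = spark.createDataFrame(
    [("a1", [{"x": "1"}, {"y": "2"}])],
    ["id", "attrs"])
print(get_distinct_keys(arr_df, "attrs", is_col_arr_map=True))  # e.g. ['x', 'y']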
def transform(self):
    """Transforms the data with the given `self.schema` to the format
    suitable for the SQL queries.

    :return: PySpark DataFrame
    """
    self.df = self.df \
        .withColumn('items', F.explode('items')) \
        .withColumn('ID', F.map_keys("items")[0]) \
        .withColumn('qp', F.map_values('items')[0]) \
        .withColumn('quantity', F.col('qp').getItem('quantity').astype('int')) \
        .withColumn('price', F.col('qp').getItem('price')) \
        .select(['user', 'timestamp', 'ID', 'quantity', 'price'])
    return self.df
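# A standalone sketch of the same transformation on illustrative data. The
# input shape is an assumption inferred from the column accesses above: each
# element of `items` is a single-entry map from a product ID to a
# {quantity, price} map. Not part of the original class.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("u1", "2021-01-01 10:00:00",
      [{"sku-1": {"quantity": "2", "price": "9.99"}},
       {"sku-2": {"quantity": "1", "price": "4.50"}}])],
    ["user", "timestamp", "items"])
(sample
 .withColumn("items", F.explode("items"))
 .withColumn("ID", F.map_keys("items")[0])
 .withColumn("qp", F.map_values("items")[0])
 .withColumn("quantity", F.col("qp").getItem("quantity").astype("int"))
 .withColumn("price", F.col("qp").getItem("price"))
 .select("user", "timestamp", "ID", "quantity", "price")
 .show())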
def parquet_revalue(vcf, indel_com):
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]
    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(
        F.col("POS")).rangeBetween(Window.unboundedPreceding, Window.currentRow)
    # Forward-fill the sample column within each chromosome.
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name, ignorenulls=True).over(sample_w)).withColumnRenamed(
            "#CHROM", "CHROM")
    # index2dict is a Scala UDF registered elsewhere.
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT")) \
        .selectExpr("CHROM", "POS",
                    "index2dict({}, FORMAT) as {}".format(sample_name, sample_name)) \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))
    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT") \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"),
                             F.array_join(F.map_values(F.col(sample_name)), ":")))
    value_union = null_not_value.union(null_value).withColumnRenamed(
        "CHROM", "#CHROM")
    return value_union
#! python3
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName(
    "PySpark example").enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# Read data
df = spark.sql(
    "SELECT img_label FROM sprs_log_basis.model_server_log WHERE datepart=20190425 LIMIT 10"
)
df.cache()
df.show()

# Get keys
df = df.select(F.map_keys("img_label").alias("keys"))

# Assign index
df = df.withColumn("doc_id", F.monotonically_increasing_id())
NUM_doc = df.count()

# One hot words
df = df.select('*', F.explode('keys').alias('token'))
df.show()

# Calculate TF
TF = df.groupBy("doc_id").agg(F.count("token").alias("doc_len")) \
    .join(df.groupBy("doc_id", "token")
            .agg(F.count("keys").alias("word_count")), ['doc_id']) \
    .withColumn("tf", F.col("word_count") / F.col("doc_len")) \
    .drop("doc_len", "word_count")
TF.cache()

# Calculate IDF
IDF = df.groupBy("token").agg(F.countDistinct("doc_id").alias("df"))
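# The snippet above stops after the document-frequency aggregation. A possible
# continuation, assuming the conventional idf = log(N / df) weighting and a
# per-(doc_id, token) tf * idf score; the IDF/TFIDF names follow the snippet's
# style but are otherwise an assumption.
IDF = IDF.withColumn("idf", F.log(F.lit(NUM_doc) / F.col("df")))
TFIDF = TF.join(IDF, ["token"]) \
    .withColumn("tf_idf", F.col("tf") * F.col("idf")) \
    .select("doc_id", "token", "tf_idf")
TFIDF.show()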
# `dataDictionary` (a list of (name, properties) rows) and `df` are assumed to
# be defined earlier in the original example.
from pyspark.sql.types import StructType, StructField, StringType, MapType

schema = StructType([
    StructField('name', StringType(), True),
    StructField('properties', MapType(StringType(), StringType()), True)
])
df2 = spark.createDataFrame(data=dataDictionary, schema=schema)
df2.printSchema()
df2.show(truncate=False)

# Access map values by key via the RDD API.
df3 = df.rdd.map(lambda x: (x.name, x.properties["hair"], x.properties["eye"])) \
    .toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

# Access map values by key with getItem().
df.withColumn("hair", df.properties.getItem("hair")) \
    .withColumn("eye", df.properties.getItem("eye")) \
    .drop("properties") \
    .show()

# Access map values by key with bracket notation.
df.withColumn("hair", df.properties["hair"]) \
    .withColumn("eye", df.properties["eye"]) \
    .drop("properties") \
    .show()

# Functions
from pyspark.sql.functions import explode, map_keys, col

# Build one column per distinct map key.
keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = keysDF.rdd.map(lambda x: x[0]).collect()
keyCols = list(
    map(lambda x: col("properties").getItem(x).alias(str(x)), keysList))
df.select(df.name, *keyCols).show()
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------
    # ---------------------- Your implementation begins------------------------

    ### category
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))
    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]
    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]
    # -------------------------------------------------------------------------
    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct
    # -------------------------------------------------------------------------
    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
# Project-specific helpers (S3Connection, get_spark_session) are assumed to be
# imported from the surrounding module.
import time
from itertools import chain

import boto3
import prefect
from pyspark.sql.functions import col, create_map, lit, map_concat, map_keys


def write_delta_lookup_table(s3_source_bucket: str,
                             s3_source_bucket_raw_prefix: str,
                             dest_delta_prefix: str, table_name: str,
                             date_hour_path: str, pk: str):
    try:
        s3_connection = S3Connection(default_bucket=None)
        spark = get_spark_session()
        spark.conf.set("spark.sql.mapKeyDedupPolicy", 'LAST_WIN')
        spark_context = spark.sparkContext
        sql_context = spark.builder.getOrCreate()
        logger = prefect.context.get("logger")
        s3_location = f"s3a://{s3_source_bucket}/{s3_source_bucket_raw_prefix}"
        hour_data_path = f"{s3_location}/{date_hour_path}"
        file_list = s3_connection.read_s3_folder(
            s3_folder_path=f"{s3_source_bucket_raw_prefix}/{date_hour_path}",
            s3_bucket=s3_source_bucket)
        logger.info(f"file_list --> {file_list}")
        if len(file_list) <= 0:
            logger.warn(f"No transaction data in path - {hour_data_path}")
        else:
            """
            lookup_table_df = spark_context.textFile(hour_data_path)
            lookup_json_df = sql_context.read.json(lookup_table_df)
            lookup_snapshot_df = lookup_json_df.select("op_type", "op_ts", "after.*")
            logger.info(f"Delta_raw path - {s3_location}/{dest_delta_prefix}")
            lookup_snapshot_df.write.format("delta").mode("append").save(f"{s3_location}/{dest_delta_prefix}")
            sql_context.sql("CREATE DATABASE IF NOT EXISTS test;")
            sql_context.sql(f"CREATE TABLE {table_name}_raw USING DELTA LOCATION '{s3_location}/{dest_delta_prefix}';")
            logger.info(f"Delta write complete in path - {s3_location}/{dest_delta_prefix} . Table name - {table_name}")
            filter = sql_context.sql(f"select row_number() OVER (partition by LIN_OF_BUS_ID order by LIN_OF_BUS_ID, op_ts desc) as row_number, * from {table_name}_raw order by LIN_OF_BUS_ID, current_ts desc ")
            filter.where("row_number =1 and op_type <> 'D'").select("*").drop("op_ts", "op_type").write.format("delta").mode("overwrite").save(
                f"{s3_location}/delta_snapshot/")
            """
            # Rewrite the concatenated JSON records into a JSON array so
            # read.json can parse them.
            lookup_table_df = spark_context.textFile(hour_data_path)
            cleansed_df = lookup_table_df.map(lambda s: s.replace(
                "{\"table", "[{\"table", 1)).map(lambda s: s.replace(
                    "}{\"table", "},{\"table")).map(lambda s: s.replace(
                        "}}", "}}]")).map(lambda s: s.replace("}}],", "}},"))
            lookup_json_df = sql_context.read.json(cleansed_df)
            update_df = lookup_json_df.where("op_type='U'").select("*")
            insert_df = lookup_json_df.where("op_type='I'").select("*")
            delete_df = lookup_json_df.where("op_type='D'").select("*")

            # Handling update transactions
            if update_df.count() != 0:
                lookup_after = update_df.select("after.*")
                lookup_before = update_df.select("before.*")
                logger.info(f'op_ts - {list((lit("op_ts"), "op_ts"))}')
                op_ts = list((lit("op_ts"), "op_ts"))
                op_type = list((lit("op_type"), "op_type"))
                op_ts.extend(op_type)
                # Build key/value pairs for create_map from the before and
                # after images, then merge them; with LAST_WIN dedup policy
                # the "after" values take precedence.
                a = list(
                    chain(*((lit(name), ("after." + name))
                            for name in lookup_after.columns)))
                a.extend(op_ts)
                b = list(
                    chain(*((lit(name), ("before." + name))
                            for name in lookup_before.columns)))
                b.extend(op_ts)
                after = create_map(a).alias("after")
                before = create_map(b).alias("before")
                final = update_df.select(
                    map_concat(before, after).alias("final"))
                keys = final.select(map_keys("final").alias("keys")).first()
                exprs = [
                    col("final").getItem(k).alias(k) for k in keys['keys']
                ]
                raw = final.select(*exprs)
                time.sleep(2)
                raw.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            # Handling insert transactions
            if insert_df.count() != 0:
                i_df = insert_df.select("after.*", "op_type", "op_ts")
                i_df = i_df.select(*(col(c).cast("String").alias(c)
                                     for c in i_df.columns))
                time.sleep(2)
                i_df.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            # Handling delete transactions
            if delete_df.count() != 0:
                d_df = delete_df.select("before.*", "op_type", "op_ts")
                d_df = d_df.select(*(col(c).cast("String").alias(c)
                                     for c in d_df.columns))
                time.sleep(2)
                d_df.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            # Table creation to run window functions
            sql_context.sql("CREATE DATABASE IF NOT EXISTS test;")
            sql_context.sql(f"drop table if exists {table_name}_raw")
            sql_context.sql(
                f"CREATE TABLE {table_name}_raw USING DELTA LOCATION '{s3_location}/{dest_delta_prefix}';"
            )
            logger.info(
                f"Delta write complete in path - {s3_location}/{dest_delta_prefix} . Table name - {table_name}"
            )
            # To support S3 lag (https://issues.apache.org/jira/browse/SPARK-18512)
            time.sleep(5)

            # Sort created table rows and filter out delete transactions and
            # old transactions, keeping the latest row per primary key.
            ordered_df = sql_context.sql(
                f"select row_number() OVER (partition by {pk} order by {pk}, op_ts desc) as row_number, * "
                f"from {table_name}_raw order by {pk}, op_ts desc ")
            filter = ordered_df.where(
                "row_number =1 and op_type <> 'D'").select("*").drop(
                    "op_ts", "op_type")

            # Delete the snapshot folder first, since mode("overwrite") has lag.
            s3 = boto3.resource('s3')
            bucket = s3.Bucket(s3_source_bucket)
            for obj in bucket.objects.filter(
                    Prefix=f"{s3_source_bucket_raw_prefix}/delta_snapshot"):
                s3.Object(bucket.name, obj.key).delete()
            filter.write.format("delta").mode("overwrite").save(
                f"{s3_location}/delta_snapshot/")

            # Optimize Delta table and vacuum files
            # sql_context.sql(f"CREATE TABLE {table_name}_snapshot USING DELTA LOCATION '{s3_location}/delta_snapshot/';")
            # sql_context.sql(f"OPTIMIZE {table_name}_snapshot")
            # sql_context.sql(f"VACUUM {table_name}_snapshot RETAIN 0 HOURS")
    except Exception as e:
        raise Exception('Exception caught - ' + str(e))
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------
    # ---------------------- Your implementation begins------------------------
    res_df = product_data.select(
        product_data.categories[0][0].alias(category_column),
        F.map_keys(product_data.salesRank)[0].alias(bestSalesCategory_column),
        F.map_values(
            product_data.salesRank)[0].alias(bestSalesRank_column)).replace(
                {'': None},
                subset=[
                    category_column, bestSalesCategory_column,
                    bestSalesRank_column
                ])
    stats = res_df.agg(
        F.count("*").alias('count_total'),
        F.avg(bestSalesRank_column).alias('mean_bestSalesRank'),
        F.variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        F.sum(
            F.isnull(category_column).cast('int')).alias('numNulls_category'),
        F.countDistinct(res_df.category).alias('countDistinct_category'),
        F.sum(F.isnull(bestSalesCategory_column).cast('int')).alias(
            'numNulls_bestSalesCategory'),
        F.countDistinct(res_df.bestSalesCategory).alias(
            'countDistinct_bestSalesCategory')).head()
    # -------------------------------------------------------------------------
    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = int(stats[0])
    res['mean_bestSalesRank'] = float(stats[1])
    res['variance_bestSalesRank'] = float(stats[2])
    res['numNulls_category'] = int(stats[3])
    res['countDistinct_category'] = int(stats[4])
    res['numNulls_bestSalesCategory'] = int(stats[5])
    res['countDistinct_bestSalesCategory'] = int(stats[6])
    # -------------------------------------------------------------------------
    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
df3 = df.rdd.map(lambda x: (x.name, x.properties["hair"], x.properties["eye"])) \
    .toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

df.withColumn("hair", df.properties.getItem("hair")) \
    .withColumn("eye", df.properties.getItem("eye")) \
    .drop("properties") \
    .show()

df.withColumn("hair", df.properties["hair"]) \
    .withColumn("eye", df.properties["eye"]) \
    .drop("properties") \
    .show()

from pyspark.sql.functions import explode
df.select(df.name, explode(df.properties)).show()

from pyspark.sql.functions import map_keys
df.select(df.name, map_keys(df.properties)).show()

from pyspark.sql.functions import map_values
df.select(df.name, map_values(df.properties)).show()

# from pyspark.sql.functions import explode, map_keys
# keysDF = df.select(explode(map_keys(df.properties))).distinct()
# keysList = keysDF.rdd.map(lambda x: x[0]).collect()
# print(keysList)
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------
    # ---------------------- Your implementation begins------------------------
    first_item_ = product_data['categories'][0][0]
    processed_ = product_data.withColumn(category_column, first_item_)
    null_ = F.when(processed_.category == '',
                   None).otherwise(processed_.category)
    processed_ = processed_.withColumn(category_column, null_)
    map_key = F.map_keys('salesRank')[0]
    processed_ = processed_.withColumn('bestSalesCategory', map_key)
    map_value = F.map_values('salesRank')[0]
    processed_ = processed_.withColumn('bestSalesRank', map_value)

    count_total, mean_bestSalesRank, variance_bestSalesRank = processed_.agg(
        F.count('asin'), F.mean('bestSalesRank'),
        F.variance('bestSalesRank')).collect()[0]

    countDistinct_category = processed_.filter(processed_["category"] != '')
    countDistinct_category = countDistinct_category.groupBy("category")
    countDistinct_category = countDistinct_category.agg(
        F.countDistinct("category")).count()

    sales = processed_.select('bestSalesCategory').filter(
        processed_.bestSalesCategory.isNotNull())
    numNulls_bestSalesCategory, temp = processed_.agg(
        F.sum((F.isnull(processed_[bestSalesCategory_column])).cast("int")),
        F.sum((F.isnull(processed_[bestSalesCategory_column])).cast("int"))
    ).collect()[0]
    numNulls_category, temp = processed_.agg(
        F.sum((F.isnull(processed_[category_column])).cast("int")),
        F.sum((F.isnull(processed_[category_column])).cast("int"))
    ).collect()[0]
    countDistinct_bestSalesCategory, temp = sales.agg(
        F.countDistinct(processed_.bestSalesCategory).alias('bestSalesCategory'),
        F.countDistinct(processed_.bestSalesCategory).alias('bestSalesCategory')
    ).collect()[0]
    # -------------------------------------------------------------------------
    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_bestSalesRank'] = mean_bestSalesRank
    res['variance_bestSalesRank'] = variance_bestSalesRank
    res['numNulls_category'] = numNulls_category
    res['countDistinct_category'] = countDistinct_category
    res['numNulls_bestSalesCategory'] = numNulls_bestSalesCategory
    res['countDistinct_bestSalesCategory'] = countDistinct_bestSalesCategory
    print(res)
    # -------------------------------------------------------------------------
    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res