def test_map_to_columns(spark):
    data = [("jose", {
        "a": "aaa",
        "b": "bbb"
    }), ("li", {
        "b": "some_letter",
        "z": "zed"
    })]
    df = spark.createDataFrame(data, ["first_name", "some_data"])
    # df.withColumn("some_data_a", F.col("some_data")["a"]).show()
    # df.show(truncate=False)
    # df.printSchema()
    df \
        .withColumn("some_data_a", F.col("some_data").getItem("a")) \
        .withColumn("some_data_b", F.col("some_data").getItem("b")) \
        .withColumn("some_data_z", F.col("some_data").getItem("z"))
    # .show(truncate=False)

    cols = [F.col("first_name")] + list(
        map(lambda f: F.col("some_data").getItem(f).alias(str(f)),
            ["a", "b", "z"]))
    # df.select(cols).show()

    keys_df = df.select(F.explode(F.map_keys(F.col("some_data")))).distinct()
    # keys_df.show()

    keys = list(map(lambda row: row[0], keys_df.collect()))
    # print(keys)

    key_cols = list(
        map(lambda f: F.col("some_data").getItem(f).alias(str(f)), keys))
    # print(key_cols)

    final_cols = [F.col("first_name")] + key_cols
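
    # The test builds final_cols but never selects them; a minimal continuation
    # (not part of the original) would project one column per distinct map key:
    # df.select(final_cols).show(truncate=False)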
Example #2
from pyspark.sql.functions import explode, map_keys


def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """Return the list of distinct keys found in a map column.

    Set is_col_arr_map to True if the column is an array of maps;
    otherwise the column is assumed to be a map.
    """
    if is_col_arr_map:
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect()
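
A minimal usage sketch (the sample data and the column name "props" are illustrative, not taken from the original):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [({"a": "1", "b": "2"},), ({"b": "3", "z": "4"},)], ["props"])
print(get_distinct_keys(sample_df, "props"))  # e.g. ['a', 'b', 'z'] (order not guaranteed)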
Example #3
    def transform(self):
        """
        transforms the data with the given `self.schema` to the format suitable for the SQL queries

        :return: Pyspark DF
        """
        self.df = self.df\
            .withColumn('items', F.explode('items')) \
            .withColumn('ID', F.map_keys("items")[0]) \
            .withColumn('qp', F.map_values('items')[0]) \
            .withColumn('quantity', F.col('qp').getItem('quantity').astype('int')) \
            .withColumn('price', F.col('qp').getItem('price')) \
            .select(['user', 'timestamp', 'ID', 'quantity', 'price'])

        return self.df
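
A standalone sketch of the same chain, assuming each element of `items` is a single-entry map from an item ID to a map holding `quantity` and `price` (my reading of the code above; the sample data and session setup are not from the original):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [("u1", "2021-01-01", [{"item_1": {"quantity": "2", "price": "9.99"}}])],
    ["user", "timestamp", "items"])
raw.withColumn("items", F.explode("items")) \
   .withColumn("ID", F.map_keys("items")[0]) \
   .withColumn("qp", F.map_values("items")[0]) \
   .withColumn("quantity", F.col("qp").getItem("quantity").astype("int")) \
   .withColumn("price", F.col("qp").getItem("price")) \
   .select("user", "timestamp", "ID", "quantity", "price") \
   .show()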
Example #4
def parquet_revalue(vcf, indel_com):
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]

    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(
        F.col("POS")).rangeBetween(Window.unboundedPreceding,
                                   Window.currentRow)
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name,
               ignorenulls=True).over(sample_w)).withColumnRenamed(
                   "#CHROM", "CHROM")

    # index2dict is a Scala UDF assumed to be registered on the Spark session elsewhere
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT"))\
                         .selectExpr("CHROM", "POS","index2dict({}, FORMAT) as {}".format(sample_name, sample_name))\
                         .withColumn(sample_name,  F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))

    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT")\
                     .withColumn(sample_name, F.concat(F.lit("./.:"), F.array_join(F.map_values(F.col(sample_name)), ":")))

    value_union = null_not_value.union(null_value).withColumnRenamed(
        "CHROM", "#CHROM")
    return value_union
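
The window at the top of parquet_revalue is a per-chromosome forward fill: F.last(..., ignorenulls=True) over an unbounded-preceding range carries the last non-null sample value down the POS order. A minimal standalone sketch of that pattern, with made-up data:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
fill_df = spark.createDataFrame(
    [("chr1", 1, "x"), ("chr1", 2, None), ("chr1", 3, "y")],
    ["CHROM", "POS", "val"])
w = Window.partitionBy("CHROM").orderBy("POS") \
          .rangeBetween(Window.unboundedPreceding, Window.currentRow)
fill_df.withColumn("val", F.last("val", ignorenulls=True).over(w)).show()
# POS 2 picks up "x" from POS 1; POS 3 keeps its own "y".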
Example #5
#! python3
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName(
    "PySpark example").enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("WARN")
# Read data
df = spark.sql(
    "SELECT img_label FROM sprs_log_basis.model_server_log WHERE datepart=20190425 LIMIT 10"
)
df.cache()
df.show()
# Get keys
df = df.select(F.map_keys("img_label").alias("keys"))
# Assign index
df = df.withColumn("doc_id", F.monotonically_increasing_id())
NUM_doc = df.count()
# Explode map keys: one token per row
df = df.select('*', F.explode('keys').alias('token'))
df.show()
# Calculate TF
TF = df.groupBy("doc_id").agg(F.count("token").alias("doc_len")) \
    .join(df.groupBy("doc_id", "token")
          .agg(F.count("keys").alias("word_count")), ['doc_id']) \
    .withColumn("tf", F.col("word_count") / F.col("doc_len")) \
    .drop("doc_len", "word_count")
TF.cache()
# Document frequency per token (used for the IDF/TF-IDF step below)
IDF = df.groupBy("token").agg(F.countDistinct("doc_id").alias("df"))
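
The script stops after the document-frequency aggregation; a possible continuation (not in the original, and the +1 smoothing is my choice) joins TF with IDF and computes a TF-IDF weight:

TFIDF = TF.join(IDF, ["token"]) \
    .withColumn("idf", F.log((F.lit(NUM_doc) + 1) / (F.col("df") + 1))) \
    .withColumn("tf_idf", F.col("tf") * F.col("idf"))
TFIDF.show()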
Example #6
from pyspark.sql.types import StructType, StructField, StringType, MapType

schema = StructType([
    StructField('name', StringType(), True),
    StructField('properties', MapType(StringType(), StringType()), True)
])
df2 = spark.createDataFrame(data=dataDictionary, schema=schema)
df2.printSchema()
df2.show(truncate=False)

df3=df.rdd.map(lambda x: \
    (x.name,x.properties["hair"],x.properties["eye"])) \
    .toDF(["name","hair","eye"])
df3.printSchema()
df3.show()

df.withColumn("hair",df.properties.getItem("hair")) \
  .withColumn("eye",df.properties.getItem("eye")) \
  .drop("properties") \
  .show()

df.withColumn("hair",df.properties["hair"]) \
  .withColumn("eye",df.properties["eye"]) \
  .drop("properties") \
  .show()

# Functions
from pyspark.sql.functions import explode, map_keys, col
keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = keysDF.rdd.map(lambda x: x[0]).collect()
keyCols = list(
    map(lambda x: col("properties").getItem(x).alias(str(x)), keysList))
df.select(df.name, *keyCols).show()
Example #7
File: dsc102pa2.py  Project: jonxsong/ESFSA
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    ### category
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))

    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]

    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:

    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
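
The bestSalesCategory / bestSalesRank extraction above assumes salesRank is (at most) a one-entry map, so index [0] on map_keys/map_values picks its only pair (and yields null for empty or null maps). A tiny illustration with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([({"Books": 12345},)], ["salesRank"])
demo.select(F.map_keys("salesRank")[0].alias("bestSalesCategory"),
            F.map_values("salesRank")[0].alias("bestSalesRank")).show()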
Example #8
import time
from itertools import chain

import boto3
import prefect
from pyspark.sql.functions import col, create_map, lit, map_concat, map_keys

# S3Connection and get_spark_session are assumed to be project-specific helpers
# importable from the surrounding package.


def write_delta_lookup_table(s3_source_bucket: str,
                             s3_source_bucket_raw_prefix: str,
                             dest_delta_prefix: str, table_name: str,
                             date_hour_path: str, pk: str):
    try:
        s3_connection = S3Connection(default_bucket=None)
        spark = get_spark_session()
        spark.conf.set("spark.sql.mapKeyDedupPolicy", 'LAST_WIN')
        spark_context = spark.sparkContext
        sql_context = spark.builder.getOrCreate()
        logger = prefect.context.get("logger")

        s3_location = f"s3a://{s3_source_bucket}/{s3_source_bucket_raw_prefix}"
        hour_data_path = f"{s3_location}/{date_hour_path}"
        file_list = s3_connection.read_s3_folder(
            s3_folder_path=f"{s3_source_bucket_raw_prefix}/{date_hour_path}",
            s3_bucket=s3_source_bucket)
        logger.info(f"file_list --> {file_list}")
        if len(file_list) <= 0:
            logger.warn(f"No transaction data in path - {hour_data_path}")
        else:
            """
            lookup_table_df = spark_context.textFile(hour_data_path)
            lookup_json_df = sql_context.read.json(lookup_table_df)
            lookup_snapshot_df = lookup_json_df.select("op_type", "op_ts","after.*")
            logger.info(f"Delta_raw path - {s3_location}/{dest_delta_prefix}")
            lookup_snapshot_df.write.format("delta").mode("append").save(f"{s3_location}/{dest_delta_prefix}")
            sql_context.sql("CREATE DATABASE IF NOT EXISTS test;")
            sql_context.sql(f"CREATE TABLE {table_name}_raw USING DELTA LOCATION '{s3_location}/{dest_delta_prefix}';")
            logger.info(f"Delta write complete in path - {s3_location}/{dest_delta_prefix} . Table name - {table_name}")

            filter = sql_context.sql(f"select row_number() OVER (partition by LIN_OF_BUS_ID order by LIN_OF_BUS_ID, op_ts desc) as row_number, * from {table_name}_raw order by LIN_OF_BUS_ID, current_ts desc ")

            filter.where("row_number =1 and op_type <> 'D'").select("*").drop("op_ts","op_type").write.format("delta").mode("overwrite").save(
                f"{s3_location}/delta_snapshot/")
            """

            lookup_table_df = spark_context.textFile(hour_data_path)
            cleansed_df = lookup_table_df.map(lambda s: s.replace(
                "{\"table", "[{\"table", 1)).map(lambda s: s.replace(
                    "}{\"table", "},{\"table")).map(lambda s: s.replace(
                        "}}", "}}]")).map(lambda s: s.replace("}}],", "}},"))
            lookup_json_df = sql_context.read.json(cleansed_df)

            update_df = lookup_json_df.where("op_type='U'").select("*")
            insert_df = lookup_json_df.where("op_type='I'").select("*")
            delete_df = lookup_json_df.where("op_type='D'").select("*")

            #Handling update transactions
            if (update_df.count() != 0):
                lookup_after = update_df.select("after.*")
                lookup_before = update_df.select("before.*")
                logger.info(f'op_ts - {list((lit("op_ts"),"op_ts"))}')
                op_ts = list((lit("op_ts"), "op_ts"))
                op_type = list((lit("op_type"), "op_type"))
                op_ts.extend(op_type)

                a = list(
                    chain(*((lit(name), ("after." + name))
                            for name in lookup_after.columns)))
                a.extend(op_ts)
                b = list(
                    chain(*((lit(name), ("before." + name))
                            for name in lookup_before.columns)))
                b.extend(op_ts)

                after = create_map(a).alias("after")
                before = create_map(b).alias("before")
                final = update_df.select(
                    map_concat(before, after).alias("final"))

                keys = final.select(map_keys("final").alias("keys")).first()
                exprs = [
                    col("final").getItem(k).alias(k) for k in keys['keys']
                ]
                raw = final.select(*exprs)
                time.sleep(2)
                raw.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            #Handling insert transactions
            if (insert_df.count() != 0):
                i_df = insert_df.select("after.*", "op_type", "op_ts")
                i_df = i_df.select(*(col(c).cast("String").alias(c)
                                     for c in i_df.columns))
                time.sleep(2)
                i_df.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            #Handling delete transactions
            if (delete_df.count() != 0):
                d_df = delete_df.select("before.*", "op_type", "op_ts")
                d_df = d_df.select(*(col(c).cast("String").alias(c)
                                     for c in d_df.columns))
                time.sleep(2)
                d_df.write.format("delta").mode("append").save(
                    f"{s3_location}/{dest_delta_prefix}")

            #table creation to run window functions
            sql_context.sql("CREATE DATABASE IF NOT EXISTS test;")
            sql_context.sql(f"drop table if exists {table_name}_raw")
            sql_context.sql(
                f"CREATE TABLE {table_name}_raw USING DELTA LOCATION '{s3_location}/{dest_delta_prefix}';"
            )
            logger.info(
                f"Delta write complete in path - {s3_location}/{dest_delta_prefix} . Table name - {table_name}"
            )
            # To support S3 lag (https://issues.apache.org/jira/browse/SPARK-18512)
            time.sleep(5)
            #sort created table rows and filter delete transactions and old transactions
            ordered_df = sql_context.sql(
                f"select row_number() OVER (partition by {pk} order by {pk}, op_ts desc) as row_number, * from {table_name}_raw order by {pk}, op_ts desc "
            )
            filter = ordered_df.where(
                "row_number =1 and op_type <> 'D'").select("*").drop(
                    "op_ts", "op_type")

            #delete snapshot folder since mode(overwrite) has lag
            s3 = boto3.resource('s3')
            bucket = s3.Bucket(s3_source_bucket)
            for obj in bucket.objects.filter(
                    Prefix=f"{s3_source_bucket_raw_prefix}/delta_snapshot"):
                s3.Object(bucket.name, obj.key).delete()
            filter.write.format("delta").mode("overwrite").save(
                f"{s3_location}/delta_snapshot/")

            #Optimize Delta table and vacuum files
            # sql_context.sql(f"CREATE TABLE {table_name}_snapshot USING DELTA LOCATION '{s3_location}/delta_snapshot/';")
            # sql_context.sql(f"OPTIMIZE {table_name}_snapshot")
            # sql_context.sql(f"VACUUM {table_name}_snapshot RETAIN 0 HOURS")
    except Exception as e:
        raise Exception('Exception caught - ' + str(e))
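
The update branch above relies on spark.sql.mapKeyDedupPolicy = LAST_WIN (set near the top of the function) so that map_concat(before, after) keeps the "after" value whenever both images carry the same key. A minimal sketch of that behaviour, with made-up keys:

from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map, lit, map_concat

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN")
before = create_map(lit("a"), lit("old"), lit("b"), lit("kept"))
after = create_map(lit("a"), lit("new"))
spark.range(1).select(map_concat(before, after).alias("merged")).show(truncate=False)
# duplicate key "a" resolves to "new"; "b" keeps "kept"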
Example #9
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    res_df = product_data.select(
        product_data.categories[0][0].alias(category_column),
        F.map_keys(product_data.salesRank)[0].alias(bestSalesCategory_column),
        F.map_values(
            product_data.salesRank)[0].alias(bestSalesRank_column)).replace(
                {'': None},
                subset=[
                    category_column, bestSalesCategory_column,
                    bestSalesRank_column
                ])

    stats = res_df.agg(
        F.count("*").alias('count_total'),
        F.avg(bestSalesRank_column).alias('mean_bestSalesRank'),
        F.variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        F.sum(
            F.isnull(category_column).cast('int')).alias('numNulls_category'),
        F.countDistinct(res_df.category).alias('countDistinct_category'),
        F.sum(F.isnull(bestSalesCategory_column).cast('int')).alias(
            'numNulls_bestSalesCategory'),
        F.countDistinct(res_df.bestSalesCategory).alias(
            'countDistinct_bestSalesCategory')).head()

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:

    res['count_total'] = int(stats[0])
    res['mean_bestSalesRank'] = float(stats[1])
    res['variance_bestSalesRank'] = float(stats[2])
    res['numNulls_category'] = int(stats[3])
    res['countDistinct_category'] = int(stats[4])
    res['numNulls_bestSalesCategory'] = int(stats[5])
    res['countDistinct_bestSalesCategory'] = int(stats[6])
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
Example #10
df3=df.rdd.map(lambda x: \
    (x.name,x.properties["hair"],x.properties["eye"])) \
    .toDF(["name","hair","eye"])
df3.printSchema()
df3.show()

df.withColumn("hair",df.properties.getItem("hair")) \
  .withColumn("eye",df.properties.getItem("eye")) \
  .drop("properties") \
  .show()

df.withColumn("hair",df.properties["hair"]) \
  .withColumn("eye",df.properties["eye"]) \
  .drop("properties") \
  .show()

from pyspark.sql.functions import explode
df.select(df.name,explode(df.properties)).show()

from pyspark.sql.functions import map_keys
df.select(df.name,map_keys(df.properties)).show()

from pyspark.sql.functions import map_values
df.select(df.name,map_values(df.properties)).show()

#from pyspark.sql.functions import explode,map_keys
#keysDF = df.select(explode(map_keys(df.properties))).distinct()
#keysList = keysDF.rdd.map(lambda x:x[0]).collect()
#print(keysList)

Example #11
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    first_item_ = product_data['categories'][0][0]
    procesed_ = product_data.withColumn(category_column, first_item_)
    null_ = F.when(procesed_.category == '',
                   None).otherwise(procesed_.category)
    procesed_ = procesed_.withColumn(category_column, null_)
    map_key = F.map_keys('salesRank')[0]
    procesed_ = procesed_.withColumn('bestSalesCategory', map_key)
    map_value = F.map_values('salesRank')[0]
    procesed_ = procesed_.withColumn('bestSalesRank', map_value)
    count_total, mean_bestSalesRank, variance_bestSalesRank = procesed_.agg(F.count('asin'), \
      F.mean('bestSalesRank'), F.variance('bestSalesRank')).collect()[0]

    countDistinct_category = procesed_.filter(procesed_["category"] != '')
    countDistinct_category = countDistinct_category.groupBy("category")
    countDistinct_category = countDistinct_category.agg(
        F.countDistinct("category")).count()

    sales = procesed_.select('bestSalesCategory').filter( \
        procesed_.bestSalesCategory.isNotNull())

    numNulls_bestSalesCategory = procesed_.agg(
        F.sum(F.isnull(procesed_[bestSalesCategory_column]).cast("int"))).collect()[0][0]
    numNulls_category = procesed_.agg(
        F.sum(F.isnull(procesed_[category_column]).cast("int"))).collect()[0][0]
    countDistinct_bestSalesCategory = sales.agg(
        F.countDistinct(procesed_.bestSalesCategory)).collect()[0][0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_bestSalesRank'] = mean_bestSalesRank
    res['variance_bestSalesRank'] = variance_bestSalesRank
    res['numNulls_category'] = numNulls_category
    res['countDistinct_category'] = countDistinct_category
    res['numNulls_bestSalesCategory'] = numNulls_bestSalesCategory
    res['countDistinct_bestSalesCategory'] = countDistinct_bestSalesCategory

    print(res)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res