Example #1
def test_aggregate(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )

    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    assert len(rows) == 1
    assert rows[0].total == 5
    assert rows[0].error == 0
Example #2
def test_total_share(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )

    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .repartition(2)
        .withColumn("pid", F.spark_partition_id())
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    assert len(rows) == 2
    assert {2, 3} == set(r.total for r in rows)
    assert all(r.error == 0 for r in rows)

    total_share = aggregates.groupBy().applyInPandas(
        lambda pdf: udf.total_share(
            args.batch_id,
            args.n_data,
            args.server_id,
            args.private_key_hex,
            args.shared_secret,
            args.public_key_hex_internal,
            args.public_key_hex_external,
            pdf,
        ),
        schema="payload binary, error int, total int",
    )
    total_share.show()

    rows = total_share.collect()
    assert len(rows) == 1
    assert len(rows[0].payload) > 0
    assert rows[0].total == 5
    assert rows[0].error == 0
Example #3
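# The snippet below assumes the following imports (a sketch inferred from the
# call sites; l_file, get_stopword_list, and get_sentiment_dict are helpers
# defined elsewhere in the original module):
import string

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, udf, unbase64
from pyspark.ml.feature import StopWordsRemover, Tokenizer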
def top_k_businesses(spark: SparkSession, k: int = 3, ascending: bool = False):
    df = spark.read.csv(l_file[1], header=True, sep="\t")
    
    #unbase
    df = df.withColumn("review_text", unbase64(df["review_text"]).cast("string"))

    #tokenize
    tokenizer = Tokenizer(inputCol="review_text", outputCol="temp")
    df = tokenizer.transform(df)
    df = df.drop("review_text")

    #stopwords
    stopwords = get_stopword_list()
    remover = StopWordsRemover(inputCol="temp", outputCol="words", stopWords=stopwords)
    df = remover.transform(df)
    df = df.drop("temp")
    df = df.select(df.business_id, explode(df.words))

    #punctuation
    commaRep = udf(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    df = df.withColumn("col", commaRep(df["col"]))

    #sentiment
    afinn = get_sentiment_dict()
    sentimentRep = udf(lambda x: afinn.get(x, 0))
    df = df.withColumn("col", sentimentRep(df["col"]))

    #summing on business
    df = df.groupBy("business_id")
    result = df.agg({'col': 'sum'}).orderBy("sum(col)", ascending=ascending).take(k)
    return result
Example #4
def verify1(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    output,
):
    """Decode a batch of shares"""
    click.echo("Running verify1")
    spark = spark_session()

    df = spark.read.json(input).select(
        "id",
        F.pandas_udf(
            partial(
                udf.verify1,
                batch_id,
                n_data,
                server_id,
                private_key_hex,
                b64decode(shared_secret),
                public_key_hex_internal,
                public_key_hex_external,
            ),
            returnType="binary",
        )(F.unbase64("payload")).alias("payload"),
    )
    valid = df.where("payload is not null")
    # NOTE: the invalid set could be written out, but that may require the work to be done twice
    valid.write.json(output, mode="overwrite")
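    # A sketch of writing out the rejected rows mentioned in the NOTE above
    # (the "-invalid" suffix is a hypothetical naming choice):
    # df.where("payload is null").write.json(output + "-invalid", mode="overwrite")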
Example #5
def test_verify1_bad_data(spark, root, args):
    # we can use server b's data
    a = spark.read.json(str(root / "server_a" / "raw"))
    b = spark.read.json(str(root / "server_b" / "raw"))
    df = spark.createDataFrame(a.union(b).collect() + [Row(id=None, payload=None)])

    actual = df.select(
        "id",
        F.pandas_udf(
            partial(
                udf.verify1,
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
            ),
            returnType="binary",
        )(F.unbase64("payload")).alias("expected_payload"),
    )
    actual.show()

    assert actual.where("expected_payload IS NOT NULL").count() == a.count()
    assert actual.where("expected_payload IS NULL").count() == b.count() + 1
Example #6
def detokenize(column):
    """
    Detokenize a column using unbase64.
    :param column: the column to detokenize
    :return: the base64-decoded column, cast to string
    """
    return fn.unbase64(column).cast("string")
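# A minimal usage sketch (assumes `fn` is the pyspark.sql.functions alias used
# above; the DataFrame and column name are illustrative):
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("aGVsbG8=",)], ["payload"])  # "hello", base64-encoded
df.select(detokenize(fn.col("payload")).alias("decoded")).show()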
Example #7
def verify2(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Verify a batch of SNIPs"""
    click.echo("Running verify2")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)
    df = (
        shares.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .select(
            "id",
            F.pandas_udf(
                partial(
                    udf.verify2,
                    batch_id,
                    n_data,
                    server_id,
                    private_key_hex,
                    b64decode(shared_secret),
                    public_key_hex_internal,
                    public_key_hex_external,
                ),
                returnType="binary",
            )("shares", "internal", "external").alias("payload"),
        )
    )
    valid = df.where("payload is not null")
    valid.write.json(output, mode="overwrite")
Example #8
def aggregate(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Generate an aggregate share from a batch of verified SNIPs"""
    click.echo("Running aggregate")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)

    args = [
        batch_id,
        n_data,
        server_id,
        private_key_hex,
        b64decode(shared_secret),
        public_key_hex_internal,
        public_key_hex_external,
    ]
    (
        shares.join(internal.withColumnRenamed("payload", "internal"), on="id")
        .join(external.withColumnRenamed("payload", "external"), on="id")
        .select(
            F.unbase64("payload").alias("shares"),
            F.unbase64("internal").alias("internal"),
            F.unbase64("external").alias("external"),
            F.spark_partition_id().alias("pid"),
        )
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(*args, pdf),
            schema="payload: binary, error: int, total: int",
        )
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.total_share(*args, pdf),
            schema="payload: binary, error: int, total: int",
        )
        .withColumn("payload", F.base64("payload"))
        .write.json(output, mode="overwrite")
    )
Example #9
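# The snippet below assumes the following imports and a module-level `spark`
# session (a sketch inferred from the call sites):
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime, unbase64
from pyspark.sql.types import FloatType, IntegerType, StringType, StructType, TimestampType

spark = SparkSession.builder.getOrCreate()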
def generate_frames(display_loaded=False, outputs=False):

    # Read csv files, specify column types and generate frames
    business_schema = StructType()\
        .add("business_id", StringType(), True)\
        .add("name", StringType(), True)\
        .add("address", StringType(), True)\
        .add("city", StringType(), True)\
        .add("state", StringType(), True)\
        .add("postal_code", StringType(), True)\
        .add("latitude", FloatType(), True)\
        .add("longitude", FloatType(), True)\
        .add("stars", FloatType(), True)\
        .add("review_count", IntegerType(), True)\
        .add("categories", StringType(), True)

    business_frame = spark.read.format('csv')\
        .options(header='true', delimiter="\t")\
        .schema(business_schema)\
        .load('./yelp_businesses.csv')

    top_review_schema = StructType() \
        .add("review_id", StringType()) \
        .add("user_id", StringType()) \
        .add("business_id", StringType()) \
        .add("review_text", StringType()) \
        .add("review_date", StringType()) # Will be converted to TimestampType

    top_review_frame = spark.read.format('csv')\
        .options(header='true', delimiter="\t") \
        .schema(top_review_schema) \
        .load('./yelp_top_reviewers_with_reviews.csv')

    # Decode review_text, convert review_date to TimestampType
    top_review_frame = top_review_frame\
        .withColumn('review_text', unbase64(top_review_frame.review_text).cast(StringType()))\
        .withColumn('review_date', from_unixtime(top_review_frame.review_date).cast(TimestampType()))

    friendship_graph_frame = spark.read.format('csv')\
        .options(header='true', inferSchema='true')\
        .load('./yelp_top_users_friendship_graph.csv')

    if outputs:
        to_console = "5a:\nData frame objects, showing the columns and types: \n"
        to_console += "Yelp Businesses: " + str(business_frame) + "\n"
        to_console += "Yelp Top Reviewers: " + str(top_review_frame) + "\n"
        to_console += "Yelp Top Users Friendship Graph: " + str(
            friendship_graph_frame) + "\n"
        print(to_console)

    # Display frames (for testing)
    if display_loaded:
        business_frame.show()
        top_review_frame.show()
        friendship_graph_frame.show()

    return business_frame, top_review_frame, friendship_graph_frame
Example #10
def read_df(uid):
    server_root = root / f"server_{uid}"
    raw = spark.read.json(str(server_root / "raw"))
    internal = spark.read.json(
        str(server_root / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(server_root / "intermediate" / "external" / "verify2")
    )
    return (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(
            internal.select("id", F.unbase64("payload").alias("internal")), on="id"
        )
        .join(
            external.select("id", F.unbase64("payload").alias("external")), on="id"
        )
    )
Example #11
def test_verify2(spark, root, args):
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify1")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify1")
    )

    actual = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        .select(
            "id",
            F.base64(
                F.pandas_udf(
                    partial(
                        udf.verify2,
                        args.batch_id,
                        args.n_data,
                        args.server_id,
                        args.private_key_hex,
                        args.shared_secret,
                        args.public_key_hex_internal,
                        args.public_key_hex_external,
                    ),
                    returnType="binary",
                )("shares", "internal", "external")
            ).alias("expected_payload"),
        )
    )

    expected = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )

    joined = actual.join(expected, on="id")
    assert joined.where("length(expected_payload) <> length(payload)").count() == 0
Example #12
def publish(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input_internal,
    input_external,
    output,
):
    """Generate a final aggregate."""
    click.echo("Running publish")

    spark = spark_session()
    (
        spark.read.json(input_internal)
        .withColumn("server", F.lit("internal"))
        .union(spark.read.json(input_external).withColumn("server", F.lit("external")))
        .withColumn("payload", F.unbase64("payload"))
        .groupBy()
        .pivot("server", ["internal", "external"])
        .agg(*[F.min(c).alias(c) for c in ["payload", "error", "total"]])
        .select(
            F.udf(lambda: str(uuid4()), returnType="string")().alias("id"),
            F.lit(datetime.utcnow().isoformat()).alias("timestamp"),
            F.pandas_udf(
                partial(
                    udf.publish,
                    batch_id,
                    n_data,
                    server_id,
                    private_key_hex,
                    b64decode(shared_secret),
                    public_key_hex_internal,
                    public_key_hex_external,
                ),
                returnType="array<int>",
            )("internal_payload", "external_payload").alias("payload"),
            F.col("internal_error").alias("error"),
            F.col("internal_total").alias("total"),
        )
        .write.json(output, mode="overwrite")
    )
Example #13
def test_publish(spark, tmp_path, root, args):
    expect = json.loads(next((root / "server_a" / "processed").glob("*")).read_text())
    internal = root / "server_a" / "intermediate" / "internal" / "aggregate"
    external = root / "server_a" / "intermediate" / "external" / "aggregate"
    df = (
        spark.read.json(str(internal))
        .withColumn("server", F.lit("internal"))
        .union(spark.read.json(str(external)).withColumn("server", F.lit("external")))
        .withColumn("payload", F.unbase64("payload"))
        .groupBy()
        .pivot("server", ["internal", "external"])
        .agg(
            F.min("payload").alias("payload"),
            F.min("error").alias("error"),
            F.min("total").alias("total"),
        )
        .select(
            F.pandas_udf(
                partial(
                    udf.publish,
                    args.batch_id,
                    args.n_data,
                    args.server_id,
                    args.private_key_hex,
                    args.shared_secret,
                    args.public_key_hex_internal,
                    args.public_key_hex_external,
                ),
                returnType="array<int>",
            )("internal_payload", "external_payload").alias("payload"),
            F.col("internal_error").alias("error"),
            F.col("internal_total").alias("total"),
        )
    )
    assert df.count() == 1
    row = df.first()
    assert row.payload == expect["payload"]
    assert row.error == 0
    assert row.total == 5
Example #14
def test_verify1(spark, root, args):
    df = spark.read.json(str(root / "server_a" / "raw"))
    df.show(vertical=True, truncate=100)

    actual = df.select(
        "id",
        F.base64(
            F.pandas_udf(
                partial(
                    udf.verify1,
                    args.batch_id,
                    args.n_data,
                    args.server_id,
                    args.private_key_hex,
                    args.shared_secret,
                    args.public_key_hex_internal,
                    args.public_key_hex_external,
                ),
                returnType="binary",
            )(F.unbase64("payload"))
        ).alias("expected_payload"),
    )

    expected = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify1")
    )

    joined = actual.join(expected, on="id")
    joined.show(vertical=True, truncate=100)

    # NOTE: Payloads are only the same if they are processed in a deterministic
    # order using the same context due to the pseudorandom seed. The CLI
    # application assumes the same server context across all of the rows in a
    # partition. However, the UDF approach will generate a new server context
    # for every row.
    assert joined.where("length(expected_payload) <> length(payload)").count() == 0
Example #15
def clean_MES(df):
    df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
    body_schema = StructType([
        StructField("dataItemType", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("value", StringType(), True),
    ])
    auth_method_schema = StructType([
        StructField("scope", StringType(), True),
        StructField("type", StringType(), True),
        StructField("issuer", StringType(), True),
        StructField("acceptingIpFilterRule", StringType(), True),
    ])
    body_value_schema = StructType([
        StructField("eventId", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("telemetryValue", StringType(), True),
        StructField("description", StringType(), True),
        StructField("dateTime", StringType(), True),
        StructField("componentName", StringType(), True),
        StructField("status", StringType(), True),
    ])
    return flatten_df(
        flatten_df(df_decoded.withColumn('Body', from_json(col('Body'), body_schema)))
        .drop(col('SystemProperties'))
        .withColumn(
            'SystemProperties_connectionAuthMethod',
            from_json(col('SystemProperties_connectionAuthMethod'), auth_method_schema),
        )
        .withColumn('Body_Value', from_json(col('Body_Value'), body_value_schema))
    )
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
#+--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    'encodedCustomer',
    unbase64(zSetEntriesEncodedStreamingDF.encodedCustomer).cast('string'))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn(
    'encodedCustomer',
    from_json('encodedCustomer', customerMessageSchema)).select(
        col('encodedCustomer.*')).createOrReplaceTempView('CustomerRecords')

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    'select customerName, email, phone, birthDay from CustomerRecords where birthDay is not null'
)

# TO-DO: from the emailAndBirthDayStreamingDF dataframe select the email and the birth year (using the split function)

# TO-DO: Split the birth year as a separate field from the birthday
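# A sketch of the two TO-DO steps above (mirrors the split(...) pattern used in
# the other examples in this collection):
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email",
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))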
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
redisServerStreamingDF.withColumn("value",from_json("value",redisMessageSchema))\
        .select(col('value.*')) \
        .createOrReplaceTempView("RedisData")

# Using spark.sql we can select any valid select statement from the spark view
zSetEntriesEncodedStreamingDF = spark.sql(
    "select key, zSetEntries[0].element as customer from RedisData")

zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "customer",
    unbase64(zSetEntriesEncodedStreamingDF.customer).cast("string"))

zSetDecodedEntriesStreamingDF\
    .withColumn("customer", from_json("customer", customerJSONSchema))\
    .select(col('customer.*'))\
    .createOrReplaceTempView("Customer")\

customerStreamingDF = spark.sql(
    "select accountNumber, location, birthDay from Customer where birthDay is not null"
)

relevantCustomerFieldsStreamingDF = customerStreamingDF.select(
    'accountNumber', 'location',
    split(customerStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))

# this takes the stream and "sinks" it to the console as it is updated one message at a time (null means the JSON parsing didn't match the fields in the schema):
redis_server_raw_df = spark\
    .readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:19092") \
    .option("subscribe", "redis-server") \
    .option("startingOffsets", "earliest") \
    .load()

redis_server_raw_df = redis_server_raw_df.selectExpr("cast(key as string) key", "cast(value as string) value")
redis_server_raw_df\
    .withColumn("value", from_json("value", redis_server_schema))\
    .select(col("value.*"))\
    .createOrReplaceTempView("RedisSortedSet")

customers_df = spark.sql("select key, zSetEntries[0].element as encodedCustomer from RedisSortedSet")
customers_df = customers_df.withColumn("encodedCustomer", unbase64(customers_df.encodedCustomer).cast("string"))

customers_df\
    .withColumn("encodedCustomer", from_json("encodedCustomer", customer_schema))\
    .select(col("encodedCustomer.*"))\
    .createOrReplaceTempView("CustomerRecords")

emailAndBirthDayStreamingDF = spark.sql("select email, birthDay from CustomerRecords "
                                        "where email is not null and birthDay is not null")
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF\
    .select("email", split(col("birthDay"), "-").getItem(0).alias("birthYear"))

stedi_events_raw_df = spark\
    .readStream\
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:19092") \
Example #19
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
redisServerStreamingDF.withColumn("value",from_json("value",redisMessageSchema))\
        .select(col('value.*')) \
        .createOrReplaceTempView("RedisData")

# Using spark.sql we can select any valid select statement from the spark view
zSetEntriesEncodedStreamingDF = spark.sql(
    "select key, zSetEntries[0].element as payment from RedisData")

zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "payment",
    unbase64(zSetEntriesEncodedStreamingDF.payment).cast("string"))

zSetDecodedEntriesStreamingDF\
    .withColumn("payment", from_json("payment", paymentJSONSchema))\
    .select(col('payment.*'))\
    .createOrReplaceTempView("Payment")\

paymentStreamingDF = spark.sql("select reservationId, amount from Payment")

# this takes the stream and "sinks" it to the console as it is updated one message at a time (null means the JSON parsing didn't match the fields in the schema):
# {"reservationId":"9856743232","amount":"946.88"}

paymentStreamingDF.selectExpr("CAST(reservationId AS STRING) AS key", "to_json(struct(*)) AS value")\
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092")\
# cast the value column in the streaming dataframe as a STRING
redis_server_streaming_df = redis_server_raw_streaming_df.selectExpr(
    "cast(key as string) key", "cast(value as string) value")

redis_server_streaming_df.withColumn(
    "value", from_json("value", redis_server_schema)).select(
        col('value.*')).createOrReplaceTempView("RedisSortedSet")

# execute a sql statement against a temporary view, which statement takes the element field from the 0th element in the array of structs and create a column called encodedCustomer
# the reason we do it this way is that the syntax available select against a view is different than a dataframe, and it makes it easy to select the nth element of an array in a sql column
encoded_streaming_df = spark.sql(
    "select key, zSetEntries[0].element as customer from RedisSortedSet")

decoded_streaming_df = encoded_streaming_df.withColumn(
    "customer",
    unbase64(encoded_streaming_df.customer).cast("string"))

# parse the JSON in the Customer record and store in a temporary view called CustomerRecords
decoded_streaming_df.withColumn(
    "customer", from_json("customer", customer_json_schema)).select(
        col('customer.*')).createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
email_bday_streaming_df = spark.sql(
    "select email, birthDay from CustomerRecords where birthDay is not null")

# Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
email_birth_year_streaming_df = email_bday_streaming_df.select(
    'email',
    split(email_bday_streaming_df.birthDay, "-").getItem(0).alias("birthYear"))
Example #21
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
encodedCustomerDF = encodedCustomerDF.withColumn(
    "encodedCustomer", unbase64(encodedCustomerDF.encodedCustomer).cast("string"))
# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
encodedCustomerDF\
    .withColumn("encodedCustomer", from_json("encodedCustomer", redisCustomerMessageSchema))\
    .select(col('encodedCustomer.*'))\
    .createOrReplaceTempView("CustomerRecords")


# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthday from CustomerRecords where email is not null and birthDay is not null")

# TO-DO: Split the birth year as a separate field from the birthday
emailAndBirthDayStreamingDF = emailAndBirthDayStreamingDF.withColumn(
    "birthYear", split("CustomerRecords.birthDay", "-").getItem(0).alias("birthYear"))
Example #22
# TO-DO: take the encodedCustomer column which is base64 encoded at first like this:
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
df = encDf.withColumn("customer", unbase64(encDf.customer).cast("string"))

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
df.withColumn("customer", from_json("customer", customersSchema)).select(
    col("customer.*")).createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null and birthDay is not null"
)

# TO-DO: Split the birth year as a separate field from the birthday
# TO-DO: Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email",
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))
Example #23
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and converting it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
#+--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
decodedCustomer = encodedCustomer.withColumn(
    "customer",
    unbase64(encodedCustomer.customer).cast("string"))

# Parsing the JSON in the Customer record and store in a temporary view called CustomerRecords
decodedCustomer\
    .withColumn("customer", from_json("customer",customerInfoJSONschema))\
    .select(col("customer.*"))\
    .createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords where email is not null")

# from the emailAndBirthDayStreamingDF dataframe select the email and the birth year (using the split function)

# Splitting the birth year as a separate field from the birthday
# Selecting only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
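# A sketch of the selection described above (mirrors the split(...) pattern used
# in the other examples in this collection):
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select(
    "email",
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))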
#TO-DO: using the redisMessageSchema StructType, deserialize the JSON from the streaming dataframe
redisServerStreamingDF = redisServerStreamingDF.withColumn('value', from_json('value', redisMessageSchema)) \
    .select(col('value.*'))

# TO-DO: create a temporary streaming view called "RedisData" based on the streaming dataframe
# it can later be queried with spark.sql
redisServerStreamingDF.createOrReplaceTempView('RedisData')

#TO-DO: using spark.sql, select key, zSetEntries[0].element as customerLocation from RedisData
zSetEntriesEncodedStreamingDF = spark.sql(
    'select key, zSetEntries[0].element as customerLocation from RedisData')

#TO-DO: from the dataframe use the unbase64 function to select a column called customerLocation with the base64 decoded JSON, and cast it to a string
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    'customerLocation',
    unbase64(zSetEntriesEncodedStreamingDF.customerLocation).cast('string'))

#TO-DO: using the customer location StructType, deserialize the JSON from the streaming dataframe, selecting column customerLocation.* as a temporary view called CustomerLocation
zSetEntriesDecodedStreamingDF\
    .withColumn('customerLocation', from_json('customerLocation', customerLocationSchema)) \
    .select(col('customerLocation.*')) \
    .createOrReplaceTempView('CustomerLocation')

#TO-DO: using spark.sql select * from CustomerLocation
customerLocationStreamingDF = spark.sql('select * from CustomerLocation')

# TO-DO: write the stream to the console, and configure it to run indefinitely, the console output will look something like this:
# +-------------+---------+
# |accountNumber| location|
# +-------------+---------+
# |         null|     null|
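# A sketch of the console sink described in the TO-DO above (same outputMode/format
# pattern as the other writeStream examples in this collection):
customerLocationStreamingDF.writeStream\
    .outputMode("append")\
    .format("console")\
    .start()\
    .awaitTermination()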
Example #25
# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
# +--------------------+
#
# with this JSON format:
# {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}

customerEncodedEntriesStreamingDF = spark.sql("""
    select zSetEntries[0].element as encodedCustomer from RedisSortedSet
    """)

customerDecodedEntriesStreamingDF = customerEncodedEntriesStreamingDF \
    .withColumn("customer", F.unbase64(customerEncodedEntriesStreamingDF.encodedCustomer).cast("string"))

customerDecodedEntriesStreamingDF.withColumn("customer", F.from_json("customer", redisCustomerSchema)) \
    .select(F.col("customer.*")) \
    .createOrReplaceTempView("CustomerRecords")

emailAndBirthDayStreamingDF = spark.sql("""
    select * from CustomerRecords where email is not null AND birthDay is not null
""")

# parse the JSON in the Customer record and store in a temporary view called CustomerRecords

# JSON parsing will set non-existent fields to null, so let's
# select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF

customerEncodedEntriesStreamingDF = spark.sql("""
Example #26
    .option("startingOffsets", "earliest") \
    .load()

redis_server_raw_df = redis_server_raw_df.selectExpr(
    "cast(key as string) key", "cast(value as string) value")
redis_server_raw_df\
    .withColumn("value", from_json("value", redis_server_schema))\
    .select(col("value.*"))\
    .createOrReplaceTempView("RedisSortedSet")

customers_df = spark.sql(
    "select key, zSetEntries[0].element as encodedCustomer from RedisSortedSet"
)
customers_df = customers_df.withColumn(
    "encodedCustomer",
    unbase64(customers_df.encodedCustomer).cast("string"))

customers_df\
    .withColumn("encodedCustomer", from_json("encodedCustomer", customer_schema))\
    .select(col("encodedCustomer.*"))\
    .createOrReplaceTempView("CustomerRecords")

emailAndBirthDayStreamingDF = spark.sql(
    "select email, birthDay from CustomerRecords "
    "where email is not null and birthDay is not null")

emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF\
    .select("email", split(col("birthDay"), "-").getItem(0).alias("birthYear"))
emailAndBirthYearStreamingDF.writeStream.format("console").outputMode(
    "append").start().awaitTermination()
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
#+--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF\
    .withColumn("encodedCustomer", unbase64(zSetEntriesEncodedStreamingDF.encodedCustomer).cast("string"))
# Parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn(
    "decodedCustomer",
    from_json("encodedCustomer", customerJSONSchema)).select(
        col("decodedCustomer.*")).createOrReplaceTempView("CustomerRecords")

# JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql(
    "SELECT customerName, email, birthDay FROM CustomerRecords WHERE birthDay IS NOT NULL"
)

# Split the birth year as a separate field from the birthday
relevantEmailAndBirthDayStreamingDF = emailAndBirthDayStreamingDF.select(
    'customerName', 'email', 'birthDay',
    split(emailAndBirthDayStreamingDF.birthDay, "-").getItem(0).alias("birthYear"))
# TO-DO: take the encodedCustomer column which is base64 encoded at first like this:
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
#+--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}
zSetEntriesDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn("customer", unbase64(zSetEntriesEncodedStreamingDF.customer).cast("string"))
# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords
zSetEntriesDecodedStreamingDF.withColumn("customer", from_json("customer", customerJSONSchema)).select(col("customer.*")).createOrReplaceTempView("CustomerRecords")
# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF
emailAndBirthDayStreamingDF = spark.sql("select email, birthDay from CustomerRecords where email is not null and birthDay is not null")
# TO-DO: Split the birth year as a separate field from the birthday
# TO-DO: Select only the birth year and email fields as a new streaming data frame called emailAndBirthYearStreamingDF
emailAndBirthYearStreamingDF = emailAndBirthDayStreamingDF.select("email", split("birthDay", "-").getItem(0).alias("birthYear"))
# TO-DO: using the spark application object, read a streaming dataframe from the Kafka topic stedi-events as the source
# Be sure to specify the option that reads all the events from the topic including those that were published before you started the spark stream
stediEventsRawStreamingDF = spark.readStream.format("kafka")\
                                            .option("kafka.bootstrap.servers", "kafka:19092")\
                                            .option("subscribe", "stedi-events")\
                                            .option("startingOffsets", "earliest")\
                                            .load()
# TO-DO: cast the value column in the streaming dataframe as a STRING 
    .options("subscribe", "redis-server")
    .options("startingOffsets", "earliest")\
    .load()

# TO-DO: cast the value column in the streaming dataframe as a STRING 

RedisServerStreamingDF = redisServerRawStreamingDF.selectExpr("cast(key as String) key", "cast(value as string) value")


RedisServerStreamingDF.withColumn("value",from_json("value", redisMessageSchema))\
    .select("value.*")\
    .createOrReplaceTempView("RedisData")

zSetEntriesEncodedStreamingDF = spark.sql("select key, zSetEntries[0].element as redisEvent from RedisData")

zSetDecodedEntriesStreamingDF1 = zSetEntriesEncodedStreamingDF.withColumn("redisEvent", unbase64(zSetEntriesEncodedStreamingDF.redisEvent).cast("string"))


# TO-DO: parse the single column "value" with a json object in it, like this:
# +------------+
# | value      |
# +------------+
# |{"key":"Q3..|
# +------------+
#
# with this JSON format: {"key":"Q3VzdG9tZXI=",
# "existType":"NONE",
# "Ch":false,
# "Incr":false,
# "zSetEntries":[{
# "element":"eyJjdXN0b21lck5hbWUiOiJTYW0gVGVzdCIsImVtYWlsIjoic2FtLnRlc3RAdGVzdC5jb20iLCJwaG9uZSI6IjgwMTU1NTEyMTIiLCJiaXJ0aERheSI6IjIwMDEtMDEtMDMifQ==",
Example #30
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+

# and convert it to clear json like this:
# +--------------------+
# |            customer|
# +--------------------+
# |{"customerName":"...|
#+--------------------+
#
# with this JSON format: {"customerName":"Sam Test","email":"*****@*****.**","phone":"8015551212","birthDay":"2001-01-03"}

decoded = unbase64(
    zSetEntriesEncodedStreamingDF.encodedCustomer).cast("string")

zSetDecodedEntriesStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "encodedCustomer", decoded)

# TO-DO: parse the JSON in the Customer record and store in a temporary view called CustomerRecords

zSetDecodedEntriesStreamingDF\
    .withColumn("encodedCustomer", from_json("encodedCustomer", customerSchema))\
    .select(col('encodedCustomer.*'))\
    .createOrReplaceTempView("CustomerRecords")

# TO-DO: JSON parsing will set non-existent fields to null, so let's select just the fields we want, where they are not null as a new dataframe called emailAndBirthDayStreamingDF

emailAndBirthDayStreamingDF = spark.sql(
    "select customerName, email, birthDay from CustomerRecords where email IS NOT NULL and birthDay IS NOT NULL"