def test2(spark):
    """
    Stream to stream join based on a timestamp range.

    Two Pravega streams ("sensors" and "sensors2") are parsed from JSON,
    watermarked on their event timestamp, and joined where the device ids
    match and the first timestamp falls within the 2 seconds following the
    second timestamp. The joined rows are printed to the console.
    """
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'
    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    def read_sensor_stream(stream_name, suffix):
        """Read one sensor stream and suffix its join columns with `suffix`."""
        raw = (spark.readStream
               .format("pravega")
               .option("controller", controller)
               .option("scope", scope)
               .option("stream", stream_name)
               .load())
        parsed = (raw
                  .withColumnRenamed('event', 'raw_event')
                  .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
                  .select('*', from_json('event_string', schema=schema).alias('event'))
                  .select('*', 'event.*')
                  .drop('raw_event', 'event_string', 'event')
                  .withWatermark('timestamp', '60 second'))
        # Rename only after the watermark has been declared on 'timestamp'.
        return (parsed
                .withColumnRenamed('device_id', 'device_id_' + suffix)
                .withColumnRenamed('timestamp', 'timestamp_' + suffix))

    left = read_sensor_stream("sensors", '1')
    right = read_sensor_stream("sensors2", '2')

    join_condition = expr('device_id_1 = device_id_2 and '
                          'timestamp_1 >= timestamp_2 and '
                          'timestamp_1 < timestamp_2 + interval 2 second')
    joined = left.join(right, join_condition)
    joined.printSchema()

    query = (joined.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .start())
    query.awaitTermination()
def test2(spark):
    """
    Read chunk-encoded video events from the Pravega "video" stream and, for
    every micro-batch, print the batch id and the first 20 bytes of the first
    row's `data` field.

    Args:
        spark: The active SparkSession.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    df = (spark.readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '60 second')

    def f(batch_df, batch_id):
        """Print a short preview of the first image in the micro-batch."""
        print('batch_id=%d' % batch_id)
        rows = batch_df.select('data').limit(1).collect()
        # A micro-batch may contain no rows, and from_json (permissive mode)
        # yields a null `data` for events it cannot parse; neither case
        # should terminate the streaming query.
        if not rows or rows[0][0] is None:
            print('png0=<no data>')
            return
        png0 = rows[0][0]
        print('png0=%s' % png0[0:20])
        # IPython.display.clear_output(wait=True)
        # IPython.display.display(IPython.display.Image(data=png0))

    query = (df.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .foreachBatch(f)
             .start())
    query.awaitTermination()
def test11(spark):
    """
    Read chunked video records from the Pravega "video" stream, parse the
    JSON payload in FAILFAST mode, and print the parsed rows to the console.
    """
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, chunk int, num_chunks int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    reader = (spark.readStream
              .format("pravega")
              .option("controller", controller)
              .option("scope", scope)
              .option("stream", "video"))
    events = reader.load()

    # Decode JSON event.
    parsed_event = from_json('event_string',
                             schema=schema,
                             options=dict(mode='FAILFAST')).alias('event')
    events = (events
              .withColumnRenamed('event', 'raw_event')
              .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
              .select('*', parsed_event)
              .select('*', 'event.*'))

    events.printSchema()

    writer = (events.writeStream
              .trigger(processingTime='3 seconds')  # limit trigger rate
              .outputMode('append')
              .format('console')
              .option('truncate', 'false'))
    writer.start().awaitTermination()
def test(spark):
    """
    This demonstrates reading JSON events from Pravega.

    Events from the "sensors" stream are decoded, parsed against the sensor
    schema, watermarked on `timestamp`, and printed to the console.
    """
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION', '/tmp/spark_checkpoints_test_sensor_processor')

    source = (spark.readStream
              .format("pravega")
              .option("controller", controller)
              .option("scope", scope)
              .option("stream", "sensors")
              .load())

    sensors = (source
               .withColumnRenamed('event', 'raw_event')
               .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
               .select('*', from_json('event_string', schema=schema).alias('event'))
               .select('*', 'event.*')
               .drop('raw_event', 'event_string', 'event')
               .withWatermark('timestamp', '60 second'))

    sensors.printSchema()

    query = (sensors.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .option('checkpointLocation', checkpoint_location)
             .start())
    query.awaitTermination()
def main():
    """
    Read sensor values from a Pravega stream, randomly reorder them, and write to JSON files.
    These JSON files can then be used to train a machine learning model.
    """
    print(sys.version)

    spark = SparkSession.builder.appName('test1').getOrCreate()
    for key, value in (('spark.sql.shuffle.partitions', '2'),
                       ('spark.sql.execution.arrow.enabled', 'true')):
        spark.conf.set(key, value)

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    output_dir = '/tmp/sensor_training_data'
    schema = 'timestamp timestamp, event_type string, device_id string, temp_celsius double'

    # Batch (not streaming) read of the stream.
    reader = (spark.read
              .format("pravega")
              .option("controller", controller)
              .option("scope", scope)
              .option("stream", "sensors"))
    # reader = reader.option("encoding", "chunked_v1")
    raw = reader.load()

    sensors = (raw
               .withColumnRenamed('event', 'raw_event')
               .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
               .select('*', from_json('event_string', schema=schema).alias('event'))
               .select('*', 'event.*')
               .drop('raw_event', 'event_string', 'event'))

    # sensors.limit(5).show()

    # Ordering by a seeded random column shuffles the rows.
    shuffled = sensors.orderBy(rand(seed=1))
    shuffled.write.mode('overwrite').format('json').save(output_dir)
def test14(spark):
    """
    Read chunk-encoded video events from Pravega, split each payload into a
    4-byte checksum prefix and the remaining data, verify the CRC-32, and
    print the result (without the bulky binary columns) to the console.
    """
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    frames = (spark.readStream
              .format("pravega")
              .option("controller", controller)
              .option("scope", scope)
              .option("stream", "video")
              .option("encoding", "chunked_v1")
              .load())

    # Decode JSON event.
    parsed_event = from_json('event_string',
                             schema=schema,
                             options=dict(mode='FAILFAST')).alias('event')
    frames = (frames
              .withColumnRenamed('event', 'raw_event')
              .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
              .select('*', parsed_event)
              .select('*', 'event.*')
              .withWatermark('timestamp', '60 second'))

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        # The first four bytes hold the checksum.
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        # Everything after the checksum prefix is the payload.
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        # The checksum is unpacked as a big-endian unsigned 32-bit integer.
        (stored,) = struct.unpack('!I', checksum)
        actual = zlib.crc32(data)
        # print('expected=%d, calculated=%d' % (stored, actual))
        return stored == actual

    frames = frames.withColumnRenamed('data', 'checksum_and_data')
    frames = frames.select(
        '*',
        parse_checksum('checksum_and_data').alias('checksum'),
        parse_data('checksum_and_data').alias('data'))
    frames = frames.select(
        '*',
        is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    frames = frames.select('*', length('data'))
    frames = frames.drop('raw_event', 'event_string', 'event',
                         'checksum_and_data', 'data')

    frames.printSchema()

    query = (frames.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .start())
    query.awaitTermination()
def main(topic):
    """
    Fit a running simple linear regression (y = alpha + beta * x) over
    space-separated "x y" messages read from the Kafka topic `topic`, and
    print the current estimates to the console for up to 600 seconds.
    """
    # get stream
    messages = (spark.readStream
                .format('kafka')
                .option('kafka.bootstrap.servers',
                        '199.60.17.210:9092,199.60.17.193:9092')
                .option('subscribe', topic)
                .load())

    # get values and decode
    df_base = messages.select(
        functions.decode(messages['value'], 'utf-8').alias('value'))

    # split dataframe column to (x, y)
    parts = functions.split(df_base['value'], ' ')
    df_xy = (df_base
             .withColumn('x', parts.getItem(0))
             .withColumn('y', parts.getItem(1)))

    # compute x, y, xy, x^2
    x_col = df_xy['x']
    y_col = df_xy['y']
    df_main = df_xy.select(
        x_col,
        y_col,
        (x_col * y_col).alias('xy'),
        (x_col * x_col).alias('x_sq'))

    # compute sigma values (n, x, y, xy, x^2)
    df_sigmas = df_main.select(
        functions.count(df_main['x']).alias('n'),
        functions.sum(df_main['x']).alias('x'),
        functions.sum(df_main['y']).alias('y'),
        functions.sum(df_main['xy']).alias('xy'),
        functions.sum(df_main['x_sq']).alias('x_sq'))

    # compute value for beta
    n = df_sigmas['n']
    sum_x = df_sigmas['x']
    sum_y = df_sigmas['y']
    numerator = df_sigmas['xy'] - (1 / n) * (sum_x * sum_y)
    denominator = df_sigmas['x_sq'] - (1 / n) * (sum_x * sum_x)
    df_Beta = df_sigmas.select(
        df_sigmas['*'],
        (numerator / denominator).alias('beta'))

    # compute value for alpha
    mean_y = df_Beta['y'] / df_Beta['n']
    mean_x = df_Beta['x'] / df_Beta['n']
    df_result = df_Beta.select(
        df_Beta['beta'],
        (mean_y - (df_Beta['beta'] * mean_x)).alias('alpha'))

    # write to output
    stream = (df_result.writeStream
              .outputMode("complete")
              .format("console")
              .start())
    stream.awaitTermination(600)
def test1(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of 2 GiB.
    This runs out of memory because the non-Pandas runner uses fixed batches of 100.

    Args:
        spark: The active SparkSession.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    df = (spark.readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    # df = df.withWatermark('timestamp', '60 second')

    @udf(returnType=DoubleType())
    def defect_probability(data):
        """Calculate the probability of a defect.

        Returns None (a SQL null) when there is no image data or the bytes
        cannot be decoded as an image.
        """
        # from_json runs in permissive mode, so `data` is null for events
        # that could not be parsed.
        if data is None:
            return None
        # Decode the image. np.frombuffer views the bytes as uint8 without
        # copying and accepts both bytes and bytearray.
        rgb = cv2.imdecode(np.frombuffer(data, dtype='uint8'), -1)
        # cv2.imdecode signals an undecodable buffer by returning None.
        if rgb is None:
            return None
        # Perform a computation on the image to determine the probability of a defect.
        # For now, we just calculate the mean pixel value.
        # We can use any Python library, including NumPy and TensorFlow.
        p = rgb.mean() / 255.0
        return float(p)

    df = df.select('*', defect_probability('data').alias('defect_probability'))
    df = df.drop('raw_event', 'event_string', 'event', 'data')

    df.printSchema()

    query = (df.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .start())
    query.awaitTermination()
def load_images(filenames_pattern, train_size=1.):
    """
    Using Spark's built-in data source for images, load the matching files
    into a DataFrame with one flat column per image attribute, so that it can
    be passed to a statistics pipeline.

    The `image` struct produced by the data source is flattened into the
    top-level columns origin, height, width, nChannels, mode and data. The
    `data` column is kept as the raw binary pixel buffer: decoding it as
    UTF-8 text would corrupt it.

    As a side effect, summary statistics and the schema of the result are
    printed.

    Args:
        filenames_pattern : A string representing the path pattern of the images
        train_size : float number representing the train size. Currently
            unused; the full DataFrame is always returned. A caller that
            needs a split can apply
            DataFrame.randomSplit([train_size, 1 - train_size]) to the result.

    Returns:
        The flattened image DataFrame.
    """
    struct_keys = ["origin", "height", "width", "nChannels", "mode", "data"]
    df = spark.read.load(filenames_pattern, format="image")

    # Alias every extracted field to its bare name. Without the alias Spark
    # names the columns "image.<field>", which cannot be reached through
    # attribute access and is awkward to reference downstream.
    flat_cols = [df["image"].getField(key).alias(key) for key in struct_keys]
    images = df.select(*flat_cols)

    images.describe().show()
    images.printSchema()
    return images
def clean_MES(df):
    """
    Decode and flatten MES messages.

    The base64-encoded `Body` column is decoded to text and parsed as JSON,
    the frame is flattened, the JSON strings held in
    `SystemProperties_connectionAuthMethod` and `Body_Value` are parsed in
    turn, and the frame is flattened once more.
    """
    body_schema = StructType([
        StructField("dataItemType", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("value", StringType(), True),
    ])
    auth_method_schema = StructType([
        StructField("scope", StringType(), True),
        StructField("type", StringType(), True),
        StructField("issuer", StringType(), True),
        StructField("acceptingIpFilterRule", StringType(), True),
    ])
    body_value_schema = StructType([
        StructField("eventId", StringType(), True),
        StructField("assetId", StringType(), True),
        StructField("telemetryValue", StringType(), True),
        StructField("description", StringType(), True),
        StructField("dateTime", StringType(), True),
        StructField("componentName", StringType(), True),
        StructField("status", StringType(), True),
    ])

    # First pass: decode the body and expand it into flat columns.
    decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8'))
    with_body = decoded.withColumn('Body', from_json(col('Body'), body_schema))
    first_pass = flatten_df(with_body).drop(col('SystemProperties'))

    # Second pass: parse the nested JSON strings, then flatten again.
    with_auth = first_pass.withColumn(
        'SystemProperties_connectionAuthMethod',
        from_json(col('SystemProperties_connectionAuthMethod'),
                  auth_method_schema))
    with_value = with_auth.withColumn(
        'Body_Value',
        from_json(col('Body_Value'), body_value_schema))
    return flatten_df(with_value)
def test_create_thumbnails(spark):
    """
    This demonstrates reading large images from Pravega, grouping them by
    frame number, and writing the combined result back to Pravega.

    Events are read from the "video" stream using chunked encoding, grouped
    by `frame_number`, passed through a grouped-map Pandas UDF, serialized as
    JSON, and written to the "combinedvideo" stream.

    The checkpoint directory (CHECKPOINT_LOCATION) is deleted on start-up so
    every run starts from a clean state.

    Args:
        spark: The active SparkSession.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION',
        '/tmp/spark_checkpoints_test_video_and_sensor_processor')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (spark.readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          # .option("start_stream_cut", "earliest")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    df = df.withWatermark('timestamp', '1 second')
    df = df.drop('raw_event', 'event_string', 'event')

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'frame_number',
    )

    @pandas_udf(returnType='frame_number int, data binary',
                functionType=PandasUDFType.GROUPED_MAP)
    def combine_thumbnails(df):
        """Input is a Pandas dataframe with 1 row per camera and frame.
        Output should be a Pandas dataframe with 1 row per frame."""
        # Currently a pass-through that only logs the group it receives.
        print(f'combine_thumbnails: s={df}')
        df.info(verbose=True)
        return df[['frame_number', 'data']]

    df = grp.apply(combine_thumbnails)
    df = df.select(
        func.to_json(func.struct(df["frame_number"], df["data"])).alias("event"))

    df.printSchema()

    # Flip to True to inspect the output on the console instead of writing
    # it to Pravega.
    write_to_console = False
    if write_to_console:
        (df.writeStream
         .trigger(processingTime='3 seconds')  # limit trigger rate
         .outputMode('append')
         .format('console')
         .option('truncate', 'true')
         .option('checkpointLocation', checkpoint_location)
         .start()
         .awaitTermination())
    else:
        # Use the same checkpoint directory that was wiped above; a separate
        # hard-coded path would keep stale state from previous runs.
        (df.writeStream
         .trigger(processingTime="3 seconds")
         .outputMode("append")
         .format("pravega")
         .option("controller", controller)
         .option("scope", scope)
         .option("stream", "combinedvideo")
         .option("checkpointLocation", checkpoint_location)
         .start()
         .awaitTermination())
def test_detect_defect(spark):
    """
    This demonstrates reading large images from Pravega and detecting defects.
    The data field contains a base-64 encoded PNG image file.
    It uses chunked encoding to support events of 2 GiB.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv(
        'CHECKPOINT_LOCATION',
        '/tmp/spark_checkpoints_test_video_and_sensor_processor')

    reader = (spark.readStream
              .format("pravega")
              .option("controller", controller)
              .option("scope", scope)
              .option("stream", "video")
              .option("encoding", "chunked_v1"))
    # reader = reader.option("start_stream_cut", "earliest")
    frames = reader.load()

    frames = (frames
              .withColumnRenamed('event', 'raw_event')
              .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
              .select('*', from_json('event_string', schema=schema).alias('event'))
              .select('*', 'event.*')
              .select('*', length('data'))
              .withWatermark('timestamp', '60 second'))

    @pandas_udf(returnType=DoubleType(), functionType=PandasUDFType.SCALAR)
    def defect_probability(s):
        """Calculate the probability of a defect."""

        def mean_brightness(data):
            # Decode the image.
            pixels = cv2.imdecode(np.frombuffer(data, dtype='uint8'), -1)
            # Perform a computation on the image to determine the probability of a defect.
            # For now, we just calculate the mean pixel value.
            # We can use any Python library, including NumPy and TensorFlow.
            return pixels.mean() / 255.0

        return s.apply(mean_brightness)

    frames = frames.select(
        '*', defect_probability('data').alias('defect_probability'))
    frames = frames.drop('raw_event', 'event_string', 'event', 'data')

    frames.printSchema()

    query = (frames.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .option('checkpointLocation', checkpoint_location)
             .start())
    query.awaitTermination()
nested_cols = [c[0] for c in df.dtypes if c[1][:6] == "struct"] columns.extend(flat_cols) for nested_col in nested_cols: projected_df = df.select(nested_col + ".*") stack.append((parents + (nested_col, ), projected_df)) return nested_df.select(columns) # COMMAND ---------- from pyspark.sql.functions import unbase64, lit, decode, from_json, col from pyspark.sql.types import StringType, MapType, StructType, StructField df_decoded = df.withColumn('Body', decode(unbase64(df.Body), 'utf-8')) df_decoded_flat = flatten_df( df_decoded.withColumn( 'Body', from_json( col('Body'), StructType([ StructField("dataItemType", StringType(), True), StructField("assetId", StringType(), True), StructField("value", StringType(), True) ], )))).drop(col('SystemProperties')).withColumn( 'SystemProperties_connectionAuthMethod', from_json( col('SystemProperties_connectionAuthMethod'), StructType([ StructField("scope", StringType(), True),
def run(spark):
    """
    This is an attempt at combining multiple video sources into a grid of images.

    WARNING: This is broken because Spark is not maintaining the time order of the images.
    This file has been superseded by the Flink/Java class MultiVideoGridJob in the flinkprocessor directory.

    Frames are read from the "video" stream, bucketed into discrete time
    slots (`fps` slots per second), scaled to thumbnails, combined per time
    slot into one PNG grid, and written to the "combinedvideo" stream.

    Args:
        spark: The active SparkSession.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv('CHECKPOINT_LOCATION',
                                    '/tmp/spark_checkpoints_multi_video_grid')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (spark.readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          # .option("start_stream_cut", "earliest")
          .load())
    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))

    # Round each timestamp down to the start of its 1/fps-second slot so
    # frames from different cameras fall into the same group.
    fps = 2.0
    df = df.selectExpr(
        '*',
        f'timestamp(floor(cast(timestamp as double) * {fps}) / {fps}) as discrete_timestamp'
    )
    df = df.withWatermark('discrete_timestamp', '5 second')
    df = df.drop('raw_event', 'event_string', 'event')

    thumbnail_size = (84, 84)
    # Thumbnails travel between the two UDFs as raw pixel bytes, so both
    # sides must agree on the pixel mode.
    image_mode = 'RGB'

    @pandas_udf(returnType='binary', functionType=PandasUDFType.SCALAR)
    def decode_and_scale_image(data_series, ssrc):
        """Decode each encoded image and return raw thumbnail pixel bytes."""

        def f(data):
            in_pil = Image.open(io.BytesIO(data))
            # Normalize the mode before resizing: a source image that is not
            # RGB (e.g. RGBA, grayscale, palette) would otherwise produce a
            # byte count that Image.frombytes(image_mode, ...) rejects.
            out_pil = in_pil.convert(image_mode).resize(thumbnail_size)
            return out_pil.tobytes()

        return data_series.apply(f)

    df = df.select(
        '*', decode_and_scale_image(df['data'], df['ssrc']).alias('image'))
    df = df.select(
        '*',
        func.to_json(
            func.struct(df['discrete_timestamp'], df['frame_number'],
                        df['camera'])).alias('json'))

    df = df.repartition(1)

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'discrete_timestamp',
    )

    output_columns = ['timestamp', 'frame_number', 'ssrc', 'data', 'source']

    @pandas_udf(
        returnType=
        'timestamp timestamp, frame_number int, ssrc int, data binary, source string',
        functionType=PandasUDFType.GROUPED_MAP)
    def combine_images_into_grid(df):
        """Combine the thumbnails of one time slot into a single PNG grid.

        Input is a Pandas dataframe with one row per camera frame in the
        slot; output is a Pandas dataframe with a single row.
        """
        # TODO: This Pandas UDF provides incorrect results because it is called before the aggregation is finalized by the watermark.
        if df.empty:
            # A grouped-map UDF must return a DataFrame, never None.
            return pd.DataFrame(columns=output_columns)
        row0 = df.iloc[0]
        num_cameras = df.camera.max() + 1
        # Determine number of images per row and column.
        grid_count = math.ceil(math.sqrt(num_cameras))
        image_width, image_height = thumbnail_size
        margin = 1
        status_width = 0

        # Create blank output image, gray background.
        out_pil = Image.new(
            'RGB',
            ((image_width + margin) * grid_count - margin + status_width,
             (image_height + margin) * grid_count - margin),
            (128, 128, 128))

        # Add images from each camera
        for _, r in df.iterrows():
            # in_pil = Image.open(io.BytesIO(r['image']))
            in_pil = Image.frombytes(image_mode, (image_width, image_height),
                                     r['image'])
            x = int(r['camera'] % grid_count) * (image_width + margin)
            # Rows advance by the image height (not the width).
            y = int(r['camera'] // grid_count) * (image_height + margin)
            out_pil.paste(in_pil, (x, y))

        # font = ImageFont.truetype('/usr/share/fonts/truetype/freefont/FreeSans.ttf', font_size)
        # draw = ImageDraw.Draw(img)
        # draw.text((status_width, 0), 'FRAME\n%05d\nCAMERA\n %03d' % (frame_number, camera), font=font, align='center')

        out_bytesio = io.BytesIO()
        out_pil.save(out_bytesio, format='PNG', compress_level=0)
        out_bytes = out_bytesio.getvalue()

        new_row = {
            'timestamp': row0['discrete_timestamp'],
            'frame_number': 0,
            'ssrc': 0,
            'data': out_bytes,
            'source': df[['camera', 'frame_number', 'timestamp']].to_json(),
        }
        return pd.DataFrame([new_row], columns=output_columns)

    df = grp.apply(combine_images_into_grid)
    df = df.select(
        func.to_json(func.struct(df["frame_number"], df["data"])).alias("event"))

    # df = grp.agg(func.collect_list('json'))
    # df = df.selectExpr('*', '0 as ssrc')
    # window = Window.partitionBy('ssrc').orderBy('discrete_timestamp').rowsBetween(Window.unboundedPreceding, Window.currentRow)
    # df = df.select('*', func.row_number().over(window))

    # TODO: Output rows are not written in timestamp order. How can this be fixed?
    # Below gives error: Sorting is not supported on streaming DataFrames/Datasets, unless it is on aggregated DataFrame/Dataset in Complete output mode
    # df = df.sortWithinPartitions(df['discrete_timestamp'])

    df.printSchema()

    # Flip to True to inspect the output on the console instead of writing
    # it to Pravega.
    write_to_console = False
    if write_to_console:
        (df.writeStream
         # .trigger(processingTime='1000 milliseconds')  # limit trigger rate
         .outputMode('append')
         .format('console')
         .option('truncate', 'false')
         .option('checkpointLocation', checkpoint_location)
         .start()
         .awaitTermination())
    else:
        (df.writeStream
         .trigger(processingTime="1000 milliseconds")
         .outputMode("append")
         .format("pravega")
         .option("controller", controller)
         .option("scope", scope)
         .option("stream", "combinedvideo")
         .option("checkpointLocation", checkpoint_location)
         .start()
         .awaitTermination())
def test12(spark):
    """
    Reassemble video records that were written as a fixed number of chunks.

    One filtered stream is built per chunk index, the streams are joined on
    (timestamp, camera, ssrc), the chunk payloads are concatenated, and the
    CRC-32 stored in the first four bytes of the result is verified.
    """
    # ssrc is the synchronization source identifier. See https://en.wikipedia.org/wiki/Real-time_Transport_Protocol.
    # It should be selected at random by each process that writes records.
    schema = 'timestamp timestamp, frame_number int, camera int, chunk int, num_chunks int, ssrc int, data binary'

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')

    records = (spark.readStream
               .format("pravega")
               .option("controller", controller)
               .option("scope", scope)
               .option("stream", "video")
               .load())

    # Decode JSON event.
    parsed_event = from_json('event_string',
                             schema=schema,
                             options=dict(mode='FAILFAST')).alias('event')
    records = (records
               .withColumnRenamed('event', 'raw_event')
               .select('*', decode('raw_event', 'UTF-8').alias('event_string'))
               .select('*', parsed_event)
               .select('*', 'event.*')
               .withWatermark('timestamp', '60 second'))

    # The number of chunks must be fixed for the entire Spark job because it determines the number of joins.
    num_chunks = 3

    # Ignore any records with a different number of chunks. Perhaps these can be sent to an error stream.
    records = records.filter(records.num_chunks == num_chunks)

    # Create a dataframe for each chunk.
    def select_chunk(chunk_index):
        only_this_chunk = records.filter(records.chunk == chunk_index)
        return (only_this_chunk
                .drop('chunk')
                .withColumnRenamed('data', 'data%d' % chunk_index))

    chunk_dfs = [select_chunk(i) for i in range(num_chunks)]

    # Join chunks.
    join_keys = ['timestamp', 'camera', 'ssrc']
    joined = chunk_dfs[0]
    for other in chunk_dfs[1:]:
        joined = joined.join(other, join_keys, 'inner')

    # Concatenate binary data.
    data_cols = ['data%d' % i for i in range(num_chunks)]
    joined = joined.select('timestamp', 'camera', 'ssrc',
                           concat(*data_cols).alias('data'))

    # Deduplication.
    joined = joined.dropDuplicates(['timestamp', 'camera'])

    @udf(returnType=BinaryType())
    def parse_checksum(checksum_and_data):
        # The first four bytes hold the checksum.
        return checksum_and_data[0:4]

    @udf(returnType=BinaryType())
    def parse_data(checksum_and_data):
        # Everything after the checksum prefix is the payload.
        return checksum_and_data[4:]

    @udf(returnType=BooleanType())
    def is_checksum_correct(checksum, data):
        # The checksum is unpacked as a big-endian unsigned 32-bit integer.
        (expected,) = struct.unpack('!I', checksum)
        calculated = zlib.crc32(data)
        print('expected=%d, calculated=%d' % (expected, calculated))
        return expected == calculated

    joined = joined.withColumnRenamed('data', 'checksum_and_data')
    joined = joined.select(
        '*',
        parse_checksum('checksum_and_data').alias('checksum'),
        parse_data('checksum_and_data').alias('data'))
    joined = joined.select(
        '*',
        is_checksum_correct('checksum', 'data').alias('is_checksum_correct'))
    # joined = joined.filter(joined.is_checksum_correct == True)

    joined.printSchema()

    query = (joined.writeStream
             .trigger(processingTime='3 seconds')  # limit trigger rate
             .outputMode('append')
             .format('console')
             .option('truncate', 'false')
             .start())
    query.awaitTermination()
if __name__ == '__main__':
    # Count crime records per crime type from the "la-crime" Kafka topic and
    # keep printing the complete, descending ranking to the console.
    from pyspark.sql.functions import get_json_object, decode

    builder = pyspark.sql.SparkSession.builder
    spark = builder.appName("StructuredNetworkWordCount").getOrCreate()

    reader = spark.readStream.format("kafka")
    reader = reader.option(
        "kafka.bootstrap.servers",
        "localhost:9092,localhost:9093,localhost:9094")
    reader = reader.option("subscribe", "la-crime")
    df = reader.load()

    # binary to UTF-8
    df_string = df.select(decode(df.value, 'UTF-8').alias('json'))

    # Pull the crime type out of each JSON document.
    crime_types = df_string.select(
        get_json_object(df_string.json,
                        '$.Crime Code Description').alias('types'))

    per_type = crime_types.groupBy("types").count()
    crime_types_count = per_type.orderBy('count', ascending=False)  # .limit(5)

    writer = crime_types_count.writeStream.outputMode("complete").format("console")
    query = writer.start()
    query.awaitTermination()