def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw") \
        .format("console") \
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(
        countquery, 2, colnames, args.finkwebpath, "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
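# For reference, a minimal plain PySpark sketch of what connect_to_kafka plus the
# console sink boil down to, without the fink_broker helpers. This is only an
# illustration: the bootstrap servers and topic below are placeholders, and the
# spark-sql-kafka-0-10 package must be available on the Spark classpath.
def checkstream_sketch():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("checkstream-sketch").getOrCreate()

    # Raw Kafka rows: (key, value, topic, partition, offset, timestamp, ...)
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "ztf-stream-sim") \
        .option("startingOffsets", "latest") \
        .option("failOnDataLoss", "false") \
        .load()

    # Dump each micro-batch to the console to check the stream is alive
    query = df \
        .writeStream \
        .queryName("qraw") \
        .format("console") \
        .outputMode("update") \
        .start()

    query.awaitTermination()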
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False,
        kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only the decoded alert data
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()
        return

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data by day (year/month/day)
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
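# Hedged sketch of the Julian Date -> timestamp conversion that jd_to_datetime
# performs on candidate.jd (the actual fink_broker implementation may differ in
# details). It relies only on the fact that JD 2440587.5 is the Unix epoch
# (1970-01-01T00:00:00 UTC), so unix_seconds = (jd - 2440587.5) * 86400.
# Assumes Spark 3 type-hinted pandas UDFs; the function name below is
# illustrative, not the real helper.
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType())
def jd_to_datetime_sketch(jd: pd.Series) -> pd.Series:
    # Julian Date -> seconds since the Unix epoch -> datetime64
    return pd.to_datetime((jd - 2440587.5) * 86400.0, unit="s")

# Illustrative usage:
#   df.withColumn("timestamp", jd_to_datetime_sketch(df["candidate.jd"]))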
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, topic, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(
        countquery, ui_refresh, colnames, args.finkwebpath,
        "live_raw.csv", "live")
    monitor_progress_webui(
        countquery, ui_refresh, colnames, args.finkwebpath,
        "history.csv", "history")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
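# The topic/year/month/day/hour layout written above pays off at read time:
# filtering on the partition columns lets Spark prune directories and scan only
# the matching hour, e.g.
#
#   rawdatapath/topic=.../year=2019/month=11/day=01/hour=03/part-*.parquet
#
# A hedged sketch, with a placeholder path and placeholder partition values
# standing in for args.rawdatapath and real observing dates:
def read_one_hour_sketch():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("read-raw-sketch").getOrCreate()

    df = spark.read.parquet("/path/to/rawdatapath")

    # Only the directories matching these partition values are scanned
    one_hour = df \
        .filter(df["topic"] == "ztf-stream-sim") \
        .filter(df["year"] == "2019") \
        .filter(df["month"] == "11") \
        .filter(df["day"] == "01") \
        .filter(df["hour"] == "03")

    print(one_hour.count())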