def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(
        name="archivingStream", shuffle_partitions=2, log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath) \
        .option("path", args.outputpath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the archiving service normally...")
    else:
        countquery.awaitTermination()
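# Minimal sketch (not part of the service): how the year/month/day/hour
# partition columns above are derived with date_format on a toy DataFrame.
# The helper below, its local Spark session and its fake input are
# assumptions for illustration only.
def _example_hourly_partitioning():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import date_format, to_timestamp

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # One fake event with a string timestamp, cast to a proper timestamp
    df = spark.createDataFrame([("2019-11-02 03:14:15",)], ["ts"])\
        .withColumn("timestamp", to_timestamp("ts"))

    # Same derivation as in main(): each row gets its partition labels
    df_part = df\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # year=2019, month=11, day=02, hour=03
    df_part.select("year", "month", "day", "hour").show()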
def decode_kafka_df(df_kafka: DataFrame, schema_path: str) -> DataFrame:
    """Decode the DataFrame read from Kafka

    The DataFrame read from Kafka contains the following columns:
    key: binary
    value: binary
    topic: string
    partition: int
    offset: long
    timestamp: long
    timestampType: integer

    The value column contains the structured data of the alert encoded into
    avro(binary). This routine creates a Spark DataFrame with a decoded
    StructType column using the avro schema at schema_path.

    Parameters
    ----------
    df_kafka: DataFrame
        A Spark DataFrame created after reading the Kafka Source
    schema_path: str
        Path where the avro schema to decode the Kafka message is stored

    Returns
    ----------
    df: DataFrame
        A Spark DataFrame with a StructType Column with decoded data of
        the avro(binary) column named "value"

    Examples
    ----------
    >>> df = spark.sparkContext.parallelize(zip(
    ...     ["ZTF18aceatkx", "ZTF18acsbjvw"],
    ...     [697251923115015002, 697251921215010004],
    ...     [20.393772, 20.4233877],
    ...     [-25.4669463, -27.0588511],
    ...     ["Star", "Unknown"])).toDF([
    ...         "objectId", "candid", "candidate_ra",
    ...         "candidate_dec", "cross_match_alerts_per_batch"])
    >>> df.show()
    +------------+------------------+------------+-------------+----------------------------+
    |    objectId|            candid|candidate_ra|candidate_dec|cross_match_alerts_per_batch|
    +------------+------------------+------------+-------------+----------------------------+
    |ZTF18aceatkx|697251923115015002|   20.393772|  -25.4669463|                        Star|
    |ZTF18acsbjvw|697251921215010004|  20.4233877|  -27.0588511|                     Unknown|
    +------------+------------------+------------+-------------+----------------------------+
    <BLANKLINE>
    >>> temp_schema = os.path.join(os.environ["PWD"], "temp_schema")
    >>> save_avro_schema(df, temp_schema)

    # Encode the data into avro
    >>> df_kafka = get_kafka_df(df, '')

    # Decode the avro df
    >>> df_decoded = decode_kafka_df(df_kafka, temp_schema)
    >>> df_decoded.printSchema()
    root
     |-- struct: struct (nullable = true)
     |    |-- objectId: string (nullable = true)
     |    |-- candid: long (nullable = true)
     |    |-- candidate_ra: double (nullable = true)
     |    |-- candidate_dec: double (nullable = true)
     |    |-- cross_match_alerts_per_batch: string (nullable = true)
    <BLANKLINE>
    >>> df_decoded.select(col("struct.*")).show()
    +------------+------------------+------------+-------------+----------------------------+
    |    objectId|            candid|candidate_ra|candidate_dec|cross_match_alerts_per_batch|
    +------------+------------------+------------+-------------+----------------------------+
    |ZTF18aceatkx|697251923115015002|   20.393772|  -25.4669463|                        Star|
    |ZTF18acsbjvw|697251921215010004|  20.4233877|  -27.0588511|                     Unknown|
    +------------+------------------+------------+-------------+----------------------------+
    <BLANKLINE>
    >>> os.remove(temp_schema)
    """
    # Read the avro schema
    with open(schema_path) as f:
        avro_schema = json.dumps(json.load(f))

    # Decode the avro(binary) column
    df = df_kafka.select(from_avro("value", avro_schema).alias("struct"))

    return df
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets_stream, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
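# Minimal sketch (illustration only; the toy schema and local session are
# assumptions) of the flattening trick used in main(): replacing the
# 'decoded' entry by 'decoded.*' in selectExpr promotes all fields of the
# struct column to top-level columns.
def _example_flatten_struct():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # Toy nested DataFrame mimicking (timestamp, decoded struct)
    df = spark.createDataFrame(
        [("2019-11-02 03:14:15", ("ZTF18aceatkx", 20.393772))],
        "ts string, decoded struct<objectId:string, ra:double>")

    # Swap the struct column name for a star expression and flatten
    cnames = df.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_flat = df.selectExpr(cnames)

    # Columns are now: ts, objectId, ra
    df_flat.printSchema()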
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False, kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro: each Kafka value is an Avro
        # file object, and we recover its first record.
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data daily, using the observation Julian date
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
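# Minimal sketch (illustration only; toy schema and record, no Kafka or
# Spark involved) of what the fastavro branch above does for a single
# message: the Kafka `value` bytes form an Avro file object, and the first
# record is recovered with fastavro before Spark applies the alert schema.
def _example_fastavro_roundtrip():
    import io
    import fastavro

    toy_schema = {
        "type": "record",
        "name": "alert",
        "fields": [
            {"name": "objectId", "type": "string"},
            {"name": "ra", "type": "double"},
        ],
    }

    # Encode one record the way a producer would
    buf = io.BytesIO()
    fastavro.writer(
        buf, toy_schema, [{"objectId": "ZTF18aceatkx", "ra": 20.393772}])
    payload = buf.getvalue()

    # Decode it back, as done inside the UDF
    record = next(fastavro.reader(io.BytesIO(payload)))
    print(record)  # {'objectId': 'ZTF18aceatkx', 'ra': 20.393772}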
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(
        name="classifyStream", shuffle_partitions=2, log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Select only (timestamp, id, ra, dec)
    df_expanded = df_decoded.select([
        df_decoded["timestamp"],
        df_decoded["decoded.objectId"],
        df_decoded["decoded.candidate.ra"],
        df_decoded["decoded.candidate.dec"]
    ])

    # for each micro-batch, perform a cross-match with an external catalog,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_expanded.withColumn(
        "type",
        cross_match_alerts_per_batch(col("objectId"), col("ra"), col("dec")))

    # Group data by type and count members
    df_group = df_type.groupBy("type").count()

    # Update the DataFrame every tinterval seconds
    countquery_tmp = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the classify service normally...")
    else:
        countquery.awaitTermination()
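# `write_to_csv` is imported from the package and not shown here. Below is a
# rough, hypothetical sketch of what a foreachBatch sink looks like (the
# function name and output path are assumptions, not the actual
# implementation): Structured Streaming calls the function once per
# micro-batch with a static DataFrame and a batch id, which can then use any
# regular batch writer.
def _example_foreachbatch_sink(batch_df, batch_id):
    # Append the (type, count) rows of this micro-batch to a CSV folder
    batch_df\
        .coalesce(1)\
        .write\
        .mode("append")\
        .option("header", True)\
        .csv("/tmp/fink_type_counts")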