def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="distribute_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data paths
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr',
        'magzpsci', 'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval))\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
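# A minimal sketch of what a `concat_col`-style helper (used above) could look like.
# This is an illustration under assumptions, not the fink-broker implementation:
# the name `concat_col_sketch` is hypothetical, and it assumes `prv_candidates` is
# an array of structs and `candidate` a struct carrying the current measurement.
import pyspark.sql.functions as F

def concat_col_sketch(df, colname, prefix='c'):
    # Build `prefix + colname` (e.g. 'cjd', 'cmagpsf') by appending the current
    # candidate value to the array of historical values from prv_candidates.
    return df.withColumn(
        prefix + colname,
        F.concat(
            F.col('prv_candidates.' + colname),
            F.array(F.col('candidate.' + colname))
        )
    )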
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science.
    # The HBase data source expects the catalog as a JSON string.
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.dumps(json.load(f))

    # Define variables
    min_timestamp = 100  # set a default
    t_end = 1577836799  # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service:
    # keep scanning HBase for new records in a loop.
    while not exit_after or time.time() < t_end:
        # Scan HBase up to the current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read HBase within the timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep only records that haven't been distributed yet
        df = df.filter("status!='distributed'")

        # Send out Slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra", "candidate_dec",
                "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user-defined XML)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # Create a nested dataframe similar to the original ZTF dataframe
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Publish to Kafka (ensure that the topic exists on the Kafka server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in HBase and commit the checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # Update min_timestamp for the next iteration
        min_timestamp = max_timestamp

        # Free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(
        args.scitmpdatapath,
        args.scitmpdatapath + "/*",
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution", shuffle_partitions=2, log_level="ERROR")

    # Read the catalog file generated by raw2science.
    # The HBase data source expects the catalog as a JSON string.
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.dumps(json.load(f))

    # Define variables
    min_timestamp = 100  # set a default
    t_end = 1577836799  # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service:
    # keep scanning HBase for new records in a loop.
    while not exit_after or time.time() < t_end:
        # Scan HBase up to the current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read HBase within the timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Filter out records that have already been distributed
        df = df.filter("status!='distributed'")

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df, args.distribution_schema)

        # Publish Kafka topic(s) (ensure that the topic(s) exist on the Kafka server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", "localhost:9093")\
            .option("topic", "distribution_test")\
            .save()

        # Update the status column in HBase
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # Update min_timestamp for the next iteration
        min_timestamp = max_timestamp

        # Wait for some time before another loop
        time.sleep(1)
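# The polling loops above rely on `get_distribution_offset` to decide where to
# resume scanning HBase between runs. As a hedged illustration only (hypothetical
# helpers, not the fink-broker implementation): the offset can simply be the last
# distributed timestamp in milliseconds, stored in a small checkpoint file.
import os
import time

def get_distribution_offset_sketch(checkpoint_path, starting_offset="latest"):
    # Return the timestamp (in ms) from which to resume the HBase scan:
    # 'earliest' -> scan everything, 'latest' -> start from now,
    # otherwise resume from the checkpoint file when it exists.
    if starting_offset == "earliest":
        return 100
    if starting_offset == "latest" or not os.path.exists(checkpoint_path):
        return int(round(time.time() * 1000))
    with open(checkpoint_path) as f:
        return int(f.read().strip())

def commit_offset_sketch(checkpoint_path, max_timestamp):
    # Persist the last distributed timestamp so the next run resumes from there.
    with open(checkpoint_path, "w") as f:
        f.write(str(max_timestamp))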