def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
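# A minimal sketch of the boilerplate this entry point assumes but which is
# not shown in the excerpt: standard-library and PySpark imports plus the
# usual script guard. The Fink helpers (getargs, init_sparksession,
# get_fink_logger, inspect_application, connect_to_raw_database,
# apply_user_defined_filter, apply_science_modules, qualitycuts, fbvsn, fsvsn)
# come from the fink_broker / fink_science packages; their exact module paths
# are not reproduced here to avoid guessing.

import argparse
import time

from pyspark.sql import functions as F


if __name__ == "__main__":
    main()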
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="distribute_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr',
        'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
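# The concat_col helper used above comes from fink_broker and is not shown in
# this excerpt. As a rough illustration only (an assumption, not the actual
# implementation), such a helper can append the current candidate measurement
# to the prv_candidates history for a given field, producing the 'c'-prefixed
# columns consumed downstream.

from pyspark.sql import functions as F


def concat_col_sketch(df, colname, prefix='c'):
    """Illustrative stand-in for fink_broker's concat_col (assumption).

    Concatenate the historical measurements (prv_candidates.<colname>)
    with the current one (candidate.<colname>) into a new prefixed column.
    """
    return df.withColumn(
        prefix + colname,
        F.concat(
            df['prv_candidates.' + colname],
            F.array(df['candidate.' + colname])
        )
    )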
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
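# For reference, the night-partitioned parquet output written above can be
# read back with plain Spark. The sketch below assumes an active SparkSession;
# the data prefix and night value are placeholders, while the
# year=/month=/day= layout mirrors the partitionBy used by the stream.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

night = "20200101"  # placeholder night (YYYYMMDD)
scidata = spark.read.parquet(
    "online/science/year={}/month={}/day={}".format(
        night[0:4], night[4:6], night[6:8]
    )
)
scidata.printSchema()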
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(
        args.scitmpdatapath, args.scitmpdatapath + "/*", latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
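# A hedged sketch of how a downstream consumer could subscribe to one of the
# topics published above with Spark Structured Streaming. The broker address
# and topic name are placeholders, the SASL options mirror the producer side
# (a proper JAAS configuration is required but omitted), and the Kafka source
# connector package must be on the Spark classpath. The payload sits in the
# binary `value` column (avro-serialized by get_kafka_df); deserialization is
# left out here.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

consumer = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")  # placeholder broker
    .option("kafka.security.protocol", "SASL_PLAINTEXT")
    .option("kafka.sasl.mechanism", "SCRAM-SHA-512")
    .option("subscribe", "my_fink_topic")  # placeholder topic name
    .load()
)

# Print the raw records to the console for inspection.
query = consumer.writeStream.format("console").start()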
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply level one processor: cdsxmatch
    logger.info("New processor: cdsxmatch")
    colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    df = df.withColumn(cdsxmatch.__name__, cdsxmatch(*colnames))

    # Apply level one processor: rfscore
    logger.info("New processor: rfscore")

    # Required alert columns
    what = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr',
        'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Use for creating temp name
    prefix = 'c'
    what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    for colname in what:
        df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification.
    # Note we can omit the model_path argument, and in that case the
    # default model `data/models/default-model.obj` will be used.
    rfscore_args = [F.col(i) for i in what_prefix]
    df = df.withColumn(rfscore.__name__, rfscore(*rfscore_args))

    # Apply level one processor: microlensing
    logger.info("New processor: microlensing")

    # Retrieve schema
    schema = load_mulens_schema_twobands()

    # Create standard UDF
    mulens_udf = F.udf(mulens, schema)

    # Required alert columns - already computed for SN
    what_prefix_mulens = [
        'cfid', 'cmagpsf', 'csigmapsf', 'cmagnr',
        'csigmagnr', 'cmagzpsci', 'cisdiffpos'
    ]

    mulens_args = [F.col(i) for i in what_prefix_mulens]
    df = df.withColumn('mulens', mulens_udf(*mulens_args))

    # Drop temp columns
    df = df.drop(*what_prefix)

    # Partition the data hourly
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .withColumn("hour", F.date_format("timestamp", "HH"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day", "hour") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create a new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete") \
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='30 seconds')\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
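# write_to_csv is referenced above but not defined in this excerpt. As a rough
# illustration only (an assumption, not Fink's actual write_to_csv), a
# foreachBatch sink with the expected (DataFrame, epoch_id) signature could
# dump the per-type counts of each micro-batch to a CSV folder; the output
# path and write mode are placeholders.

def write_to_csv_sketch(batch_df, epoch_id):
    """Illustrative foreachBatch sink (assumption, not Fink's write_to_csv).

    Write the grouped counts of a micro-batch to a CSV folder named after
    the epoch ID.
    """
    batch_df\
        .coalesce(1)\
        .write\
        .mode("overwrite")\
        .option("header", True)\
        .csv("simbadtype_counts_epoch_{}".format(epoch_id))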
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(
        name="buildSciDB", shuffle_partitions=2, log_level="ERROR")

    # FIXME!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply filters and keep only good alerts
    df_filt = df.withColumn(
        "toKeep",
        keep_alert_based_on(
            col("decoded.candidate.nbad"),
            col("decoded.candidate.rb"),
            col("decoded.candidate.magdiff")
        )
    ).filter("toKeep == true")

    # For good alerts, perform a cross-match with SIMBAD,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_filt.withColumn(
        "simbadType",
        cross_match_alerts_per_batch(
            col("decoded.objectId"),
            col("decoded.candidate.ra"),
            col("decoded.candidate.dec")
        )
    ).selectExpr(
        "decoded.*", "cast(timestamp as string) as timestamp", "simbadType")

    df_hbase = flattenstruct(df_type, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    df_group = df_type.groupBy("simbadType").count()
    groupquery = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)\
        .trigger(processingTime='30 seconds')\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        groupquery.stop()
        print("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
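# keep_alert_based_on is defined elsewhere in the codebase. As an illustration
# only, a column-returning UDF applying cuts of the same flavour as the
# explicit filters used in the later raw2science version (nbad == 0,
# rb >= 0.55) could look like the sketch below; the magdiff bound of 0.1 is a
# made-up placeholder, not a value taken from Fink.

from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType


@udf(BooleanType())
def keep_alert_based_on_sketch(nbad, rb, magdiff):
    """Illustrative quality cut (assumption, not Fink's keep_alert_based_on).

    Keep alerts with no bad pixels, a decent real-bogus score, and a
    bounded magnitude difference (the 0.1 bound is a placeholder).
    """
    return (nbad == 0) and (rb >= 0.55) and (abs(magdiff) <= 0.1)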