def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session, otherwise create it.
    spark = init_sparksession(name="readingScienceDB", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    with open(args.science_db_catalog) as f:
        catalog = json.load(f)
    catalog_dic = json.loads(catalog)

    df = spark.read.option("catalog", catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .load()

    print("Number of entries in {}: ".format(
        catalog_dic["table"]["name"]), df.count())
    print("Number of distinct objects in {}: ".format(
        catalog_dic["table"]["name"]),
        df.select('objectId').distinct().count())
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="save_schema_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    df = load_parquet_files(input_science)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    df_kafka = df.selectExpr(cnames)

    path_for_avro = 'new_schema_{}.avro'.format(time())
    df_kafka.limit(1).write.format("avro").save(path_for_avro)

    # retrieve data on local disk
    subprocess.run(["hdfs", "dfs", '-get', path_for_avro])

    # Read the avro schema from .avro file
    avro_file = glob.glob(path_for_avro + "/part*")[0]
    avro_schema = readschemafromavrofile(avro_file)

    # Write the schema to a file for decoding Kafka messages
    with open('schemas/{}'.format(path_for_avro.replace('.avro', '.avsc')), 'w') as f:
        json.dump(avro_schema, f, indent=2)
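# --- Illustrative sketch (not part of the scripts above) ---
# For reference, the writer schema of an Avro file can be extracted with
# fastavro. This is only a plausible sketch of what a helper such as
# `readschemafromavrofile` might do; the actual fink-broker implementation
# may differ.
import fastavro


def read_schema_from_avro_file_sketch(avro_file: str) -> dict:
    """Return the writer schema stored in an .avro file."""
    with open(avro_file, 'rb') as fo:
        # `writer_schema` holds the schema the file was written with
        return fastavro.reader(fo).writer_schema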
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribution_test", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Topic to read from
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", broker_list) \
        .option("kafka.security.protocol", "SASL_PLAINTEXT")\
        .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print received stream to the console
    df = df.select("struct.*")

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
        .format("console")\
        .trigger(processingTime='2 seconds')\
        .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
        logger.info("Exiting distribution_test service normally...")
    else:
        debug_query.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(
        countquery, 2, colnames, args.finkwebpath, "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'

        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(~df_ex['magpsf'].isNotNull())

        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'

        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=float) * u.degree,
                dec=np.array(dec, dtype=float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=float) * u.degree,
                dec=np.array(dec2, dtype=float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()

        df_index = df.filter(df['tns'] != '').drop('tns')

        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
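# --- Illustrative sketch (not part of the script above) ---
# The `ang2pix` column function used for the 'pixel' index tables is assumed
# here to wrap healpy's ang2pix. This is only a plausible sketch, not the
# actual fink-broker implementation.
import pandas as pd
import healpy as hp
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import LongType


@pandas_udf(LongType(), PandasUDFType.SCALAR)
def ang2pix_sketch(ra, dec, nside):
    """Return the HEALPix pixel index of (ra, dec), given in degrees."""
    pixels = hp.ang2pix(int(nside.values[0]), ra.values, dec.values, lonlat=True)
    return pd.Series(pixels)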
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete") \
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='30 seconds')\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    npart = df.rdd.getNumPartitions()

    # Apply level one filters
    logger.info(qualitycuts)
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # join back information to the initial dataframe
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    df.coalesce(npart).write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="TNS_report_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8])
    df = load_parquet_files(path)

    with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
        tns_marker = f.read().replace('\n', '')

    if not args.tns_sandbox:
        print("WARNING: submitting to real (not sandbox) TNS website")

    if args.tns_sandbox:
        url_tns_api = "https://sandbox.wis-tns.org/api"
        with open('{}/sandbox-tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')
    else:
        url_tns_api = "https://www.wis-tns.org/api"
        with open('{}/tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')

    cols = [
        'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'candidate.ndethist', 'candidate.drb', 'candidate.classtar',
        'candidate.jd', 'candidate.jdstarthist', 'rf_kn_vs_nonkn', 'tracklet'
    ]
    df = df.withColumn('class', extract_fink_classification(*cols))

    pdf = df\
        .filter(df['class'] == 'Early SN candidate')\
        .filter(df['candidate.ndethist'] <= 20)\
        .toPandas()

    pdf_unique = pdf.groupby('objectId')[pdf.columns].min()
    print("{} new alerts".format(len(pdf)))
    print("{} new sources".format(len(pdf_unique)))
    pdf = pdf_unique

    ids = []
    report = {"at_report": {}}
    check_tns = False
    for index, row in enumerate(pdf.iterrows()):
        alert = row[1]
        past_ids = read_past_ids(args.tns_folder)
        if alert['objectId'] in past_ids.values:
            print('{} already sent!'.format(alert['objectId']))
            continue
        if check_tns:
            groupid = retrieve_groupid(key, tns_marker, alert['objectId'])
            if groupid > 0:
                print("{} already reported by {}".format(
                    alert['objectId'], groupid))
        else:
            print('New report for object {}'.format(alert['objectId']))
            photometry, non_detection = extract_discovery_photometry(alert)
            report['at_report']["{}".format(index)] = build_report(
                alert, photometry, non_detection)
            ids.append(alert['objectId'])
    print('new objects: ', ids)

    if len(ids) != 0:
        json_report = save_logs_and_return_json_report(
            name='{}{}{}'.format(
                args.night[:4], args.night[4:6], args.night[6:8]),
            folder=args.tns_folder,
            ids=ids,
            report=report)
        r = send_json_report(key, url_tns_api, json_report, tns_marker)
        print(r.json())

        # post to slack
        slacktxt = ' \n '.join(
            ['https://fink-portal/{}'.format(i) for i in ids])
        slacktxt = '{} \n '.format(args.night) + slacktxt
        r = requests.post(
            os.environ['TNSWEBHOOK'],
            json={'text': slacktxt, "username": "******"},
            headers={'Content-Type': 'application/json'})
        print(r.status_code)
    else:
        slacktxt = '{} \n No new sources'.format(args.night)
        r = requests.post(
            os.environ['TNSWEBHOOK'],
            json={'text': slacktxt, "username": "******"},
            headers={'Content-Type': 'application/json'})
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(
        args.scitmpdatapath, args.scitmpdatapath + "/*", latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers

    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/year={}/month={}/day={}'.format(
        args.rawdatapath, year, month, day)
    input_science = '{}/year={}/month={}/day={}'.format(
        args.scitmpdatapath, year, month, day)

    # basepath
    output_raw = 'ztf_alerts/raw'
    output_science = 'ztf_alerts/science'

    print('Raw data processing....')

    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_science))

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_science))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)

    # Remove temporary alert folder - beware you'll never get it back!
    if args.fs == 'hdfs':
        subprocess.run(["hdfs", "dfs", '-rm', '-rf', args.datapath])
    elif args.fs == 'local':
        subprocess.run(['rm', '-rf', args.datapath])
    else:
        print('Filesystem not understood. FS_KIND must be hdfs or local.')
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100  # set a default
    t_end = 1577836799  # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service:
    # keep scanning HBase for new records in a loop
    while not exit_after or time.time() < t_end:
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep records that haven't been distributed
        df = df.filter("status!='distributed'")

        # Send out slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra",
                "candidate_dec", "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user defined xml)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # create a nested dataframe similar to the original ztf dataframe
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Ensure that the topic(s) exist on the Kafka Server
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in Hbase and commit checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # matches with SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host-galaxy
    list_simbad_galaxies = [
        "galaxy", "Galaxy", "EmG", "Seyfert", "Seyfert_1", "Seyfert_2",
        "BlueCompG", "StarburstG", "LSB_G", "HII_G", "High_z_G",
        "GinPair", "GinGroup", "BClG", "GinCl", "PartofG",
    ]

    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp
    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw', 'sci', 'night', 'n_g', 'n_r', 'exposures', 'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import slack

from fink_broker.tester import spark_unit_tests
from fink_broker.loggingUtils import get_fink_logger
from pyspark.sql import DataFrame

logger = get_fink_logger(__name__, "INFO")


class FinkSlackClient:
    def __init__(self, api_token):
        self._client = slack.WebClient(token=api_token)

        try:
            self._client.auth_test()
        except Exception:
            logger.error("Authentication Error: Invalid Token")

        # create a dict of {channelName: ID}
        channels = self._client.channels_list()['channels']
        self._channel_ids = {
            x['name']: x['id']
            for x in channels
        }
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False,
        kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data by day
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
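# --- Illustrative sketch (not part of the script above) ---
# `jd_to_datetime` is used throughout to turn the alert Julian Date
# (candidate.jd) into a timestamp for year/month/day partitioning. Below is
# one plausible implementation based on astropy; the actual fink-broker
# helper may differ.
import pandas as pd
from astropy.time import Time
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import TimestampType


@pandas_udf(TimestampType(), PandasUDFType.SCALAR)
def jd_to_datetime_sketch(jd):
    """Convert a column of Julian Dates into UTC datetimes."""
    return pd.Series(Time(jd.values, format='jd').to_datetime())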
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers,
        topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(
        countquery, ui_refresh, colnames, args.finkwebpath,
        "live_raw.csv", "live")
    monitor_progress_webui(
        countquery, ui_refresh, colnames, args.finkwebpath,
        "history.csv", "history")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="science_archival_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8])
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the hbase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, args.science_db_name, rowkeyname=row_key_name, cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - inplace replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)

        df_schema = construct_schema_row(
            df,
            rowkeyname=schema_row_key_name,
            version='schema_{}_{}'.format(fbvsn, fsvsn))

        # construct the hbase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
def apply_user_defined_processors(df: DataFrame, processor_names: list):
    """Apply iteratively user processors to give added values to the stream.

    Each processor will add one new column to the input DataFrame. The name
    of the column will be the name of the processor routine.

    Parameters
    ----------
    df: DataFrame
        Spark DataFrame with alert data
    processor_names: list of string
        List containing processor names to be applied. These processors
        should come from the fink-science module (see example below).

    Returns
    -------
    df: DataFrame
        Spark DataFrame with new columns added.

    Examples
    --------
    >>> from pyspark.sql.functions import struct
    >>> df = spark.sparkContext.parallelize(zip(
    ...     [26.8566983, 26.24497],
    ...     [-26.9677112, -26.7569436],
    ...     ["1", "2"])).toDF(["ra", "dec", "objectId"])

    # Nest the DataFrame as for alerts
    >>> df = df.select(struct(df.columns).alias("candidate"))\
        .select(struct("candidate").alias("decoded"))

    # Perform cross-match
    >>> processors = ['fink_science.xmatch.processor.cdsxmatch']
    >>> df = apply_user_defined_processors(df, processors)

    >>> new_colnames = ["decoded.candidate.*", "cdsxmatch"]
    >>> df = df.select(new_colnames)

    >>> df.show() # doctest: +NORMALIZE_WHITESPACE
    +----------+-----------+--------+---------+
    |        ra|        dec|objectId|cdsxmatch|
    +----------+-----------+--------+---------+
    |26.8566983|-26.9677112|       1|     Star|
    |  26.24497|-26.7569436|       2|  Unknown|
    +----------+-----------+--------+---------+
    <BLANKLINE>
    """
    logger = get_fink_logger(__name__, "INFO")

    flatten_schema = return_flatten_names(df, pref="", flatten_schema=[])

    # Loop over user-defined processors
    for processor_func_name in processor_names:
        # Load the processor
        proc_name = processor_func_name.split('.')[-1]
        module_name = processor_func_name.split('.' + proc_name)[0]
        module = importlib.import_module(module_name)
        processor_func = getattr(module, proc_name, None)

        # Note: to access input argument, we need f.func and not just f.
        # This is because f has a decorator on it.
        ninput = processor_func.func.__code__.co_argcount

        # Note: This works only with `struct` fields - not `array`
        argnames = processor_func.func.__code__.co_varnames[:ninput]

        colnames = []
        for argname in argnames:
            colname = [
                col(i) for i in flatten_schema
                if i.endswith("{}".format(argname))
            ]
            if len(colname) == 0:
                raise AssertionError("""
                    Column name {} is not a valid column of the DataFrame.
                """.format(argname))
            colnames.append(colname[0])

        df = df.withColumn(processor_func.__name__, processor_func(*colnames))

        logger.info("new processor registered: {} from {}".format(
            proc_name, module_name))

    return df
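# --- Illustrative sketch (not part of the function above) ---
# A processor usable with apply_user_defined_processors is assumed to be a
# decorated pandas UDF whose argument names match (flattened) alert columns,
# returning one value per alert. The example below is hypothetical and only
# shows the expected shape; real processors live in the fink-science package.
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DoubleType


@pandas_udf(DoubleType(), PandasUDFType.SCALAR)
def toy_magnitude_offset(magpsf, magnr):
    """Toy processor: difference between PSF and nearest-reference magnitude."""
    return magpsf - magnr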
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)

    # basepath
    output_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    output_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    print('Raw data processing....')

    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    npart_after = int(numPart(df_science))
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', npart_after)

    # Add tracklet information before merging
    df_trck = add_tracklet_information(df_science)

    # join back information to the initial dataframe
    df_science = df_science\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(npart_after)\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply level one processor: cdsxmatch
    logger.info("New processor: cdsxmatch")
    colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    df = df.withColumn(cdsxmatch.__name__, cdsxmatch(*colnames))

    # Apply level one processor: rfscore
    logger.info("New processor: rfscore")

    # Required alert columns
    what = [
        'jd', 'fid', 'magpsf', 'sigmapsf',
        'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Use for creating temp name
    prefix = 'c'
    what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    for colname in what:
        df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification.
    # Note we can omit the model_path argument, and in that case the
    # default model `data/models/default-model.obj` will be used.
    rfscore_args = [F.col(i) for i in what_prefix]
    df = df.withColumn(rfscore.__name__, rfscore(*rfscore_args))

    # Apply level one processor: microlensing
    logger.info("New processor: microlensing")

    # Retrieve schema
    schema = load_mulens_schema_twobands()

    # Create standard UDF
    mulens_udf = F.udf(mulens, schema)

    # Required alert columns - already computed for SN
    what_prefix_mulens = [
        'cfid', 'cmagpsf', 'csigmapsf',
        'cmagnr', 'csigmagnr', 'cmagzpsci', 'cisdiffpos'
    ]

    mulens_args = [F.col(i) for i in what_prefix_mulens]
    df = df.withColumn('mulens', mulens_udf(*mulens_args))

    # Drop temp columns
    df = df.drop(*what_prefix)

    # Partition the data hourly
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .withColumn("hour", F.date_format("timestamp", "HH"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day", "hour") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="distribute_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf',
        'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def apply_user_defined_filter(df: DataFrame, toapply: str) -> DataFrame:
    """Apply a user filter to keep only wanted alerts.

    Parameters
    ----------
    df: DataFrame
        Spark DataFrame with alert data
    toapply: string
        Filter name to be applied. It should be in the form
        module.module.routine (see example below).

    Returns
    -------
    df: DataFrame
        Spark DataFrame with filtered alert data

    Examples
    --------
    >>> from pyspark.sql.functions import struct
    >>> colnames = ["cdsxmatch", "rb", "magdiff"]
    >>> df = spark.sparkContext.parallelize(zip(
    ...     ['RRLyr', 'Unknown', 'Star', 'SN1a'],
    ...     [0.01, 0.02, 0.6, 0.01],
    ...     [0.02, 0.05, 0.1, 0.01])).toDF(colnames)
    >>> df.show() # doctest: +NORMALIZE_WHITESPACE
    +---------+----+-------+
    |cdsxmatch|  rb|magdiff|
    +---------+----+-------+
    |    RRLyr|0.01|   0.02|
    |  Unknown|0.02|   0.05|
    |     Star| 0.6|    0.1|
    |     SN1a|0.01|   0.01|
    +---------+----+-------+
    <BLANKLINE>

    # Nest the DataFrame as for alerts
    >>> df = df.select(struct(df.columns).alias("candidate"))\
        .select(struct("candidate").alias("decoded"))

    # Apply quality cuts for example (level one)
    >>> toapply = 'fink_filters.filter_rrlyr.filter.rrlyr'
    >>> df = apply_user_defined_filter(df, toapply)
    >>> df.select("decoded.candidate.*").show() # doctest: +NORMALIZE_WHITESPACE
    +---------+----+-------+
    |cdsxmatch|  rb|magdiff|
    +---------+----+-------+
    |    RRLyr|0.01|   0.02|
    +---------+----+-------+
    <BLANKLINE>

    # Using a wrong filter name will lead to an error
    >>> df = apply_user_defined_filter(
    ...     df, "unknownfunc") # doctest: +SKIP
    """
    logger = get_fink_logger(__name__, "INFO")

    flatten_schema = return_flatten_names(df, pref="", flatten_schema=[])

    # Load the filter
    filter_name = toapply.split('.')[-1]
    module_name = toapply.split('.' + filter_name)[0]
    module = importlib.import_module(module_name)
    filter_func = getattr(module, filter_name, None)

    # Note: to access input argument, we need f.func and not just f.
    # This is because f has a decorator on it.
    ninput = filter_func.func.__code__.co_argcount

    # Note: This works only with `struct` fields - not `array`
    argnames = filter_func.func.__code__.co_varnames[:ninput]

    colnames = []
    for argname in argnames:
        colname = [
            col(i) for i in flatten_schema
            if i.endswith("{}".format(argname))
        ]
        if len(colname) == 0:
            raise AssertionError("""
                Column name {} is not a valid column of the DataFrame.
            """.format(argname))
        colnames.append(colname[0])

    logger.info("new filter/topic registered: {} from {}".format(
        filter_name, module_name))

    return df\
        .withColumn("toKeep", filter_func(*colnames))\
        .filter("toKeep == true")\
        .drop("toKeep")
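# --- Illustrative sketch (not part of the function above) ---
# A filter usable with apply_user_defined_filter is assumed to be a decorated
# pandas UDF returning booleans, with argument names matching (flattened)
# alert columns. The example below is hypothetical and only shows the
# expected shape; real filters live in the fink-filters package.
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import BooleanType


@pandas_udf(BooleanType(), PandasUDFType.SCALAR)
def toy_bright_alerts_filter(magpsf, rb):
    """Toy filter: keep bright alerts with a decent real-bogus score."""
    return (magpsf < 18.0) & (rb >= 0.55)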