def update_status_in_hbase(
        df: DataFrame, database_name: str, rowkey: str,
        offsetfile: str, timestamp: int):
    """Update the status column in HBase

    Parameters
    ----------
    df: DataFrame
        A Spark DataFrame created after reading the database (HBase)
    database_name: str
        Name of the database
    rowkey: str
        Name of the rowkey in the HBase catalog
    offsetfile: str
        Path of the offset file for distribution
    timestamp: int
        Timestamp up to which the science db has been scanned and distributed
    """
    df = df.select(rowkey, "status")
    df = df.withColumn("status", lit("distributed"))

    update_catalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, database_name, rowkey)

    df.write\
        .option("catalog", update_catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # write offset (timestamp) to file
    with open(offsetfile, 'w') as f:
        string = "distributed till, {}".format(timestamp)
        f.write(string)
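# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): how a downstream consumer
# could read back the offset file written by `update_status_in_hbase` above.
# `read_distribution_offset` is a hypothetical helper; the one-line file
# format "distributed till, <timestamp>" is the only assumption.
def read_distribution_offset(offsetfile: str) -> int:
    """Return the last distributed timestamp stored in the offset file."""
    with open(offsetfile, 'r') as f:
        # content looks like: "distributed till, 1621900000"
        content = f.read().strip()
    return int(content.split(',')[-1].strip())
# Example:
#   update_status_in_hbase(df, 'test_db', 'objectId', '/tmp/offset.txt', 1621900000)
#   read_distribution_offset('/tmp/offset.txt')  # -> 1621900000
# ---------------------------------------------------------------------------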
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # Number of alerts with a match in SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host galaxy
    list_simbad_galaxies = [
        "galaxy",
        "Galaxy",
        "EmG",
        "Seyfert",
        "Seyfert_1",
        "Seyfert_2",
        "BlueCompG",
        "StarburstG",
        "LSB_G",
        "HII_G",
        "High_z_G",
        "GinPair",
        "GinGroup",
        "BClG",
        "GinCl",
        "PartofG",
    ]

    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # Number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp

    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw',
        'sci',
        'night',
        'n_g',
        'n_r',
        'exposures',
        'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the HBase catalog for the index table
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - in-place replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the HBase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
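# ---------------------------------------------------------------------------
# Illustrative sketch (toy data, hypothetical helper): how the rows returned
# by `groupBy('class').count().collect()` are flattened into the statistics
# dictionary above, and how np.transpose recovers the class names used for
# the 'class' column family. Class names and counts below are made up.
def _example_flatten_class_counts():
    import numpy as np

    # Each collected Row, once passed through asDict()/values(), becomes a
    # [class_name, count] pair
    out_class_ = [['SN candidate', 12], ['Solar System MPC', 3], ['Unknown', 250]]

    out_dic = {'raw': 300, 'sci': 265, 'night': 'ztf_20211103'}
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # First row of the transposed array = class names (numpy casts the counts
    # to strings too, which is harmless since only the names are kept)
    cols_class_ = np.transpose(out_class_)[0]
    return out_dic, list(cols_class_)
# ---------------------------------------------------------------------------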
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist',
        'tooflag', 'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps',
        'diffmaglim', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos', 'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with a likely counterpart in the MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table,
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'

        # explode the history arrays
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits (no detection, i.e. null magpsf)
        df_index = df_ex.filter(df_ex['magpsf'].isNull())
        # drop columns that are entirely null for upper limits
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table,
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'

        # explode the history arrays
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=float) * u.degree,
                dec=np.array(dec, dtype=float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=float) * u.degree,
                dec=np.array(dec2, dtype=float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the HBase catalog for the index table
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - in-place replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )
    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the HBase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
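# ---------------------------------------------------------------------------
# Standalone sketch of the positional crossmatch logic used in the 'tns'
# branch above (toy coordinates, hypothetical example; astropy and numpy are
# the only dependencies). It shows why matches are kept only below 1.5 arcsec.
def _example_tns_crossmatch():
    import numpy as np
    import astropy.units as u
    from astropy.coordinates import SkyCoord

    # ZTF alerts and TNS entries as (ra, dec) in degrees
    catalog_ztf = SkyCoord(
        ra=np.array([10.0, 25.0]) * u.degree,
        dec=np.array([-5.0, 12.0]) * u.degree)
    catalog_tns = SkyCoord(
        ra=np.array([10.0002, 180.0]) * u.degree,
        dec=np.array([-5.0001, 45.0]) * u.degree)

    # For each ZTF alert, find the closest TNS entry
    idx, d2d, _ = catalog_ztf.match_to_catalog_sky(catalog_tns)

    # Keep only matches closer than 1.5 arcsecond, as in the pandas_udf above
    sep_constraint = d2d.degree < 1.5 / 3600
    return idx, sep_constraint  # -> array([0, 0]), array([ True, False])
# ---------------------------------------------------------------------------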
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="science_archival_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the HBase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema,
        args.science_db_name,
        rowkeyname=row_key_name,
        cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - in-place replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)
        df_schema = construct_schema_row(
            df,
            rowkeyname=schema_row_key_name,
            version='schema_{}_{}'.format(fbvsn, fsvsn))

        # construct the HBase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
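# ---------------------------------------------------------------------------
# Minimal local illustration (toy data; assumes the rowkey produced by
# `attach_rowkey` is the concatenation objectId_jd) of how such a rowkey can
# be built with concat_ws. Requires only a local PySpark installation.
def _example_attach_rowkey():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import concat_ws

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    df = spark.createDataFrame(
        [('ZTF21abcdefg', 2459520.75), ('ZTF21hijklmn', 2459520.80)],
        ['objectId', 'jd']
    )
    # e.g. 'ZTF21abcdefg_2459520.75', unique per (object, exposure)
    return df.withColumn('objectId_jd', concat_ws('_', 'objectId', 'jd'))
# ---------------------------------------------------------------------------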
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create a new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every `tinterval` seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete")\
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='{} seconds'.format(args.tinterval))\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
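# ---------------------------------------------------------------------------
# Toy, self-contained illustration of the foreachBatch pattern used above:
# Structured Streaming hands each micro-batch DataFrame plus an epoch id to a
# plain Python function. The built-in "rate" source replaces Kafka/HBase here;
# the names and the 10-second run time are arbitrary.
def _example_foreachbatch():
    from pyspark.sql import SparkSession, DataFrame

    spark = SparkSession.builder.master('local[2]').getOrCreate()

    def write_batch(df: DataFrame, epochid: int):
        # In the real service, this is where df.write.options(catalog=...) goes
        print('epoch {}: {} rows'.format(epochid, df.count()))

    query = spark.readStream.format('rate')\
        .option('rowsPerSecond', 5).load()\
        .writeStream\
        .outputMode('append')\
        .foreachBatch(write_batch)\
        .start()

    query.awaitTermination(10)  # let it run for ~10 seconds
    query.stop()
# ---------------------------------------------------------------------------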
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(
        name="buildSciDB", shuffle_partitions=2, log_level="ERROR")

    # FIXME!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply filters and keep only good alerts
    df_filt = df.withColumn(
        "toKeep",
        keep_alert_based_on(
            col("decoded.candidate.nbad"),
            col("decoded.candidate.rb"),
            col("decoded.candidate.magdiff")
        )
    ).filter("toKeep == true")

    # for good alerts, perform a cross-match with SIMBAD,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_filt.withColumn(
        "simbadType",
        cross_match_alerts_per_batch(
            col("decoded.objectId"),
            col("decoded.candidate.ra"),
            col("decoded.candidate.dec")
        )
    ).selectExpr(
        "decoded.*", "cast(timestamp as string) as timestamp", "simbadType")

    df_hbase = flattenstruct(df_type, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every `tinterval` seconds
    df_group = df_type.groupBy("simbadType").count()
    groupquery = df_group\
        .writeStream\
        .outputMode("complete")\
        .foreachBatch(write_to_csv)\
        .trigger(processingTime='{} seconds'.format(args.tinterval))\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        groupquery.stop()
        print("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
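# ---------------------------------------------------------------------------
# Standalone sketch of the quality-cut pattern above: a boolean column is
# produced by a UDF and then used to filter the DataFrame. The cuts below are
# toy values for illustration only; the real thresholds live in
# keep_alert_based_on.
def _example_quality_cut():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import BooleanType

    spark = SparkSession.builder.master('local[1]').getOrCreate()

    @udf(BooleanType())
    def keep_alert_toy(nbad, rb, magdiff):
        # keep alerts with no bad pixels, a decent real-bogus score,
        # and a small magnitude difference
        return (nbad == 0) and (rb >= 0.55) and (abs(magdiff) <= 0.1)

    df = spark.createDataFrame(
        [(0, 0.9, 0.05), (1, 0.9, 0.05), (0, 0.3, 0.0)],
        ['nbad', 'rb', 'magdiff']
    )
    return df.withColumn(
        'toKeep', keep_alert_toy(col('nbad'), col('rb'), col('magdiff'))
    ).filter('toKeep == true')
# ---------------------------------------------------------------------------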