Example #1
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(name="readingScienceDB", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    with open(args.science_db_catalog) as f:
        catalog = json.load(f)

    catalog_dic = json.loads(catalog)

    df = spark.read.option("catalog", catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .load()

    print("Number of entries in {}: ".format(catalog_dic["table"]["name"]),
          df.count())

    print(
        "Number of distinct objects in {}: ".format(
            catalog_dic["table"]["name"]),
        df.select('objectId').distinct().count())
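
For context, the file pointed to by args.science_db_catalog holds the HBase catalog used by the shc connector, stored as a JSON string; the keys accessed above (catalog_dic["table"]["name"]) come from that structure. Below is a minimal, purely illustrative sketch of such a catalog, with hypothetical namespace, table and column names that are not taken from the Fink configuration.

import json

# Hypothetical shc catalog, for illustration only: the namespace, table name
# and columns are made up, but the layout (table / rowkey / columns, with each
# column mapped to a column family 'cf') follows the shc catalog format.
example_catalog = json.dumps({
    "table": {"namespace": "default", "name": "test_science_db"},
    "rowkey": "objectId",
    "columns": {
        "objectId": {"cf": "rowkey", "col": "objectId", "type": "string"},
        "ra": {"cf": "d", "col": "ra", "type": "double"},
        "dec": {"cf": "d", "col": "dec", "type": "double"}
    }
})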
Example #2
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(args.rawdatapath,
                                 args.rawdatapath + "/*",
                                 latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
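
The qualitycuts filter used above is imported from the Fink codebase and not shown in this snippet. Purely as a sketch, an equivalent set of cuts written directly with DataFrame filters could look like the following; the thresholds mirror the explicit cuts applied in Example #7 further down, and the function name is ours.

from pyspark.sql import DataFrame

def apply_quality_cuts_sketch(df: DataFrame) -> DataFrame:
    # Keep alerts with no bad pixels and a real-bogus score of at least 0.55
    return df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)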
Example #3
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night),
        shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
Example #4
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="save_schema_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    df = load_parquet_files(input_science)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    df_kafka = df.selectExpr(cnames)

    path_for_avro = 'new_schema_{}.avro'.format(time())
    df_kafka.limit(1).write.format("avro").save(path_for_avro)

    # retrieve data on local disk
    subprocess.run(["hdfs", "dfs", '-get', path_for_avro])

    # Read the avro schema from .avro file
    avro_file = glob.glob(path_for_avro + "/part*")[0]
    avro_schema = readschemafromavrofile(avro_file)

    # Write the schema to a file for decoding Kafka messages
    with open('schemas/{}'.format(path_for_avro.replace('.avro', '.avsc')), 'w') as f:
        json.dump(avro_schema, f, indent=2)
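
The .avsc file written above is intended for decoding Kafka messages later. As an assumed usage sketch (the helper name is hypothetical, and from_avro requires the spark-avro package to be available), the saved schema could be reloaded and applied to the Avro-encoded value column of a Kafka DataFrame:

from pyspark.sql import DataFrame
from pyspark.sql.avro.functions import from_avro

def decode_with_saved_schema(df_kafka: DataFrame, avsc_path: str) -> DataFrame:
    # Read back the schema saved above and decode the Kafka 'value' column
    with open(avsc_path) as f:
        schema_json = f.read()
    return df_kafka.select(from_avro("value", schema_json).alias("decoded"))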
Example #5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribution_test", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Topic to read from
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", broker_list) \
        .option("kafka.security.protocol", "SASL_PLAINTEXT")\
        .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print received stream to the console
    df = df.select("struct.*")

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
        .format("console")\
        .trigger(processingTime='2 seconds')\
        .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
        logger.info("Exiting distribution_test service normally...")
    else:
        debug_query.awaitTermination()
Example #6
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, 2, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
Example #7
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
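
jd_to_datetime used above comes from the Fink codebase. Purely as an illustration of what it computes (an assumed implementation, not the actual one), a pandas UDF converting Julian Date to a UTC timestamp through the Unix epoch (JD 2440587.5) could be written as:

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType(), PandasUDFType.SCALAR)
def jd_to_datetime_sketch(jd):
    # The Unix epoch (1970-01-01T00:00:00 UTC) corresponds to JD 2440587.5
    return pd.to_datetime((jd - 2440587.5) * 86400.0, unit='s')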
Example #8
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch',
        'roid',
        'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(df_ex['magpsf'].isNull())
        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=float) * u.degree,
                dec=np.array(dec, dtype=float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=float) * u.degree,
                dec=np.array(dec2, dtype=float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
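
ang2pix above is a UDF provided by the Fink codebase to build the HEALPix index column. A minimal sketch of an equivalent pandas UDF based on healpy is given below; this is an assumption about the behaviour (ring scheme, coordinates in degrees), not the actual implementation.

import healpy as hp
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StringType

@pandas_udf(StringType(), PandasUDFType.SCALAR)
def ang2pix_sketch(ra, dec, nside):
    # nside is passed as a constant column (lit), hence taking the first value
    pix = hp.ang2pix(int(nside.values[0]), ra.values, dec.values, lonlat=True)
    return pd.Series(pix.astype(str))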
Example #9
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create a new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(args.rawdatapath, args.rawdatapath + "/*",
                                 latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(df: DataFrame, epochid: int,
                                   hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data

        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete") \
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='30 seconds')\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
Example #10
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    npart = df.rdd.getNumPartitions()

    # Apply level one filters
    logger.info(qualitycuts)
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # join back information to the initial dataframe
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    df.coalesce(npart).write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
Example #11
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="TNS_report_{}".format(args.night),
                              shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(args.agg_data_prefix,
                                                       args.night[:4],
                                                       args.night[4:6],
                                                       args.night[6:8])
    df = load_parquet_files(path)

    with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
        tns_marker = f.read().replace('\n', '')

    if not args.tns_sandbox:
        print("WARNING: submitting to real (not sandbox) TNS website")

    if args.tns_sandbox:
        url_tns_api = "https://sandbox.wis-tns.org/api"
        with open('{}/sandbox-tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')
    else:
        url_tns_api = "https://www.wis-tns.org/api"
        with open('{}/tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')

    cols = [
        'cdsxmatch', 'roid', 'mulens', 'snn_snia_vs_nonia', 'snn_sn_vs_all',
        'rf_snia_vs_nonia', 'candidate.ndethist', 'candidate.drb',
        'candidate.classtar', 'candidate.jd', 'candidate.jdstarthist',
        'rf_kn_vs_nonkn', 'tracklet'
    ]
    df = df.withColumn('class', extract_fink_classification(*cols))

    pdf = df\
        .filter(df['class'] == 'Early SN candidate')\
        .filter(df['candidate.ndethist'] <= 20)\
        .toPandas()

    pdf_unique = pdf.groupby('objectId')[pdf.columns].min()
    print("{} new alerts".format(len(pdf)))
    print("{} new sources".format(len(pdf_unique)))
    pdf = pdf_unique

    ids = []
    report = {"at_report": {}}
    check_tns = False
    for index, row in enumerate(pdf.iterrows()):
        alert = row[1]
        past_ids = read_past_ids(args.tns_folder)
        if alert['objectId'] in past_ids.values:
            print('{} already sent!'.format(alert['objectId']))
            continue
        if check_tns:
            groupid = retrieve_groupid(key, tns_marker, alert['objectId'])
            if groupid > 0:
                print("{} already reported by {}".format(
                    alert['objectId'], groupid))
            else:
                print('New report for object {}'.format(alert['objectId']))
        photometry, non_detection = extract_discovery_photometry(alert)
        report['at_report']["{}".format(index)] = build_report(
            alert, photometry, non_detection)
        ids.append(alert['objectId'])
    print('new objects: ', ids)

    if len(ids) != 0:
        json_report = save_logs_and_return_json_report(name='{}{}{}'.format(
            args.night[:4], args.night[4:6], args.night[6:8]),
                                                       folder=args.tns_folder,
                                                       ids=ids,
                                                       report=report)
        r = send_json_report(key, url_tns_api, json_report, tns_marker)
        print(r.json())

        # post to slack
        slacktxt = ' \n '.join(
            ['https://fink-portal/{}'.format(i) for i in ids])
        slacktxt = '{} \n '.format(args.night) + slacktxt
        r = requests.post(os.environ['TNSWEBHOOK'],
                          json={
                              'text': slacktxt,
                              "username": "******"
                          },
                          headers={'Content-Type': 'application/json'})
        print(r.status_code)
    else:
        slacktxt = '{} \n No new sources'.format(args.night)
        r = requests.post(os.environ['TNSWEBHOOK'],
                          json={
                              'text': slacktxt,
                              "username": "******"
                          },
                          headers={'Content-Type': 'application/json'})
Example #12
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(args.scitmpdatapath,
                                 args.scitmpdatapath + "/*",
                                 latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
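
get_kafka_df packs the alert DataFrame into the single Avro-encoded value column expected by the Kafka sink. A rough sketch of the same idea, assumed rather than taken from the Fink helper, using Spark's built-in to_avro (spark-avro package required):

from pyspark.sql import DataFrame
from pyspark.sql.functions import struct
from pyspark.sql.avro.functions import to_avro

def get_kafka_df_sketch(df: DataFrame) -> DataFrame:
    # Wrap all columns into a single struct and serialize it as 'value'
    return df.select(to_avro(struct(*df.columns)).alias("value"))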
Example #13
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/year={}/month={}/day={}'.format(args.rawdatapath, year,
                                                    month, day)
    input_science = '{}/year={}/month={}/day={}'.format(
        args.scitmpdatapath, year, month, day)

    # basepath
    output_raw = 'ztf_alerts/raw'
    output_science = 'ztf_alerts/science'

    print('Raw data processing....')
    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_science))

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_science))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)

    # Remove temporary alert folder - beware you'll never get it back!
    if args.fs == 'hdfs':
        subprocess.run(["hdfs", "dfs", '-rm', '-rf', args.datapath])
    elif args.fs == 'local':
        subprocess.run(['rm', '-rf', args.datapath])
    else:
        print('Filesystem not understood. FS_KIND must be hdfs or local.')
Example #14
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100     # set a default
    t_end = 1577836799      # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while not exit_after or time.time() < t_end:
        # Keep scanning HBase for new records in a loop
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep records that haven't been distributed
        df = df.filter("status!='distributed'")

        # Send out slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra",
                "candidate_dec", "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user defined xml)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # create a nested dataframe similar to the original ztf dataframe
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Ensure that the topic(s) exist on the Kafka Server
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in Hbase and commit checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
Example #15
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # matches with SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host-galaxy
    list_simbad_galaxies = [
        "galaxy",
        "Galaxy",
        "EmG",
        "Seyfert",
        "Seyfert_1",
        "Seyfert_2",
        "BlueCompG",
        "StarburstG",
        "LSB_G",
        "HII_G",
        "High_z_G",
        "GinPair",
        "GinGroup",
        "BClG",
        "GinCl",
        "PartofG",
    ]

    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp

    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw',
        'sci',
        'night',
        'n_g',
        'n_r',
        'exposures',
        'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
Example #16
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import slack
from fink_broker.tester import spark_unit_tests
from pyspark.sql import DataFrame
from fink_broker.loggingUtils import get_fink_logger

logger = get_fink_logger(__name__, "INFO")


class FinkSlackClient:
    def __init__(self, api_token):
        self._client = slack.WebClient(token=api_token)

        try:
            self._client.auth_test()
        except Exception:
            logger.error("Authentication Error: Invalid Token")

        # create a dict of {channelName: ID}
        channels = self._client.channels_list()['channels']
        self._channel_ids = {
            x['name']: x['id'] for x in channels
        }
Example #17
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False,
                          kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()
        # Without a decoder, df_decoded is undefined below - stop here
        return

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
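
For reference, decoding a single Avro-serialized alert outside Spark follows the same pattern as the fastavro lambda above. A small standalone sketch (the function name is ours):

import io
import fastavro

def decode_avro_alert(payload: bytes) -> dict:
    # Return the first record contained in an Avro-encoded alert payload
    return next(fastavro.reader(io.BytesIO(payload)))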
Example #18
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp", "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "history.csv", "history")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
Example #19
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="science_archival_{}".format(args.night),
                              shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(args.agg_data_prefix,
                                                       args.night[:4],
                                                       args.night[4:6],
                                                       args.night[6:8])
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the hbase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, args.science_db_name, rowkeyname=row_key_name, cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - inplace replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)

        df_schema = construct_schema_row(df,
                                         rowkeyname=schema_row_key_name,
                                         version='schema_{}_{}'.format(
                                             fbvsn, fsvsn))

        # construct the hbase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
Example #20
def apply_user_defined_processors(df: DataFrame, processor_names: list):
    """Apply iteratively user processors to give added values to the stream.

    Each processor will add one new column to the input DataFrame. The name
    of the column will be the name of the processor routine.

    Parameters
    ----------
    df: DataFrame
        Spark DataFrame with alert data
    processor_names: list of string
        List containing processor names to be applied. These processors should
        come from the fink-science module (see example below).

    Returns
    -------
    df: DataFrame
        Spark DataFrame with new columns added.

    Examples
    --------
    >>> from pyspark.sql.functions import struct
    >>> df = spark.sparkContext.parallelize(zip(
    ...   [26.8566983, 26.24497],
    ...   [-26.9677112, -26.7569436],
    ...   ["1", "2"])).toDF(["ra", "dec", "objectId"])

    # Nest the DataFrame as for alerts
    >>> df = df.select(struct(df.columns).alias("candidate"))\
    ...     .select(struct("candidate").alias("decoded"))

    # Perform cross-match
    >>> processors = ['fink_science.xmatch.processor.cdsxmatch']
    >>> df = apply_user_defined_processors(df, processors)
    >>> new_colnames = ["decoded.candidate.*", "cdsxmatch"]
    >>> df = df.select(new_colnames)
    >>> df.show() # doctest: +NORMALIZE_WHITESPACE
    +----------+-----------+--------+---------+
    |        ra|        dec|objectId|cdsxmatch|
    +----------+-----------+--------+---------+
    |26.8566983|-26.9677112|       1|     Star|
    |  26.24497|-26.7569436|       2|  Unknown|
    +----------+-----------+--------+---------+
    <BLANKLINE>

    """
    logger = get_fink_logger(__name__, "INFO")

    flatten_schema = return_flatten_names(df, pref="", flatten_schema=[])

    # Loop over user-defined processors
    for processor_func_name in processor_names:

        # Load the processor
        proc_name = processor_func_name.split('.')[-1]
        module_name = processor_func_name.split('.' + proc_name)[0]
        module = importlib.import_module(module_name)
        processor_func = getattr(module, proc_name, None)

        # Note: to access the input arguments, we need processor_func.func and
        # not just processor_func, because the processor is wrapped by a decorator.
        ninput = processor_func.func.__code__.co_argcount

        # Note: This works only with `struct` fields - not `array`
        argnames = processor_func.func.__code__.co_varnames[:ninput]
        colnames = []
        for argname in argnames:
            colname = [
                col(i) for i in flatten_schema
                if i.endswith("{}".format(argname))
            ]
            if len(colname) == 0:
                raise AssertionError("""
                    Column name {} is not a valid column of the DataFrame.
                    """.format(argname))
            colnames.append(colname[0])

        df = df.withColumn(processor_func.__name__, processor_func(*colnames))

        logger.info("new processor registered: {} from {}".format(
            proc_name, module_name))

    return df
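
The dynamic import and argument introspection performed in the loop above can be isolated into a small standalone helper. This is only a sketch of the pattern; the processor path in the usage comment is an assumption taken from the docstring example.

import importlib

def load_udf_and_argnames(dotted_path: str):
    """Resolve 'package.module.routine' into the decorated UDF and the
    argument names of the underlying (wrapped) Python function."""
    func_name = dotted_path.split('.')[-1]
    module_name = dotted_path.split('.' + func_name)[0]
    module = importlib.import_module(module_name)
    udf = getattr(module, func_name)

    # The decorated object keeps the original function under `.func`
    ninput = udf.func.__code__.co_argcount
    argnames = udf.func.__code__.co_varnames[:ninput]
    return udf, argnames

# Hypothetical usage:
# udf, argnames = load_udf_and_argnames('fink_science.xmatch.processor.cdsxmatch')
# argnames can then be matched against the flattened schema, as done above.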
Example #21
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)

    # basepath
    output_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    output_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    print('Raw data processing....')
    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    npart_after = int(numPart(df_science))
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', npart_after)

    # Add tracklet information before merging
    df_trck = add_tracklet_information(df_science)

    # join back information to the initial dataframe
    df_science = df_science\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(npart_after)\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
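
The merge job derives its partitioning timestamp from the alert Julian date through a jd_to_datetime helper. One possible implementation is sketched below as a Spark 3 style pandas UDF; it is not necessarily the fink-broker one, but it illustrates the conversion (JD 2440587.5 corresponds to the Unix epoch).

import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType())
def jd_to_datetime_sketch(jd: pd.Series) -> pd.Series:
    """Convert Julian dates into UTC timestamps.

    JD 2440587.5 is 1970-01-01T00:00:00 UTC, so the conversion is a
    simple offset followed by a scaling from days to seconds.
    """
    return pd.to_datetime((jd - 2440587.5) * 86400.0, unit='s')

# Hypothetical usage, mirroring the call above:
# df = df.withColumn('timestamp', jd_to_datetime_sketch(df['candidate.jd']))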
Example #22
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(args.rawdatapath,
                                 args.rawdatapath + "/*",
                                 latestfirst=False)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply level one processor: cdsxmatch
    logger.info("New processor: cdsxmatch")
    colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    df = df.withColumn(cdsxmatch.__name__, cdsxmatch(*colnames))

    # Apply level one processor: rfscore
    logger.info("New processor: rfscore")
    # Required alert columns
    what = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos'
    ]

    # Prefix used to create temporary column names
    prefix = 'c'
    what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    for colname in what:
        df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification.
    # Note we can omit the model_path argument, and in that case the
    # default model `data/models/default-model.obj` will be used.
    rfscore_args = [F.col(i) for i in what_prefix]
    df = df.withColumn(rfscore.__name__, rfscore(*rfscore_args))

    # Apply level one processor: microlensing
    logger.info("New processor: microlensing")

    # Retrieve schema
    schema = load_mulens_schema_twobands()

    # Create standard UDF
    mulens_udf = F.udf(mulens, schema)

    # Required alert columns - already computed for SN
    what_prefix_mulens = [
        'cfid', 'cmagpsf', 'csigmapsf', 'cmagnr', 'csigmagnr', 'cmagzpsci',
        'cisdiffpos'
    ]

    mulens_args = [F.col(i) for i in what_prefix_mulens]
    df = df.withColumn('mulens', mulens_udf(*mulens_args))

    # Drop temp columns
    df = df.drop(*what_prefix)

    # Partition the data hourly
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .withColumn("hour", F.date_format("timestamp", "HH"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day", "hour") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
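
The rfscore and microlensing processors above consume 'c'-prefixed columns produced by concat_col, which concatenates the history from prv_candidates with the current candidate value. A simplified sketch of such a helper is given below, assuming both fields expose the same sub-fields as in ZTF alerts; the real concat_col may differ in details.

from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def concat_col_sketch(df: DataFrame, colname: str, prefix: str = 'c') -> DataFrame:
    """Add a column `<prefix><colname>` holding the historical measurements
    (prv_candidates.<colname>, an array) followed by the current value
    (candidate.<colname>, a scalar wrapped into a 1-element array)."""
    return df.withColumn(
        prefix + colname,
        F.concat(
            F.col('prv_candidates.' + colname),
            F.array(F.col('candidate.' + colname)),
        ),
    )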
Example #23
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute_{}".format(args.night),
                              shuffle_partitions=2)

    # Logger to print useful debug statements (level set from args.log_level)
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
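
Before reaching the Kafka sink, get_kafka_df has to pack each alert into the single binary 'value' column expected by the kafka format. A minimal way to do this with Spark's built-in Avro support is sketched below; it assumes the spark-avro package is available and may differ from the actual get_kafka_df.

from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.avro.functions import to_avro

def get_kafka_df_sketch(df: DataFrame, key: str = '') -> DataFrame:
    """Serialize all columns into one Avro-encoded 'value' column, plus a
    constant 'key' column, as required by the Spark Kafka sink."""
    return df.select(
        F.lit(key).alias('key'),
        to_avro(F.struct([F.col(c) for c in df.columns])).alias('value'),
    )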
Example #24
def apply_user_defined_filter(df: DataFrame, toapply: str) -> DataFrame:
    """Apply a user filter to keep only wanted alerts.

    Parameters
    ----------
    df: DataFrame
        Spark DataFrame with alert data
    toapply: string
        Filter name to be applied. It should be in the form
        module.module.routine (see example below).

    Returns
    -------
    df: DataFrame
        Spark DataFrame with filtered alert data

    Examples
    --------
    >>> from pyspark.sql.functions import struct
    >>> colnames = ["cdsxmatch", "rb", "magdiff"]
    >>> df = spark.sparkContext.parallelize(zip(
    ...   ['RRLyr', 'Unknown', 'Star', 'SN1a'],
    ...   [0.01, 0.02, 0.6, 0.01],
    ...   [0.02, 0.05, 0.1, 0.01])).toDF(colnames)
    >>> df.show() # doctest: +NORMALIZE_WHITESPACE
    +---------+----+-------+
    |cdsxmatch|  rb|magdiff|
    +---------+----+-------+
    |    RRLyr|0.01|   0.02|
    |  Unknown|0.02|   0.05|
    |     Star| 0.6|    0.1|
    |     SN1a|0.01|   0.01|
    +---------+----+-------+
    <BLANKLINE>


    # Nest the DataFrame as for alerts
    >>> df = df.select(struct(df.columns).alias("candidate"))\
    ...     .select(struct("candidate").alias("decoded"))

    # Apply a science filter, e.g. keep only RRLyr candidates
    >>> toapply = 'fink_filters.filter_rrlyr.filter.rrlyr'
    >>> df = apply_user_defined_filter(df, toapply)
    >>> df.select("decoded.candidate.*").show() # doctest: +NORMALIZE_WHITESPACE
    +---------+----+-------+
    |cdsxmatch|  rb|magdiff|
    +---------+----+-------+
    |    RRLyr|0.01|   0.02|
    +---------+----+-------+
    <BLANKLINE>

    # Using a wrong filter name will lead to an error
    >>> df = apply_user_defined_filter(
    ...   df, "unknownfunc") # doctest: +SKIP
    """
    logger = get_fink_logger(__name__, "INFO")

    flatten_schema = return_flatten_names(df, pref="", flatten_schema=[])

    # Load the filter
    filter_name = toapply.split('.')[-1]
    module_name = toapply.split('.' + filter_name)[0]
    module = importlib.import_module(module_name)
    filter_func = getattr(module, filter_name, None)

    # Note: to access the input arguments, we need filter_func.func and not just
    # filter_func, because the filter is wrapped by a decorator.
    ninput = filter_func.func.__code__.co_argcount

    # Note: This works only with `struct` fields - not `array`
    argnames = filter_func.func.__code__.co_varnames[:ninput]
    colnames = []
    for argname in argnames:
        colname = [
            col(i) for i in flatten_schema if i.endswith("{}".format(argname))
        ]
        if len(colname) == 0:
            raise AssertionError("""
                Column name {} is not a valid column of the DataFrame.
                """.format(argname))
        colnames.append(colname[0])

    logger.info("new filter/topic registered: {} from {}".format(
        filter_name, module_name))

    return df\
        .withColumn("toKeep", filter_func(*colnames))\
        .filter("toKeep == true")\
        .drop("toKeep")