Example No. 1
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute_{}".format(args.night),
                              shuffle_partitions=2)

    # The log level is controlled by the log_level argument
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Publish to Kafka (ensure that the topic exists on the Kafka server)
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

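    # NB: 'disquery' points to the query started in the last loop iteration,
    # so the exit_after branch below only stops that one explicitly.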
    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
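
For reference, concat_col is the helper that appends the current measurement to the historical ones taken from prv_candidates. A minimal sketch of that pattern in plain PySpark is shown below; it assumes a ZTF-like schema (a candidate struct and a prv_candidates array of structs), and the actual fink-broker implementation may differ:

from pyspark.sql import functions as F

def concat_col(df, colname, prefix='c'):
    """Sketch: build '<prefix><colname>' as historical values + current value."""
    return df.withColumn(
        prefix + colname,
        F.concat(
            F.col('prv_candidates.' + colname),     # array of historical values
            F.array(F.col('candidate.' + colname))  # current value, wrapped in an array
        )
    )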
Example No. 2
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The log level is controlled by the log_level argument
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100     # set a default
    t_end = 1577836799      # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while not exit_after or time.time() < t_end:
        # Keep scanning HBase for new records in a loop

        # Scan HBase up to the current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read HBase within the timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep records that haven't been distributed
        df = df.filter("status!='distributed'")

        # Send out slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra",
                "candidate_dec", "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user-defined XML)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # Create a nested DataFrame similar to the original ZTF DataFrame
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Publish to Kafka (ensure that the topic exists on the Kafka server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in HBase and commit the checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
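
The get_kafka_df call reshapes the DataFrame into the layout the Kafka sink expects, namely a single binary 'value' column (optionally with a 'key'). Below is a rough, hypothetical sketch of that step using Spark's built-in Avro support (Spark 3 with the external spark-avro package); the real fink-broker helper also uses the Avro schema passed as args.distribution_schema, which this sketch ignores:

from pyspark.sql import functions as F
from pyspark.sql.avro.functions import to_avro

def get_kafka_df(df, schema_path=''):
    """Sketch: pack every column into a struct and serialize it to Avro bytes."""
    # schema_path is ignored here; a real implementation would serialize
    # against the Avro schema stored at that path.
    return df.select(
        to_avro(F.struct([F.col(c) for c in df.columns])).alias('value')
    )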
Example No. 3
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The log level is controlled by the log_level argument
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(args.scitmpdatapath,
                                 args.scitmpdatapath + "/*",
                                 latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Publish to Kafka (ensure that the topic exists on the Kafka server)
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
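
Each entry of userfilters is a dotted path to a filter function, and apply_user_defined_filter resolves that path and keeps only the rows the filter selects. A hypothetical sketch of that dispatch follows; it assumes the filter returns a boolean Column when handed the whole DataFrame, whereas fink's real filters operate on individual alert columns:

import importlib

def apply_user_defined_filter(df, toapply):
    """Sketch: import 'package.module.function' dynamically and filter with it."""
    module_name, func_name = toapply.rsplit('.', 1)
    userfunc = getattr(importlib.import_module(module_name), func_name)
    mask = userfunc(df)  # assumed to return a boolean Column
    return df.filter(mask)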
Example No. 4
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution", shuffle_partitions=2, log_level="ERROR")

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100     # set a default
    t_end = 1577836799      # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
                        args.checkpointpath_dist, args.startingOffset_dist)

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while not exit_after or time.time() < t_end:
        # Keep scanning HBase for new records in a loop

        # Scan HBase up to the current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read HBase within the timestamp range
        df = spark.read\
                  .option("catalog", catalog)\
                  .option("minStamp", min_timestamp)\
                  .option("maxStamp", max_timestamp)\
                  .format("org.apache.spark.sql.execution.datasources.hbase")\
                  .load()

        # Filter out records that have been distributed
        df = df.filter("status!='distributed'")

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df, args.distribution_schema)

        # Publish to the Kafka topic (ensure that the topic exists on the Kafka server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", "localhost:9093")\
            .option("topic", "distribution_test")\
            .save()

        # Update the status column in HBase
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # Wait for some time before another loop
        time.sleep(1)
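
The catalog handed to the HBase data source (org.apache.spark.sql.execution.datasources.hbase, the shc connector) is a JSON document mapping DataFrame columns to HBase column families. A minimal illustration of what such a catalog could look like, with table and column names invented for the example:

import json

catalog = json.dumps({
    "table": {"namespace": "default", "name": "science_db_test"},  # hypothetical table name
    "rowkey": "key",
    "columns": {
        # row key column
        "objectId": {"cf": "rowkey", "col": "key", "type": "string"},
        # regular column in family 'i' (names here are illustrative only)
        "status": {"cf": "i", "col": "status", "type": "string"}
    }
})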