Example #1
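All four examples below assume the same fink-broker helpers are in scope. A minimal import preamble, as a sketch: the module paths are assumptions based on fink-broker's layout and may differ between versions. Examples #3 and #4 additionally use jd_to_datetime and add_tracklet_information, whose locations vary; see the sketch after Example #3 for the former.

# Shared imports for the examples (module paths are assumptions)
import argparse
import time

from pyspark.sql import functions as F

from fink_broker import __version__ as fbvsn
from fink_science import __version__ as fsvsn

from fink_broker.parser import getargs
from fink_broker.sparkUtils import init_sparksession, connect_to_raw_database
from fink_broker.loggingUtils import get_fink_logger, inspect_application
from fink_broker.filters import apply_user_defined_filter
from fink_broker.science import apply_science_modules

# Path to the user-defined quality-cut filter (assumed location)
qualitycuts = 'fink_broker.filters.qualitycuts'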
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(args.rawdatapath,
                                 args.rawdatapath + "/*",
                                 latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # Re-create the partitioning columns: reading partitioned data back does
    # not preserve their type information (they come back cast as int, ...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
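While the query runs, the StreamingQuery handle returned by start() exposes status and progress information that can be routed to the same logger. A minimal monitoring sketch using standard Structured Streaming attributes, with countquery and logger as defined in the example above:

import json

# Log the query status and the metrics of the last completed micro-batch
if countquery.isActive:
    logger.info(countquery.status)
    if countquery.lastProgress is not None:
        logger.info(json.dumps(countquery.lastProgress, indent=2))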
Example #2
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night),
        shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    logger.info('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # Base path for the reprocessed science database
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # Re-create the partitioning columns: reading partitioned data back does
    # not preserve their type information (they come back cast as int, ...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
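Loading a single leaf directory such as ztf_alerts/raw/year=.../month=.../day=... drops the year/month/day partition columns from the schema, since they live only in the directory names; that is why they are re-created before writing. A sketch of the alternative, keeping them at read time with Spark's basePath option and the input_raw path defined above:

# Alternative sketch: point Spark at the dataset root so the partition
# columns are inferred from the directory layout at read time.
df = spark.read.format('parquet')\
    .option('basePath', 'ztf_alerts/raw')\
    .load(input_raw)
# df now carries year, month and day columns (as integers)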
Example #3
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    input_path = rawdatapath + "/year={}/month={}/day={}".format(
        args.night[0:4], args.night[4:6], args.night[6:8])
    df = connect_to_raw_database(input_path, input_path, latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
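Example #3 rebuilds the timestamp column from the Julian date with jd_to_datetime. fink-broker ships its own implementation; purely for illustration, a hypothetical equivalent written as a pandas UDF, with astropy handling the conversion:

# Hypothetical jd_to_datetime sketch (fink-broker provides the real one;
# this only illustrates the Julian-date-to-timestamp conversion).
import pandas as pd
from astropy.time import Time
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType())
def jd_to_datetime(jd: pd.Series) -> pd.Series:
    # Convert Julian dates to UTC datetimes, vectorised over the series
    return pd.Series(pd.to_datetime(Time(jd.values, format='jd').to_datetime()))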
Example #4
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    logger.info('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # Base path for the science database
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    # Keep the input partition count to size the output files (coalesce below)
    npart = df.rdd.getNumPartitions()

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # Join the tracklet information back to the initial dataframe.
    # Note: Spark ignores broadcast hints for full outer joins, so the
    # hint is effectively a no-op here.
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    df.coalesce(npart).write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
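npart is captured before the science modules run, so the coalesce keeps the output file count close to the input's. A quick sanity-check sketch after the batch write, reading the table back and counting the alerts for the processed night (paths and names as defined in the example above; the partition columns come back as integers, per the comment earlier):

# Read the science database back and count alerts for this night
check = spark.read.format('parquet').load(output_science)
nalerts = check\
    .filter(check['year'] == int(year))\
    .filter(check['month'] == int(month))\
    .filter(check['day'] == int(day))\
    .count()
logger.info('{} alerts written for {}/{}/{}'.format(nalerts, year, month, day))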