Example #1
# Standard imports used by this example. The fink_broker helpers
# (getargs, init_sparksession, get_fink_logger, inspect_application,
# connect_to_kafka, monitor_progress_webui) are assumed to be importable
# from the fink_broker package; their exact module paths are omitted here.
import argparse
import time


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, 2, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
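
This example leans on a getargs helper to populate the argparse parser. A minimal sketch of what such a helper could look like is given below, covering only the arguments actually referenced above (servers, topic, startingoffsets_stream, log_level, finkwebpath, exit_after); the option names, defaults and help strings are assumptions, and the real fink_broker helper may define more options.

def getargs(parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Hypothetical argument definitions covering the options used above."""
    parser.add_argument(
        '-servers', type=str, default='localhost:9092',
        help='Comma-separated list of Kafka bootstrap servers')
    parser.add_argument(
        '-topic', type=str, default='ztf-stream',
        help='Name of the Kafka topic to subscribe to')
    parser.add_argument(
        '-startingoffsets_stream', type=str, default='latest',
        help='Kafka starting offsets: latest or earliest')
    parser.add_argument(
        '-log_level', type=str, default='INFO',
        help='Logging level: DEBUG, INFO, WARN or ERROR')
    parser.add_argument(
        '-finkwebpath', type=str, default='web/data',
        help='Folder where the web UI CSV files are written')
    parser.add_argument(
        '-exit_after', type=int, default=None,
        help='Stop the service after this many seconds (run forever if unset)')
    return parser.parse_args()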
Example #2
# Standard and third-party imports used by this example. The fink_broker
# helpers (getargs, init_sparksession, get_fink_logger, inspect_application,
# connect_to_kafka, get_schemas_from_avro, from_avro, jd_to_datetime) are
# assumed to be importable from the fink_broker package.
import argparse
import io
import time

import fastavro
from pyspark.sql.functions import udf, date_format


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False,
                          kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro payload into a single `decoded` struct column
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # Use the custom from_avro decoder: the built-in Spark function
        # is not available in Spark 2.4.x (it only ships with Spark 3.0+)
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()
        # Nothing to decode for this source: stop here instead of falling
        # through to an undefined df_decoded
        return

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data by observation date (year/month/day)
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
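
The daily partitioning above hinges on jd_to_datetime, which turns the alert Julian date (candidate.jd) into a Spark timestamp. A minimal sketch of such a conversion is shown below, written as a plain Python UDF using the standard Julian date of the Unix epoch (2440587.5); the actual fink_broker implementation may differ (for instance, a vectorised pandas UDF).

from datetime import datetime, timezone

from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType

# Julian date of the Unix epoch, 1970-01-01T00:00:00 UTC
JD_UNIX_EPOCH = 2440587.5

@udf(returnType=TimestampType())
def jd_to_datetime(jd):
    """Convert a Julian date into a UTC datetime usable as a Spark timestamp."""
    if jd is None:
        return None
    unix_seconds = (jd - JD_UNIX_EPOCH) * 86400.0
    return datetime.fromtimestamp(unix_seconds, tz=timezone.utc)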
Example #3
# Standard imports used by this example; the fink_broker helpers
# (getargs, init_sparksession, get_fink_logger, inspect_application,
# connect_to_kafka, get_schemas_from_avro, from_avro, monitor_progress_webui)
# are assumed to be importable from the fink_broker package.
import argparse
import time

from pyspark.sql.functions import date_format


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, topic, data)
    df_decoded = df.select([
        "timestamp", "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath,
                           "history.csv", "history")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
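
All three examples hand the running query to monitor_progress_webui, which periodically samples streaming metrics and saves them as CSV for the web UI. A rough sketch of that kind of monitoring loop is shown below, built only on the public StreamingQuery.lastProgress attribute; the file layout, threading model and the unused mode argument are assumptions for illustration, not the actual fink_broker code.

import csv
import os
import threading
import time


def monitor_progress_webui(query, tinterval, colnames, outpath, filename, mode):
    """Hypothetical monitor: append selected streaming metrics to a CSV file.

    query is a pyspark StreamingQuery, and colnames are top-level keys of
    query.lastProgress such as inputRowsPerSecond, processedRowsPerSecond
    and timestamp. mode ("live" or "history") is kept only for signature
    parity with the calls above.
    """
    csvfile = os.path.join(outpath, filename)

    def poll():
        while query.isActive:
            progress = query.lastProgress
            if progress is not None:
                write_header = not os.path.exists(csvfile)
                with open(csvfile, 'a', newline='') as f:
                    writer = csv.writer(f)
                    if write_header:
                        writer.writerow(colnames)
                    writer.writerow([progress.get(name) for name in colnames])
            time.sleep(tinterval)

    # Run in the background so the caller can still block on awaitTermination()
    threading.Thread(target=poll, daemon=True).start()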