def process_valid_file(message: KafkaDStream, data_path: str, v_path: str,
                       sensor_id: str, interval: int):
    """
    Read convert gzip file data into json object and publish it on Kafka
    :param message:
    """

    # print("====== Processing in process_valid_file ======")
    records = message.map(lambda r: json.loads(r[1]))  # metadata & filename
    # print(records.collect())
    valid_records = records.filter(lambda rdd: verify_fields(rdd, data_path))
    # print("File Iteration count-valid_records:", valid_records.count())

    # print("====== Processing in verify_sid ======")
    valid_sensors = valid_records.filter(
        lambda rdd: verify_sid(rdd, sensor_id, data_path))
    # print("File Iteration count-valid_sensors:", valid_sensors.count())
    # print(valid_sensors.collect())

    print("====== Processing in extract_info ======")
    results = valid_sensors.map(
        lambda rdd: extract_info(rdd, data_path, v_path))
    # used to return an RDD of [identifier, owner, name, data_descriptor,
    # start_time, end_time, datapoints]; now it returns just the file within the window
    print("Result is: ")
    print(results.collect())
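Although the parameter is annotated as a KafkaDStream, the collect() and count() calls in these handlers only exist on RDDs, so they appear to be invoked once per micro-batch via foreachRDD. A minimal wiring sketch under that assumption; the topic name, broker address, paths, and sensor id are placeholders, and the older spark-streaming-kafka-0-8 Python API is assumed:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="file-preprocessor")
ssc = StreamingContext(sc, 1)  # 1-second micro-batches

# Direct Kafka stream; topic and broker are placeholder values.
kafka_stream = KafkaUtils.createDirectStream(
    ssc, ["filequeue"], {"metadata.broker.list": "localhost:9092"})

# Hand each micro-batch RDD to the handler above; the paths and sensor id
# are placeholders, not values from the original code.
kafka_stream.foreachRDD(
    lambda rdd: process_valid_file(rdd, "/data/raw/", "/data/valid/",
                                   "sensor-01", 1))

ssc.start()
ssc.awaitTermination()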
Example #2
def kafka_to_db(message: KafkaDStream):
    """

    :param message:
    """
    records = message.map(lambda r: json.loads(r[1]))
    valid_records = records.filter(verify_fields)

    valid_records.foreach(store_streams)

    storeOffsetRanges(message)

    print("Ready to process stream...")
def kafka_file_to_json_producer(message: KafkaDStream, data_path,
                                config_filepath, CC):
    """
    Read convert gzip file data into json object and publish it on Kafka
    :param message:
    """

    records = message.map(lambda r: json.loads(r[1]))
    valid_records = records.filter(lambda rdd: verify_fields(rdd, data_path))
    results = valid_records.map(
        lambda msg: save_data(msg, data_path, config_filepath))
    print("File Iteration count:", results.count())
    store_offset_ranges(message, CC)
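verify_fields is also defined elsewhere; judging from its call sites, it checks that a message carries the expected metadata fields and that the referenced file exists under data_path. A minimal sketch under that assumption (the field names are guesses, not taken from the project):

import os

def verify_fields(msg: dict, data_path: str) -> bool:
    """Hypothetical validator: required metadata keys are present and the file exists."""
    required_keys = ("metadata", "filename")  # assumed field names, not confirmed here
    if not isinstance(msg, dict) or not all(k in msg for k in required_keys):
        return False
    return os.path.isfile(os.path.join(data_path, msg["filename"]))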
def kafka_file_to_json_producer(message: KafkaDStream, data_path):
    """
    Read convert gzip file data into json object and publish it on Kafka
    :param message:
    """
    records = message.map(lambda r: json.loads(r[1]))
    valid_records = records.filter(lambda rdd: verify_fields(rdd, data_path))
    results = valid_records.map(
        lambda rdd: file_processor(rdd, data_path)).map(store_stream)

    storeOffsetRanges(message)

    print("File Iteration count:", results.count())
Example #5
def dStreamTokafkadStream(ssc, stream):
    """Wrap a plain DStream in a KafkaDStream, reusing its Java DStream and deserializer."""
    return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
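One plausible use of this helper is to restore the KafkaDStream type for a stream produced by KafkaUtils.createDirectStream or by transformations that return a plain DStream. A short usage sketch, reusing the StreamingContext from the earlier example (topic and broker remain placeholders):

# Re-wrap the plain DStream so downstream code can keep the KafkaDStream type hint.
plain_stream = KafkaUtils.createDirectStream(
    ssc, ["filequeue"], {"metadata.broker.list": "localhost:9092"})
kafka_dstream = dStreamTokafkadStream(ssc, plain_stream)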