def process_valid_file(message: KafkaDStream, data_path: str, v_path: str, sensor_id: str, interval: int):
    """
    Filter a Kafka micro-batch down to well-formed records for one sensor
    and extract their file contents.

    :param message: Kafka DStream whose values (``r[1]``) are JSON-encoded
        metadata & filename records; keys (``r[0]``) are ignored.
    :param data_path: root path of the raw data files, passed through to the
        verification/extraction helpers.
    :param v_path: auxiliary path forwarded to ``extract_info`` —
        presumably a validation/output location; confirm against the helper.
    :param sensor_id: only records matching this sensor id are kept.
    :param interval: unused in this function; retained for interface
        compatibility with callers.
    """
    # Parse the JSON payload of each Kafka (key, value) pair.
    records = message.map(lambda r: json.loads(r[1]))
    # Drop records missing required fields, then keep only the target sensor.
    valid_records = records.filter(lambda rec: verify_fields(rec, data_path))
    valid_sensors = valid_records.filter(
        lambda rec: verify_sid(rec, sensor_id, data_path))
    print("====== Processing in extract_info ======")
    # extract_info used to return an RDD of
    # [identifier, owner, name, data_descriptor, start_time, end_time,
    # datapoints]; now it yields just the file within the window.
    results = valid_sensors.map(
        lambda rec: extract_info(rec, data_path, v_path))
    print("Result is: ")
    print(results.collect())
def kafka_to_db(message: KafkaDStream):
    """
    Persist every valid record of a Kafka micro-batch, then commit offsets.

    :param message: Kafka DStream whose values (``kv[1]``) are JSON strings.
    """
    parsed = message.map(lambda kv: json.loads(kv[1]))
    usable = parsed.filter(verify_fields)
    # Store each surviving record, then record the consumed offset ranges.
    usable.foreach(store_streams)
    storeOffsetRanges(message)
    print("Ready to process stream...")
def kafka_file_to_json_producer(message: KafkaDStream, data_path, config_filepath, CC):
    """
    Read convert gzip file data into json object and publish it on Kafka.

    :param message: Kafka DStream whose values (``kv[1]``) are JSON strings.
    :param data_path: root path of the raw data files.
    :param config_filepath: configuration file path forwarded to ``save_data``.
    :param CC: context object forwarded to ``store_offset_ranges``.
    """
    parsed = message.map(lambda kv: json.loads(kv[1]))
    usable = parsed.filter(lambda rec: verify_fields(rec, data_path))
    saved = usable.map(
        lambda rec: save_data(rec, data_path, config_filepath))
    # count() forces evaluation of the save pipeline for this micro-batch.
    print("File Iteration count:", saved.count())
    store_offset_ranges(message, CC)
def kafka_file_to_json_producer(message: KafkaDStream, data_path):
    """
    Read convert gzip file data into json object and publish it on Kafka.

    :param message: Kafka DStream whose values (``kv[1]``) are JSON strings.
    :param data_path: root path of the raw data files.
    """
    # NOTE(review): another definition with this exact name exists earlier in
    # this source; if both live in the same module the later one shadows the
    # earlier — confirm they belong to different files.
    parsed = message.map(lambda kv: json.loads(kv[1]))
    usable = parsed.filter(lambda rec: verify_fields(rec, data_path))
    stored = usable.map(
        lambda rec: file_processor(rec, data_path)).map(store_stream)
    storeOffsetRanges(message)
    # count() forces evaluation of the processing pipeline.
    print("File Iteration count:", stored.count())
def dStreamTokafkadStream(ssc, stream):
    """
    Re-wrap a generic DStream as a KafkaDStream.

    :param ssc: streaming context the wrapped stream belongs to.
    :param stream: DStream to re-wrap; its underlying JVM dstream and RDD
        deserializer are reused unchanged.
    :return: a ``KafkaDStream`` view over *stream*.
    """
    jdstream = stream._jdstream
    deserializer = stream._jrdd_deserializer
    return KafkaDStream(jdstream, ssc, deserializer)