def test_one_one():
    ts = time.time()
    # create an instance of the StreamBuffer class
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=sys.maxsize, left="r",
                                 buffer_results=True, verbose=True)

    # create Queues to store the input streams
    events_r = list()
    events_s = list()

    # Fill the input streams with randomized Records
    N = 100
    random.seed(0)
    event_order = ["r", "s"] * int(N / 2)
    start_time = 1600000000
    for i in range(len(event_order)):
        if event_order[i] == "r":
            events_r.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))
        elif event_order[i] == "s":
            events_s.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))

    ingestion_order = ["r", "s"] * int(N / 2)
    n_r = n_s = 0
    for i in range(N):
        # decide based on the ingestion order which stream's record is forwarded
        if ingestion_order[i] == "r":
            # receive the next record from stream r
            stream_buffer.ingest_left(events_r[n_r])  # instant emit
            n_r += 1
        elif ingestion_order[i] == "s":
            # receive the next record from stream s
            stream_buffer.ingest_right(events_s[n_s])
            n_s += 1

    # print("\nRecords in buffer r:")
    # for rec in stream_buffer.buffer_left:
    #     print(rec)
    # print("Records in buffer s:")
    # for rec in stream_buffer.buffer_right:
    #     print(rec)
    # print("Merged records in buffer t:")
    events_t = stream_buffer.fetch_results()
    # for rec in events_t:
    #     print(rec)

    print(f"Join time-series with |r| = {n_r}, |s| = {n_s}.")
    print(f"joined {len(events_t)} tuples in {time.time() - ts} s.")
    assert len(events_t) == 99
def test_randomized_many():
    # create an instance of the StreamBuffer class
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=sys.maxsize, left="r",
                                 buffer_results=True, verbose=False)

    # Test Settings:
    # Create Queues to store the input streams
    events_r = list()
    events_s = list()

    # Fill the input streams with randomized Records
    n_r = n_s = 10_000
    random.seed(0)
    start_time = 1600000000
    phenomenon_time = start_time
    for i in range(n_r):
        phenomenon_time += random.random()
        events_r.append(Record(timestamp=phenomenon_time, quantity="r", result=random.random()))
    phenomenon_time = start_time
    for i in range(n_s):
        phenomenon_time += random.random()
        events_s.append(Record(timestamp=phenomenon_time, quantity="s", result=random.random()))

    ingestion_order = ["r"] * n_r + ["s"] * n_s
    random.shuffle(ingestion_order)

    n_r = n_s = 0
    ts = time.time()
    for quantity in ingestion_order:
        # decide based on the ingestion order which stream's record is forwarded
        if quantity == "r":
            # receive the next record from stream r
            stream_buffer.ingest_left(events_r[n_r])  # instant emit
            n_r += 1
        elif quantity == "s":
            # receive the next record from stream s
            stream_buffer.ingest_right(events_s[n_s])
            n_s += 1

    events_t = stream_buffer.fetch_results()
    stop_time = time.time()

    print(f"Join time-series with |r| = {n_r}, |s| = {n_s}.")
    print(f"joined {len(events_t)} tuples in {time.time() - ts} s.")
    print(f"that are {int(len(events_t) / (time.time() - ts))} joins per second.")
    assert len(events_t) == 23041
    assert stop_time - ts < 2  # we got around 0.4 s
def test_delayed_many():
    imbalance = 100  # additional latency of stream s

    # create an instance of the StreamBuffer class
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=sys.maxsize, left="r",
                                 buffer_results=True, verbose=False)

    # Test Settings:
    # Create Queues to store the input streams
    events_r = list()
    events_s = list()

    # Fill the input streams with randomized Records
    N = 10_000
    random.seed(0)
    event_order = (["r"] * 5 + ["s"] * 5) * int(N / 10)
    start_time = 1600000000
    for i in range(len(event_order)):
        if event_order[i] == "r":
            events_r.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))
        elif event_order[i] == "s":
            events_s.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))

    ingestion_order = ["r"] * imbalance + (["r"] * 5 + ["s"] * 5) * int(N / 10)

    n_r = 0
    n_s = 0
    ts = time.time()
    while n_r < len(events_r) and n_s < len(events_s):
        # decide based on the ingestion order which stream's record is forwarded
        if ingestion_order[n_r + n_s] == "r":
            # receive the next record from stream r
            stream_buffer.ingest_left(events_r[n_r])  # instant emit
            n_r += 1
        elif ingestion_order[n_r + n_s] == "s":
            # receive the next record from stream s
            stream_buffer.ingest_right(events_s[n_s])
            n_s += 1

    events_t = stream_buffer.fetch_results()

    print(f"Join time-series with |r| = {n_r}, |s| = {n_s}.")
    print(f"joined {len(events_t)} tuples in {time.time() - ts} s.")
    print(f"that are {int(len(events_t) / (time.time() - ts))} joins per second.")
    assert len(events_t) == 13702
    assert time.time() - ts < 1  # we got around 0.2 s
def test_timeout_five_five():
    # create an instance of the StreamBuffer class
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=3, left="r",
                                 buffer_results=True, verbose=True)

    # Test Settings:
    # Create Queues to store the input streams
    events_r = list()
    events_s = list()

    # Fill the input streams with randomized Records
    N = 20
    random.seed(0)
    event_order = (["r"] * 5 + ["s"] * 5) * int(N / 10)
    start_time = 1600000000
    for i in range(len(event_order)):
        if event_order[i] == "r":
            events_r.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))
        elif event_order[i] == "s":
            events_s.append(Record(timestamp=i + start_time, quantity=event_order[i], result=random.random()))

    ingestion_order = (["r"] * 5 + ["s"] * 5) * N

    n_r = n_s = 0
    ts = time.time()
    for i in range(N):
        # decide based on the ingestion order which stream's record is forwarded
        if ingestion_order[i] == "r":
            # receive the next record from stream r
            stream_buffer.ingest_left(events_r[n_r])  # instant emit
            n_r += 1
        elif ingestion_order[i] == "s":
            # receive the next record from stream s
            stream_buffer.ingest_right(events_s[n_s])
            n_s += 1

    events_t = stream_buffer.fetch_results()

    print(f"Join time-series with |r| = {n_r}, |s| = {n_s}.")
    print(f"joined {len(events_t)} tuples in {time.time() - ts} s.")
    assert len(events_t) == 13
def test_unordered():
    # create an instance of the StreamBuffer class
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=sys.maxsize, left="r",
                                 buffer_results=True, verbose=True)

    # Fill the input streams with randomized Records
    random.seed(0)
    start_time = 1600000000

    # Test Settings:
    # Create Queues to store the input records
    events_r = list()
    for i in range(10):
        events_r.append(Record(timestamp=i + start_time, quantity="r", result=random.random()))

    ts = time.time()
    # first ingest all Records into r, then all into s
    for event in events_r:
        stream_buffer.ingest_left(event)  # instant emit

    print("Ingest Records into s.")
    stream_buffer.ingest_right(Record(timestamp=start_time - 0.5, quantity="s", result=random.random()))
    stream_buffer.ingest_right(Record(timestamp=start_time + 0.5, quantity="s", result=random.random()))
    stream_buffer.ingest_right(Record(timestamp=start_time + 5.5, quantity="s", result=random.random()))
    stream_buffer.ingest_right(Record(timestamp=start_time + 9.5, quantity="s", result=random.random()))

    events_t = stream_buffer.fetch_results()

    print(f"Join time-series with |r| = {len(events_r)}, |s| = {4}.")
    print(f"joined {len(events_t)} tuples in {time.time() - ts} s.")
    if time.time() - ts > 1e-3:
        print(f"that are {int(len(events_t) / (time.time() - ts))} joins per second.")

    assert len(events_t) == 20
    d = {'r.quantity': 'r', 'r.phenomenonTime': 1600000006, 'r.result': 0.7837985890347726,
         's.quantity': 's', 's.phenomenonTime': 1600000005.5, 's.result': 0.28183784439970383}
    assert d in events_t
def join_fct(record_left, record_right):
    """
    Blueprint for the join function, takes two records and merges them using the defined routine.
    :param record_left: Record
        Record that is joined as left join partner
    :param record_right: Record
        Record that is joined as right join partner
    :return: Record
        the resulting record from the join of both partners
    """
    record = Record(quantity="t",
                    result=record_left.get_result() * record_right.get_result(),
                    timestamp=(record_left.get_time() + record_right.get_time()) / 2)
    # here, the resulting record can be produced to e.g. Apache Kafka or a pipeline
    return record
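
# Not part of the original test suite: a minimal, hedged sketch of how a custom join function
# such as join_fct can be handed to a StreamBuffer via its join_function argument (as done in
# test_commit_transaction below). Only names already used in this module are assumed
# (StreamBuffer, Record, ingest_left, ingest_right, fetch_results); the concrete values are illustrative.
def example_join_fct_usage():
    stream_buffer = StreamBuffer(instant_emit=True, delta_time=sys.maxsize, left="r",
                                 buffer_results=True, join_function=join_fct)
    stream_buffer.ingest_left(Record(timestamp=1600000000.0, quantity="r", result=2.0))
    stream_buffer.ingest_right(Record(timestamp=1600000000.5, quantity="s", result=3.0))
    # each joined Record is built by join_fct: quantity "t", the product of both results,
    # and the average of both timestamps
    for joined in stream_buffer.fetch_results():
        print(joined)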
def test_commit_transaction(round_nr=1):
    print(f"\n################################ commit, transaction {round_nr} ######################################\n")
    # start the transaction if it is the first round
    if round_nr == 1:
        # Initialize producer transaction.
        kafka_producer.init_transactions()
        # Start producer transaction for round 1 only
        kafka_producer.begin_transaction()

    # commit_fct is empty and join_fct is with transactions
    lsb = StreamBuffer(instant_emit=True, left="actSpeed_C11", right="vaTorque_C11",
                       buffer_results=True, delta_time=1, verbose=VERBOSE,
                       join_function=join_fct)

    start_time = stop_time = last_transaction_time = time.time()
    n_none_polls = 0
    started = False
    while True:
        # msg = kafka_consumer.poll(0.1)
        msgs = kafka_consumer.consume(num_messages=MAX_BATCH_SIZE, timeout=0.1)  # is faster, returns a list

        # if there is no msg within a second, continue
        if n_none_polls >= 30:  # time.time() - init_time > MAX_TIMEOUT:, it does need around 2 seconds
            print(" Break as there won't come any further messages.")
            break
        elif len(msgs) == 0:
            n_none_polls += 1
            continue
        else:
            # update to latest running-time
            stop_time = time.time()
            if not started:  # set starter flag if the first message was consumed
                started = True
                print("Start the count clock")
                # update to latest not-started-time
                start_time = stop_time

        # iterate over each message that was consumed
        for msg in msgs:
            record_json = json.loads(msg.value().decode('utf-8'))
            if VERBOSE:
                if record_json.get("quantity").endswith("_C11"):
                    print(f"Received new record: {record_json}")

            # create a Record from the json
            record = Record(
                thing=record_json.get("thing"),
                quantity=record_json.get("quantity"),
                timestamp=record_json.get("phenomenonTime"),
                result=record_json.get("result"),
                topic=msg.topic(), partition=msg.partition(), offset=msg.offset())

            # ingest the record into the StreamBuffer instance, instant emit
            if msg.topic() == KAFKA_TOPIC_IN_0:  # "actSpeed_C11"
                lsb.ingest_left(record)  # with instant emit
            elif msg.topic() == KAFKA_TOPIC_IN_1:  # "vaTorque_C11"
                lsb.ingest_right(record)

        # commit the transaction every TRANSACTION_TIME
        if stop_time >= last_transaction_time + TRANSACTION_TIME:
            last_transaction_time = stop_time
            commit_transaction(stream_buffer=lsb, verbose=VERBOSE, commit_time=last_transaction_time)

        # break if there were MAX_JOIN_CNT or more joins
        if MAX_JOIN_CNT is not None and lsb.get_join_counter() >= MAX_JOIN_CNT:
            print("Reached the maximal join count, graceful stopping.")
            break

        # sleep to allow other processes to run
        time.sleep(0)

    try:
        # commit processed message offsets to the transaction
        kafka_producer.send_offsets_to_transaction(
            kafka_consumer.position(kafka_consumer.assignment()),
            kafka_consumer.consumer_group_metadata())
        # commit transaction
        kafka_producer.commit_transaction()
    except confluent_kafka.KafkaException as e:
        if confluent_kafka.KafkaError.str(e.args[0]) == "Operation not valid in state Ready":
            print("_STATE exception, should occur here.")
        else:
            print("Couldn't commit transaction.")
            raise e

    events_out = lsb.fetch_results()
    print(f"\nLengths: |{RES_QUANTITY}| = {lsb.get_join_counter()}, "
          f"|{QUANTITIES[0]}| = {lsb.get_left_counter()}, |{QUANTITIES[1]}| = {lsb.get_right_counter()}.")
    if start_time != stop_time:
        print(f"Joined time-series {stop_time - start_time:.6f} s long, "
              f"that are {lsb.get_join_counter() / (stop_time - start_time):.2f} joins per second.")

    if round_nr == 1:
        print(f" first record: \t{events_out[0]}")
        print(f" last record:  \t{events_out[-1]}")
        assert len(events_out) == 1595
        # assert cnt_left == 2681  # these values can differ
        # assert cnt_right == 4705
        print(f"Result #0: {events_out[0]}")
        assert events_out[0].get_quantity() == "vaPower_C11"
        assert round(events_out[0].get_time() - 1554096460.415, 3) == 0
        assert round(events_out[0].get_result() - 86.71966370389097, 5) == 0
        assert round(events_out[-1].get_time() - 1554355545.929, 3) == 0
        assert round(events_out[-1].get_result() - 0.0, 5) == 0
    elif round_nr == 2:
        assert len(events_out) == 0
            continue
        if msg.error():
            print("Consumer error: {}".format(msg.error()))
            continue

        try:
            record_json = json.loads(msg.value().decode('utf-8'))
            if VERBOSE:
                print(f"Received new record: {record_json}")
            if st0 is None:
                print("Start count clock")
                st0 = time.time()

            # create a Record from the json
            record = Record(
                thing=record_json.get("thing"),
                quantity=record_json.get("quantity"),
                timestamp=record_json.get("phenomenonTime"),
                result=record_json.get("result"))

            # ingest the record into the StreamBuffer instance, instant emit
            if "Torque" in record_json.get("quantity"):
                stream_buffer.ingest_r(record)  # instant emit
                cnt_r += 1
            elif "Load" in record_json.get("quantity"):
                stream_buffer.ingest_s(record)
                cnt_s += 1
        except json.decoder.JSONDecodeError as e:
            print("skipping record as there is a json.decoder.JSONDecodeError.")
            pass
except KeyboardInterrupt:
    kafka_consumer.close()