def add_hello_msgs_direct_publishers(self, queue_name, count, dup_rate):
    """Create and register one direct-to-queue "hello" publisher per slot.

    Builds ``self.publisher_count`` RabbitPublisher objects with ids
    1..publisher_count, configures each one through
    ``configure_hello_msgs_direct`` and appends it to ``self.publishers``.
    Publishing is not started here.

    Args:
        queue_name: Forwarded unchanged to ``configure_hello_msgs_direct``.
        count: Forwarded unchanged to ``configure_hello_msgs_direct``.
        dup_rate: Forwarded unchanged to ``configure_hello_msgs_direct``.
    """
    publisher_ids = range(1, self.publisher_count + 1)
    for publisher_id in publisher_ids:
        # NOTE(review): the literal 120 is the fifth constructor argument;
        # its meaning is not visible here (presumably a timeout) -- confirm.
        new_publisher = RabbitPublisher(
            publisher_id,
            self.test_number,
            self.broker_manager,
            self.in_flight_max,
            120,
            self.print_mod,
        )
        new_publisher.configure_hello_msgs_direct(queue_name, count, dup_rate)
        self.publishers.append(new_publisher)
def add_hello_msgs_to_exchanges_publishers(self, exchanges, routing_key, count, dup_rate):
    """Create and register one exchange-targeted "hello" publisher per slot.

    Builds ``self.publisher_count`` RabbitPublisher objects with ids
    1..publisher_count, configures each one through
    ``configure_hello_msgs_to_exchanges`` and appends it to
    ``self.publishers``. Publishing is not started here.

    Args:
        exchanges: Forwarded unchanged to ``configure_hello_msgs_to_exchanges``.
        routing_key: Forwarded unchanged.
        count: Forwarded unchanged.
        dup_rate: Forwarded unchanged.
    """
    publisher_ids = range(1, self.publisher_count + 1)
    for publisher_id in publisher_ids:
        # NOTE(review): the literal 120 is the fifth constructor argument;
        # its meaning is not visible here (presumably a timeout) -- confirm.
        new_publisher = RabbitPublisher(
            publisher_id,
            self.test_number,
            self.broker_manager,
            self.in_flight_max,
            120,
            self.print_mod,
        )
        new_publisher.configure_hello_msgs_to_exchanges(
            exchanges, routing_key, count, dup_rate)
        self.publishers.append(new_publisher)
def add_partitioned_sequence_to_exchanges_publishers(
        self, exchanges, count, dup_rate, sequence_count):
    """Create and register one partitioned-sequence publisher per slot.

    Builds ``self.publisher_count`` RabbitPublisher objects with ids
    1..publisher_count, configures each one through
    ``configure_partitioned_sequence_to_exchanges`` and appends it to
    ``self.publishers``. Publishing is not started here.

    Args:
        exchanges: Forwarded unchanged to
            ``configure_partitioned_sequence_to_exchanges``.
        count: Forwarded unchanged.
        dup_rate: Forwarded unchanged.
        sequence_count: Forwarded unchanged.
    """
    publisher_ids = range(1, self.publisher_count + 1)
    for publisher_id in publisher_ids:
        # NOTE(review): the literal 120 is the fifth constructor argument;
        # its meaning is not visible here (presumably a timeout) -- confirm.
        new_publisher = RabbitPublisher(
            publisher_id,
            self.test_number,
            self.broker_manager,
            self.in_flight_max,
            120,
            self.print_mod,
        )
        new_publisher.configure_partitioned_sequence_to_exchanges(
            exchanges, count, dup_rate, sequence_count)
        self.publishers.append(new_publisher)
def main():
    """Publish messages to a queue, a list of exchanges, or one exchange.

    Command-line arguments (read through get_args / get_*_arg):
        --msgs         (mandatory) message count, converted to int.
        --keys         (mandatory) converted to int and passed as the
                       state count to the publish call.
        --node         node to connect to, default "rabbitmq1".
        --ex           exchange name for single-exchange mode, default "".
        --dup-rate     converted to float, default 0.
        --rk           routing key, default "hello".
        --queue        when given, publish directly to this queue.
        --partitioned  "true" switches the message type to
                       "partitioned-sequence"; incompatible with --queue.
        --exchanges    comma-separated exchange list for multi-exchange mode.

    The target is chosen in this order: --queue, then --exchanges, then the
    single exchange from --ex. Exits with status 1 when --partitioned true
    is combined with --queue.
    """
    args = get_args(sys.argv)

    connect_node = get_optional_arg(args, "--node", "rabbitmq1")
    exchange = get_optional_arg(args, "--ex", "")
    count = int(get_mandatory_arg(args, "--msgs"))
    state_count = int(get_mandatory_arg(args, "--keys"))
    dup_rate = float(get_optional_arg(args, "--dup-rate", "0"))
    routing_key = get_optional_arg(args, "--rk", "hello")
    queue = get_optional_arg(args, "--queue", None)
    partitioned = get_optional_arg(args, "--partitioned", "false")
    exchanges_arg = get_optional_arg(args, "--exchanges", "")

    message_type = "sequence"
    if partitioned == "true":
        if queue is not None:
            print("Cannot set partitioning mode and set a queue. Must publish to an exchange")
            # sys.exit instead of the builtin exit(): the latter is injected
            # by the site module and is not guaranteed to exist.
            sys.exit(1)
        message_type = "partitioned-sequence"

    live_nodes = get_live_nodes()

    # NOTE(review): the meaning of the literals 1000, 100, 100 is defined by
    # RabbitPublisher, which is not visible here.
    publisher = RabbitPublisher("1", live_nodes, connect_node, 1000, 100, 100)

    if queue is not None:
        print("direct to queue publishing")
        publisher.publish_direct(queue, count, state_count, dup_rate, message_type)
    elif exchanges_arg:
        print("multi-exchange publishing")
        exchanges = exchanges_arg.split(",")
        publisher.publish_to_exchanges(
            exchanges, routing_key, count, state_count, dup_rate, message_type)
    else:
        print("single exchange publishing")
        publisher.publish(
            exchange, routing_key, count, state_count, dup_rate, message_type)
def main():
    """Publish messages, then print the length of the target queue.

    Command-line arguments (read through get_args / get_*_arg):
        --msgs          (mandatory) message count, converted to int.
        --node          node to connect to, default "rabbitmq1".
        --cluster-size  converted to int, default 3.
        --ex            exchange name, default "".
        --dup-rate      converted to float, default 0.
        --rk            routing key, default "hello".
        --queue         when given, publish directly to this queue.
        --msg-type      message type, default "hello".

    After publishing, the queue length reported by QueueStats for the
    connect node is printed.
    """
    args = get_args(sys.argv)

    connect_node = get_optional_arg(args, "--node", "rabbitmq1")
    node_count = int(get_optional_arg(args, "--cluster-size", "3"))
    exchange = get_optional_arg(args, "--ex", "")
    count = int(get_mandatory_arg(args, "--msgs"))
    dup_rate = float(get_optional_arg(args, "--dup-rate", "0"))
    routing_key = get_optional_arg(args, "--rk", "hello")
    queue = get_optional_arg(args, "--queue", None)
    message_type = get_optional_arg(args, "--msg-type", "hello")

    publisher = RabbitPublisher(node_count, connect_node)
    stats = QueueStats('jack', 'jack', queue)

    try:
        if queue is None:
            publisher.publish(
                exchange, routing_key, count, 1, dup_rate, message_type)
        else:
            print("direct")
            publisher.publish_direct(queue, count, 1, dup_rate, message_type)

        queue_length = stats.get_queue_length(connect_node)
        print(f"Number of message in queue: {queue_length}")
    # NOTE(review): only NameError is caught even though the message says
    # "Unexpected error" -- confirm whether a broader type was intended.
    except NameError as err:
        print(f"Unexpected error: {str(err)}")
def main():
    """Publish messages and print final publisher stats if publishing aborts.

    Command-line arguments (read through get_args / get_*_arg):
        --msgs          (mandatory) message count, converted to int.
        --node          node to connect to, default "rabbitmq1".
        --cluster-size  converted to int, default 3.
        --ex            exchange name, default "".
        --dup-rate      converted to float, default 0.
        --rk            routing key, default "hello".
        --queue         when given, publish directly to this queue.
        --msg-type      message type, default "hello".

    Any error or Ctrl-C during publishing is reported, followed by the
    publisher's final count. SystemExit is allowed to propagate.
    """
    args = get_args(sys.argv)

    connect_node = get_optional_arg(args, "--node", "rabbitmq1")
    node_count = int(get_optional_arg(args, "--cluster-size", "3"))
    exchange = get_optional_arg(args, "--ex", "")
    count = int(get_mandatory_arg(args, "--msgs"))
    dup_rate = float(get_optional_arg(args, "--dup-rate", "0"))
    routing_key = get_optional_arg(args, "--rk", "hello")
    queue = get_optional_arg(args, "--queue", None)
    message_type = get_optional_arg(args, "--msg-type", "hello")

    publisher = RabbitPublisher(node_count, connect_node)

    try:
        if queue is not None:
            print("direct")
            publisher.publish_direct(queue, count, 1, dup_rate, message_type)
        else:
            publisher.publish(
                exchange, routing_key, count, 1, dup_rate, message_type)
    except (KeyboardInterrupt, Exception) as err:
        # A bare "except:" previously hid what went wrong and also swallowed
        # SystemExit. Interrupts stay quiet; real errors are now named.
        if not isinstance(err, KeyboardInterrupt):
            print(f"Publishing error: {err!r}")
        print("Publishing aborted, final stats:")
        # NOTE(review): if print_final_count() prints by itself and returns
        # None, this outer print emits an extra "None" line -- confirm.
        print(publisher.print_final_count())
def main():
    """Run repeated chaos tests with one publisher and one consumer.

    Command-line arguments (read through get_args / get_*_arg):
        --tests               (mandatory) number of test runs.
        --actions             (mandatory) chaos actions per run.
        --grace-period-sec    (mandatory) max seconds to wait for the
                              consumer to catch up after publishing stops.
        --queue               (mandatory) queue name prefix; the test number
                              is appended.
        --queue-type          (mandatory) forwarded to the queue creation call.
        --sac                 (mandatory) "false" (any case) disables SAC.
        --in-flight-max       default 10.
        --cluster             cluster size, default "3".
        --chaos-mode          "partitions", "nodes" or anything else for the
                              executor's default; default "mixed".
        --chaos-min-interval  default 30 (seconds between actions, minimum).
        --chaos-max-interval  default 120 (seconds between actions, maximum).

    Each run: redeploy the cluster through the setup script, create the
    queue, start the monitor, consumer and publisher, execute the chaos
    actions, repair, stop the publisher, wait for the consumer to catch up,
    then report lost and out-of-order messages and clean up.
    """
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac = get_mandatory_arg(args, "--sac")
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    message_type = "sequence"
    queue_type = get_mandatory_arg(args, "--queue-type")

    # Anything other than "false" (case-insensitive) leaves SAC enabled.
    sac_enabled = sac.upper() != "FALSE"

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        subprocess.call(
            ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        pub_node = broker_manager.get_random_init_node()
        con_node = broker_manager.get_random_init_node()
        console_out(f"publish to: {pub_node}", "TEST RUNNER")
        console_out(f"consume from: {con_node}", "TEST RUNNER")

        print_mod = in_flight_max * 5
        queue_name = queue + "_" + str(test_number)

        mgmt_node = broker_manager.get_random_init_node()
        # Retry every 5 seconds until the broker reports the queue created.
        # The explicit "== False" comparison is kept so that a non-bool
        # return value is treated exactly as before.
        queue_created = False
        while queue_created == False:
            if sac_enabled:
                queue_created = broker_manager.create_sac_queue(
                    mgmt_node, queue_name, cluster_size, queue_type)
            else:
                queue_created = broker_manager.create_queue(
                    mgmt_node, queue_name, cluster_size, queue_type)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        publisher = RabbitPublisher(f"PUBLISHER(Test:{test_number} Id:P1)",
                                    initial_nodes, pub_node, in_flight_max,
                                    120, print_mod)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER")
        consumer_manager.add_consumers(1, test_number, queue_name)

        # NOTE(review): stats is never read in this function; construction is
        # kept in case QueueStats has side effects -- confirm and remove.
        stats = QueueStats('jack', 'jack', queue_name)

        chaos = ChaosExecutor(initial_nodes)
        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(
            target=publisher.publish_direct,
            args=(queue_name, count, 1, 0, message_type))
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(0, actions):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action",
                        "TEST RUNNER")
            time.sleep(wait_sec)

            console_out(
                f"execute chaos action {str(action_num)} of test {str(test_number)}",
                "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop(True)

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")
        # Poll once per second until every confirmed message has been
        # received, or the grace period runs out.
        for _ in range(grace_period_sec):
            if (msg_monitor.get_unique_count() >= publisher.get_pos_ack_count()
                    and not publisher.get_msg_set().difference(
                        msg_monitor.get_msg_set())):
                break
            time.sleep(1)

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")

        if len(lost_msgs) > 0:
            console_out(f"Lost messages count: {len(lost_msgs)}",
                        "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        success = True
        if msg_monitor.get_out_of_order() == True:
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if len(lost_msgs) > 0:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success == True:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            # The previous code joined an undefined "con_thread" here; the
            # resulting NameError was swallowed below and the publisher
            # thread was never joined.
            pub_thread.join()
            # Stop the monitor as well so that its thread does not outlive
            # the test run (same sequence as the other test runners).
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main():
    """Run repeated SAC-queue tests with random chaos and consumer actions.

    Command-line arguments (read through get_args / get_*_arg):
        --tests             (mandatory) number of test runs.
        --run-minutes       (mandatory) minutes of chaos per run.
        --consumers         (mandatory) number of consumers to add.
        --grace-period-sec  (mandatory) max seconds to wait for consumers to
                            catch up after publishing stops.
        --queue             (mandatory) queue name prefix; the test number is
                            appended.
        --queue-type        (mandatory) forwarded to create_sac_queue.
        --in-flight-max     default 10.
        --cluster           cluster size, default "3".

    Each run: redeploy the cluster through the setup script, create the SAC
    queue, start monitor, consumers and publisher, let the chaos executor and
    random consumer actions run for the requested minutes, stop them, resume
    the consumers, stop the publisher, wait for the consumers to catch up,
    then report unconsumed and out-of-order messages and clean up.
    """
    args = get_args(sys.argv)

    node_count = 3
    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    message_type = "sequence"

    runner = "TEST RUNNER"

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    runner)
        setup_cmd = ["bash", "../automated/setup-test-run.sh", cluster_size,
                     "3.8"]
        subprocess.call(setup_cmd)
        console_out("Waiting for cluster...", runner)
        time.sleep(30)
        console_out("Cluster status:", runner)
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", runner)

        print_mod = 5000
        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        # Keep asking the broker to create the queue, pausing 5 seconds after
        # each refusal.
        while True:
            queue_created = broker_manager.create_sac_queue(
                mgmt_node, queue_name, cluster_size, queue_type)
            if queue_created == False:
                time.sleep(5)
            else:
                break

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(initial_nodes)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           runner)

        pub_node = broker_manager.get_random_init_node()
        publisher = RabbitPublisher(str(test_number), initial_nodes, pub_node,
                                    in_flight_max, 120, print_mod)
        consumer_manager.add_consumers(consumer_count, test_number,
                                       queue_name)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        publish_args = (queue_name, count, 1, 0, "sequence")
        pub_thread = threading.Thread(target=publisher.publish_direct,
                                      args=publish_args)
        pub_thread.start()
        console_out("publisher started", runner)

        init_wait_sec = 20
        console_out(
            f"Will start chaos and consumer actions in {init_wait_sec} seconds",
            runner)
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(
            target=chaos.start_random_single_action_and_repair, args=(90, ))
        chaos_thread.start()
        console_out("Chaos executor started", runner)

        consumer_action_thread = threading.Thread(
            target=consumer_manager.start_random_consumer_actions,
            args=(5, 30))
        consumer_action_thread.start()
        console_out("Consumer actions started", runner)

        # One progress line per elapsed minute.
        for ctr in range(1, run_minutes + 1):
            time.sleep(60)
            console_out(
                f"Test at {ctr} minute mark, {run_minutes-ctr} minutes left",
                runner)

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()
            chaos_thread.join()
            consumer_action_thread.join()
        except Exception as err:
            console_out("Failed to stop chaos cleanly: " + str(err), runner)

        console_out("Resuming consumers", runner)
        consumer_manager.resume_all_consumers()

        publisher.stop(True)

        console_out("starting grace period for consumer to catch up", runner)
        # Poll once per second until every confirmed message was received or
        # the grace period is used up.
        for _ in range(grace_period_sec):
            caught_up = (
                msg_monitor.get_unique_count() >= publisher.get_pos_ack_count()
                and len(publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set())) == 0)
            if caught_up:
                break
            time.sleep(1)

        confirmed_set = publisher.get_msg_set()
        not_consumed_msgs = confirmed_set.difference(
            msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------", runner)
        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            runner)

        success = True
        if len(not_consumed_msgs) > 0:
            console_out(
                f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                runner)
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", runner)

        if success:
            console_out("TEST OK", runner)

        console_out("RESULTS END------------------------------------", runner)

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as err:
            console_out("Failed to clean up test correctly: " + str(err),
                        runner)

        console_out(f"TEST {str(test_number)} COMPLETE", runner)
def main():
    """Run repeated chaos tests against a quorum queue.

    Command-line arguments (read through get_args / get_*_arg):
        --tests               (mandatory) number of test runs.
        --actions             (mandatory) chaos actions per run.
        --grace-period-sec    (mandatory) seconds to wait for the consumer to
                              catch up before giving up (see below).
        --queue               (mandatory) queue name prefix; the test number
                              is appended.
        --sac                 (mandatory) parsed with is_true.
        --in-flight-max       default 10.
        --cluster             cluster size, default "3".
        --chaos-mode          "partitions", "nodes" or anything else for the
                              executor's default; default "mixed".
        --chaos-min-interval  default 30.
        --chaos-max-interval  default 120.
        --pre-fetch           consumer prefetch, default 10.
        --rmq-version         one of 3.7, 3.8-beta, 3.8-alpha; default
                              3.8-beta.

    Each run: deploy the cluster and create the queue (redeploying after 20
    failed creation attempts), start monitor, consumer and publisher, execute
    the chaos actions, repair, stop publishing, wait for the consumer to
    catch up, then report lost and out-of-order messages and clean up.
    """
    print("quorum-queue-test.py")
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    rmq_version = get_optional_arg_validated(
        args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"])

    for test_number in range(1, tests + 1):
        print("")
        console_out(
            f"TEST RUN: {str(test_number)} of {tests}--------------------------",
            "TEST RUNNER")

        setup_complete = False
        while not setup_complete:
            broker_manager = BrokerManager()
            broker_manager.deploy(cluster_size, True, rmq_version, False)
            initial_nodes = broker_manager.get_initial_nodes()
            console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

            print_mod = in_flight_max * 5
            queue_name = queue + "_" + str(test_number)

            mgmt_node = broker_manager.get_random_init_node()
            queue_created = False
            qc_ctr = 0
            # Up to 20 creation attempts per deployment; when all of them
            # fail the outer loop redeploys the cluster and tries again.
            while queue_created == False and qc_ctr < 20:
                qc_ctr += 1
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, 0)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, 0)

                if queue_created:
                    setup_complete = True
                else:
                    time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor("qqt", test_number, print_mod, True,
                                     False)
        publisher = RabbitPublisher(1, test_number, broker_manager,
                                    in_flight_max, 120, print_mod)
        publisher.configure_sequence_direct(queue_name, count, 0, 1)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", False)
        consumer_manager.add_consumers(1, test_number, queue_name, prefetch)

        chaos = ChaosExecutor(initial_nodes)
        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.start_publishing)
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(1, actions + 1):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action",
                        "TEST RUNNER")
            time.sleep(wait_sec)

            console_out(
                f"execute chaos action {str(action_num)}/{actions} of test {str(test_number)}",
                "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop_publishing()

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")
        ctr = 0
        while True:
            since_last_msg = (datetime.datetime.now()
                              - msg_monitor.get_last_msg_time())
            if (msg_monitor.get_unique_count() >= publisher.get_pos_ack_count()
                    and len(publisher.get_msg_set().difference(
                        msg_monitor.get_msg_set())) == 0):
                break
            # Give up only once the grace period has elapsed AND no message
            # has arrived for more than 15 seconds.
            elif ctr > grace_period_sec and since_last_msg.total_seconds() > 15:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")

        if len(lost_msgs) > 0:
            console_out(f"Lost messages count: {len(lost_msgs)}",
                        "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        success = True
        if msg_monitor.get_out_of_order() == True:
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if len(lost_msgs) > 0:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success == True:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            # Previously the monitor thread was started for every test but
            # never stopped, leaving one extra live thread per run.
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main():
    """Run repeated randomized tests with optional chaos and consumer actions.

    Command-line arguments (read through get_args / get_*_arg):
        --test-name              (mandatory) used for the log directory, the
                                 message monitor and the zipped logs.
        --tests                  (mandatory) number of test runs.
        --run-minutes            (mandatory) minutes per run.
        --consumers              (mandatory) number of consumers (0 allowed).
        --grace-period-sec       (mandatory) max seconds to wait for consumers
                                 to catch up.
        --queue                  (mandatory) queue name prefix; the test
                                 number is appended.
        --queue-type             (mandatory) "mirrored" or "quorum".
        --sac                    (mandatory) parsed with is_true.
        --pre-fetch              default 10.
        --analyze                default true.
        --qq-max-length          quorum queues only, default 0.
        --log-msgs               default false.
        --publishers             0 or 1, default 1.
        --in-flight-max          default 10 (only read with a publisher).
        --print-mod              default in-flight-max * 5 with a publisher,
                                 otherwise 1000.
        --sequences              default 1 (only read with a publisher).
        --new-cluster            default true.
        --cluster                cluster size, default "3".
        --rmq-version            3.7, 3.8-beta or 3.8-alpha.
        --stop-mode              crash (uses toxiproxy), close (hard close)
                                 or cancel.
        --chaos-actions          default true; enables --chaos-mode,
                                 --chaos-min-interval, --chaos-max-interval.
        --consumer-actions       default true; enables --consumer-min-interval
                                 and --consumer-max-interval.

    Exits with status 1 on an unsupported --queue-type or --publishers value.
    A summary of failed runs is printed after the last test.
    """
    print("random-test.py")
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    test_name = get_mandatory_arg(args, "--test-name")
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    if queue_type not in ("mirrored", "quorum"):
        # Any other value used to spin forever in the queue creation loop,
        # because neither creation branch ever ran.
        console_out(
            f"Unsupported --queue-type: {queue_type}. Use mirrored or quorum",
            "TEST RUNNER")
        sys.exit(1)
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))

    if queue_type == "quorum":
        qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))

    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    log_messages = is_true(get_optional_arg(args, "--log-msgs", "false"))

    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    if publisher_count > 1:
        # Only a single publisher is ever created below. Larger values used
        # to run the whole test without a publisher and then fail with a
        # NameError when stopping it, so reject them up front.
        console_out(
            f"Unsupported --publishers: {publisher_count}. Use 0 or 1",
            "TEST RUNNER")
        sys.exit(1)
    has_publisher = publisher_count > 0

    if has_publisher:
        in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
        print_mod = int(
            get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}"))
        sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    else:
        print_mod = int(get_optional_arg(args, "--print-mod", "1000"))

    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    rmq_version = get_optional_arg_validated(
        args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"])
    stop_mode = get_optional_arg_validated(
        args, "--stop-mode", "crash", ["crash", "close", "cancel"])

    use_toxiproxy = False
    consumer_hard_close = False
    if stop_mode == "crash":
        use_toxiproxy = True
    elif stop_mode == "close":
        consumer_hard_close = True

    include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true"))
    if include_chaos:
        chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
        chaos_min_interval = int(
            get_optional_arg(args, "--chaos-min-interval", "60"))
        chaos_max_interval = int(
            get_optional_arg(args, "--chaos-max-interval", "120"))

    include_con_actions = is_true(
        get_optional_arg(args, "--consumer-actions", "true"))
    if include_con_actions:
        con_action_min_interval = int(
            get_optional_arg(args, "--consumer-min-interval", "20"))
        con_action_max_interval = int(
            get_optional_arg(args, "--consumer-max-interval", "60"))

    # Consumer actions need a consumer manager, which only exists when at
    # least one consumer was requested. Without this guard "--consumers 0"
    # with the default "--consumer-actions true" hit a NameError.
    run_con_actions = include_con_actions and consumer_count > 0

    failed_test_log = list()
    failed_tests = set()

    for test_number in range(tests):
        print("")
        subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"])
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")

        broker_manager = BrokerManager()
        broker_manager.deploy(cluster_size, new_cluster, rmq_version,
                              use_toxiproxy)
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        # Retry every 5 seconds until the broker reports the queue created.
        queue_created = False
        while queue_created == False:
            if queue_type == "mirrored":
                if sac_enabled:
                    queue_created = broker_manager.create_standard_sac_queue(
                        mgmt_node, queue_name, cluster_size)
                else:
                    queue_created = broker_manager.create_standard_queue(
                        mgmt_node, queue_name, cluster_size)
            elif queue_type == "quorum":
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(test_name, test_number, print_mod,
                                     analyze, log_messages)
        chaos = ChaosExecutor(initial_nodes)

        if include_chaos:
            if chaos_mode == "partitions":
                chaos.only_partitions()
            elif chaos_mode == "nodes":
                chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        if consumer_count > 0:
            consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                               "TEST RUNNER", use_toxiproxy)
            consumer_manager.add_consumers(consumer_count, test_number,
                                           queue_name, prefetch)
            consumer_manager.start_consumers()

        if has_publisher:
            publisher = RabbitPublisher(1, test_number, broker_manager,
                                        in_flight_max, 120, print_mod)
            publisher.configure_sequence_direct(queue_name, count, 0,
                                                sequence_count)
            pub_thread = threading.Thread(target=publisher.start_publishing)
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if run_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if run_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval,
                      consumer_hard_close))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        ctr = 0
        run_seconds = run_minutes * 60
        # stop_please is a module-level flag defined outside this function.
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1

                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    "TEST RUNNER")
                break

        try:
            chaos.stop_random_single_action_and_repair()

            if consumer_count > 0:
                consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join(30)

            if run_con_actions:
                consumer_action_thread.join(30)
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        if has_publisher:
            publisher.stop_publishing()

        if consumer_count > 0:
            console_out("Resuming consumers", "TEST RUNNER")
            consumer_manager.resume_all_consumers()

            console_out("Starting grace period for consumer to catch up",
                        "TEST RUNNER")
            ctr = 0

            try:
                while ctr < grace_period_sec:
                    if (has_publisher
                            and msg_monitor.get_unique_count()
                            >= publisher.get_pos_ack_count()
                            and len(publisher.get_msg_set().difference(
                                msg_monitor.get_msg_set())) == 0):
                        break
                    time.sleep(1)
                    ctr += 1
            except KeyboardInterrupt:
                console_out("Grace period ended", "TEST RUNNER")

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        if has_publisher:
            confirmed_set = publisher.get_msg_set()
            not_consumed_msgs = confirmed_set.difference(
                msg_monitor.get_msg_set())
            console_out(
                f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")
        else:
            not_consumed_msgs = set()
            console_out(
                f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")

        success = True
        if consumer_count > 0:
            if len(not_consumed_msgs) > 0:
                if sac_enabled:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                else:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages."
                )
                failed_tests.add(test_number)

                # Print at most ~500 of the missing messages, in sorted order.
                lost_ctr = 0
                sorted_msgs = list(not_consumed_msgs)
                sorted_msgs.sort()
                for msg in sorted_msgs:
                    console_out(f"Lost? {msg}", "TEST RUNNER")
                    lost_ctr += 1
                    if lost_ctr > 500:
                        console_out("More than 500, truncated list",
                                    "TEST RUNNER")
                        break

                success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages",
                        "TEST RUNNER")
            failed_test_log.append(
                f"Test {test_number} FAILURE: Received out-of-order messages"
            )
            failed_tests.add(test_number)

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        try:
            if consumer_count > 0:
                consumer_manager.stop_all_consumers()

            if has_publisher:
                pub_thread.join(30)
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out_exception("Failed to clean up test correctly.", e,
                                  "TEST RUNNER")

        broker_manager.zip_log_files(test_name, test_number)
        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")

    console_out("", "TEST RUNNER")
    console_out("SUMMARY", "TEST RUNNER")
    console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}",
                "TEST RUNNER")
    for line in failed_test_log:
        console_out(line, "TEST RUNNER")

    console_out("TEST RUN COMPLETE", "TEST RUNNER")
def main():
    """Run repeated randomized tests with optional chaos and consumer actions.

    Command-line arguments (read through get_args / get_*_arg):
        --tests                  (mandatory) number of test runs.
        --run-minutes            (mandatory) minutes per run.
        --consumers              (mandatory) number of consumers.
        --grace-period-sec       (mandatory) max seconds to wait for consumers
                                 to catch up.
        --queue                  (mandatory) queue name prefix; the test
                                 number is appended.
        --queue-type             (mandatory) forwarded to queue creation.
        --sac                    (mandatory) "false" (any case) disables SAC.
        --publishers             default 1; the publisher thread only runs
                                 when this is exactly 1.
        --print-mod              default 0, which means in-flight-max * 5.
        --new-cluster            "true" (any case) redeploys the cluster.
        --in-flight-max          default 10.
        --sequences              default 1.
        --cluster                cluster size, default "3".
        --chaos-actions          "false" (any case) disables chaos.
        --chaos-mode             "partitions", "nodes" or other; default mixed.
        --chaos-min-interval     default 60.
        --chaos-max-interval     default 120.
        --consumer-actions       "false" (any case) disables consumer actions.
        --consumer-min-interval  default 20.
        --consumer-max-interval  default 60.
    """
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    sac = get_mandatory_arg(args, "--sac")
    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))
    new_cluster = get_optional_arg(args, "--new-cluster", "true")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    chaos_flag = get_optional_arg(args, "--chaos-actions", "true")
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "60"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    consumer_actions = get_optional_arg(args, "--consumer-actions", "true")
    con_action_min_interval = int(
        get_optional_arg(args, "--consumer-min-interval", "20"))
    con_action_max_interval = int(
        get_optional_arg(args, "--consumer-max-interval", "60"))

    if print_mod == 0:
        print_mod = in_flight_max * 5

    # Each switch stays on unless its value is "false" in any letter case.
    include_chaos = chaos_flag.upper() != "FALSE"
    include_con_actions = consumer_actions.upper() != "FALSE"
    sac_enabled = sac.upper() != "FALSE"

    message_type = "sequence"
    runner = "TEST RUNNER"
    single_publisher = publisher_count == 1

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    runner)

        if new_cluster.upper() == "TRUE":
            subprocess.call(
                ["bash", "../automated/setup-test-run.sh", cluster_size,
                 "3.8"])
            console_out("Waiting for cluster...", runner)
            time.sleep(30)

        console_out("Cluster status:", runner)
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", runner)

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        # Keep asking the broker to create the queue, pausing 5 seconds after
        # each refusal.
        create_queue = (broker_manager.create_sac_queue if sac_enabled
                        else broker_manager.create_queue)
        while True:
            queue_created = create_queue(mgmt_node, queue_name, cluster_size,
                                         queue_type)
            if queue_created == False:
                time.sleep(5)
            else:
                break

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(initial_nodes)

        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           runner)

        pub_node = broker_manager.get_random_init_node()
        publisher = RabbitPublisher(f"PUBLISHER(Test:{test_number} Id:P1)",
                                    initial_nodes, pub_node, in_flight_max,
                                    120, print_mod)
        consumer_manager.add_consumers(consumer_count, test_number,
                                       queue_name)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        if single_publisher:
            publish_args = (queue_name, count, sequence_count, 0, "sequence")
            pub_thread = threading.Thread(target=publisher.publish_direct,
                                          args=publish_args)
            pub_thread.start()
            console_out("publisher started", runner)

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                runner)
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", runner)

        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval))
            consumer_action_thread.start()
            console_out("Consumer actions started", runner)

        ctr = 0
        run_seconds = run_minutes * 60
        # stop_please is a module-level flag defined outside this function.
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1
                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        runner)
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    runner)
                break

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join()

            if include_con_actions:
                consumer_action_thread.join()
        except Exception as err:
            console_out("Failed to stop chaos cleanly: " + str(err), runner)

        console_out("Resuming consumers", runner)
        consumer_manager.resume_all_consumers()

        if single_publisher:
            publisher.stop(True)

        console_out("starting grace period for consumer to catch up", runner)
        # Poll once per second until every confirmed message was received or
        # the grace period is used up.
        for _ in range(grace_period_sec):
            caught_up = (
                msg_monitor.get_unique_count() >= publisher.get_pos_ack_count()
                and len(publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set())) == 0)
            if caught_up:
                break
            time.sleep(1)

        confirmed_set = publisher.get_msg_set()
        not_consumed_msgs = confirmed_set.difference(
            msg_monitor.get_msg_set())

        console_out("RESULTS ----------------------------------------", runner)
        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            runner)

        success = True
        if len(not_consumed_msgs) > 0:
            console_out(
                f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                runner)
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", runner)

        if success:
            console_out("TEST OK", runner)

        console_out("RESULTS END ------------------------------------", runner)

        try:
            consumer_manager.stop_all_consumers()

            if single_publisher:
                pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as err:
            console_out("Failed to clean up test correctly: " + str(err),
                        runner)

        console_out(f"TEST {str(test_number)} COMPLETE", runner)
PORT_MQ = 5672 PORT_REDIS = 6379 USER_MQ = 'admin' PASSWD_MQ = '000000' PASSWD_REDIS ='' VHOST = 'test' att_dict={'func','from'} redis = RedisDb(host=HOST, port=PORT_REDIS, pwd=PASSWD_REDIS ) producer = RabbitPublisher(host=HOST, port=PORT_MQ, user=USER_MQ, pwd=PASSWD_MQ, vhost=VHOST) consumer = RabbitConsumer(host=HOST, port=PORT_MQ, user=USER_MQ, pwd=PASSWD_MQ, vhost=VHOST) def send_to_mq(msg): try: #producer.pu() producer.publish(msg=msg, routing_key='FSReplay')