def disconnect(self):
    """Cancel the consumer (unless hard-closing) and close the connection.

    Returns:
        True when the shutdown steps completed without raising; False when
        any exception was caught (every caught exception is logged).
    """
    try:
        # A hard close skips the consumer cancellation step altogether.
        if not self.hard_close:
            if self.channel is not None and self.channel.is_open:
                self.channel.stop_consuming()
                console_out(f"Cancelled consumer", self.get_actor())
                self.connection.sleep(2)

        if self.is_connection_open():
            self.connection.close()
            console_out(f"Closed connection", self.get_actor())
    except AttributeError:
        console_out(
            f"Closed connection (with internal pika attribute error)",
            self.get_actor())
    except TypeError:
        console_out(f"Closed connection (with internal pika type error)",
                    self.get_actor())
    except pika.exceptions.ConnectionWrongStateError:
        console_out(f"Cannot close connection, already closed",
                    self.get_actor())
    except pika.exceptions.StreamLostError:
        console_out(f"Closed connection (stream lost)", self.get_actor())
    except Exception as e:
        console_out_exception("Failed trying to disconnect.", e,
                              self.get_actor())
    else:
        # Only reached when nothing above raised.
        return True

    return False
def teardown_all(self, configurations, key_pair, run_tag, no_destroy):
    """Fetch logs, tear down every node unless told not to, then exit.

    Args:
        configurations: mapping of configuration tag to a list of unique
            configurations; each entry exposes ``cluster_size``,
            ``node_number`` and ``technology``.
        key_pair: forwarded to ``get_logs``.
        run_tag: forwarded to ``get_logs`` and ``teardown``.
        no_destroy: when truthy, the servers are left running.

    A failure while retrieving logs is logged and does not prevent the
    teardown. The function always ends by calling ``exit(1)``.
    """
    try:
        console_out(self.actor, f"Getting logs")
        first_node, last_node = self.get_start_end_nodes(configurations)
        self.get_logs(key_pair, run_tag, first_node, last_node)
        console_out(self.actor, f"Logs retrieved")
    except Exception as e:
        console_out_exception(self.actor, "Failed retrieving logs", e)

    if not no_destroy:
        console_out(self.actor, "Terminating all servers")
        for config_tag in configurations:
            console_out(self.actor,
                        f"TEARDOWN FOR configuration {config_tag}")
            for unique_conf in configurations[config_tag]:
                # Nodes of one cluster are numbered consecutively from
                # the configuration's node_number.
                for offset in range(unique_conf.cluster_size):
                    node_num = int(unique_conf.node_number) + offset
                    console_out(self.actor,
                                f"TEARDOWN FOR node {node_num}")
                    self.teardown(unique_conf.technology, str(node_num),
                                  run_tag, no_destroy)
        console_out(self.actor, "All servers terminated")
    else:
        console_out(self.actor, "No teardown as --no-destroy set to true")

    exit(1)
def stop_start_consumer(self, con_index, hard_close):
    """Stop (or simulate a crash of) one consumer and start it again.

    Args:
        con_index: index into ``self.consumers`` / ``self.consumer_threads``.
        hard_close: without toxiproxy, selects a hard close instead of a
            consumer cancellation.

    Any exception raised along the way is logged, not propagated.
    """
    consumer = self.consumers[con_index]
    try:
        if not self.use_toxiproxy:
            console_out(
                f"STOPPING CONSUMER {con_index+1} --------------------------------------",
                self.actor)
            if hard_close:
                consumer.perform_hard_close()
            else:
                consumer.stop_consuming()
        else:
            # Cut the proxy link first so the hard close happens while the
            # consumer is disconnected, then restore the proxy.
            console_out(
                f"SIMULATING CRASH OF CONSUMER {con_index+1} --------------------------------------",
                self.actor)
            self.broker_manager.disable_consumer_proxy(
                consumer.get_consumer_id())
            time.sleep(1)
            consumer.perform_hard_close()
            time.sleep(1)
            self.broker_manager.enable_consumer_proxy(
                consumer.get_consumer_id())

        # Give the old consume thread up to 15 seconds to finish, then
        # reconnect and consume on a new thread.
        self.consumer_threads[con_index].join(15)
        consumer.connect()
        replacement = threading.Thread(target=consumer.consume)
        self.consumer_threads[con_index] = replacement
        replacement.start()
    except Exception as e:
        console_out_exception("Failed to stop/start consumer correctly", e,
                              self.actor)
def send_to_broker():
    """Publish a timestamped greeting through ``nonproxy_publisher``.

    Returns:
        An empty string when the publish call returned normally, otherwise
        the string form of the exception (which is also logged).
    """
    try:
        greeting = f"Hello at {datetime.datetime.now()}"
        nonproxy_publisher.publish_msg_with_new_conn("", "", greeting)
    except Exception as e:
        console_out_exception("NoProxy", e, "WEB")
        return str(e)
    return ""
def start_random_single_action_and_repair(self, min_duration_seconds,
                                          max_duration_seconds):
    """Run action-and-repair cycles until ``self.stop_random`` is set.

    Args:
        min_duration_seconds: forwarded to ``single_action_and_repair``.
        max_duration_seconds: forwarded to ``single_action_and_repair``.

    A failing cycle is logged and the loop carries on with the next one.
    """
    # Truthiness test instead of `== False` (PEP 8).
    while not self.stop_random:
        try:
            self.single_action_and_repair(min_duration_seconds,
                                          max_duration_seconds)
        except Exception as e:
            console_out_exception("Failed performing action and repair", e,
                                  "TEST RUNNER")
def close_connection(self):
    """Close ``self.connection`` if it exists and is currently open.

    Does nothing when there is no connection or it is not open. Errors
    raised by the close call are logged rather than propagated.
    """
    if self.connection is None or not self.connection.is_open:
        return

    try:
        console_out("Closing connection...", self.get_actor())
        self.connection.close()
        console_out("Connection closed", self.get_actor())
    # The module is `pika.exceptions`; a misspelled module name here would
    # itself raise AttributeError while the handlers are being matched.
    except pika.exceptions.ConnectionWrongStateError:
        console_out("Cannot close connection, already closed",
                    self.get_actor())
    except Exception as e:
        console_out_exception("Failed closing connection", e,
                              self.get_actor())
def is_connection_open(self):
    """Report whether ``self.connection`` exists and is open.

    Returns:
        False when there is no connection or when the check itself raises
        (the exception is logged); otherwise the connection's ``is_open``
        value.
    """
    try:
        conn = self.connection
        return False if conn is None else conn.is_open
    except Exception as e:
        console_out_exception("Failed checking if connection is open", e,
                              self.get_actor())
        return False
def get_logs_of_all_configs(self, common_conf, configurations):
    """Retrieve logs for every unique configuration.

    Args:
        common_conf: forwarded unchanged to ``get_logs``.
        configurations: mapping of configuration tag to a list of unique
            configurations; each entry exposes ``logs_volume``.

    A failure for one configuration is logged and the remaining ones are
    still processed.
    """
    for config_tag in configurations:
        for unique_conf in configurations[config_tag]:
            try:
                node_range = self.get_start_end_nodes_of_config(unique_conf)
                start_node, end_node = node_range
                self.get_logs(common_conf, unique_conf.logs_volume,
                              start_node, end_node)
            except Exception as e:
                console_out_exception(self.actor, "Failed retrieving logs",
                                      e)
def add_proxy(self, name):
    """Register a proxy called ``name`` with the toxiproxy HTTP API.

    The proxy listens on 0.0.0.0:5672 and forwards to port 5672 on
    ``self.mgmt_ip``.

    Args:
        name: proxy name to register.

    Returns:
        True when the response status is 201, 204 or 409 (409 presumably
        means the proxy already exists - confirm against the toxiproxy
        API); False for any other status or when the request raises.
    """
    import json

    try:
        # Serialise with json.dumps so a name containing quotes or
        # backslashes is escaped instead of producing an invalid body.
        # Compact separators keep the payload identical to the previous
        # hand-built string for ordinary names.
        payload = json.dumps(
            {
                "name": name,
                "listen": "0.0.0.0:5672",
                "upstream": self.mgmt_ip + ":5672",
            },
            separators=(",", ":"))
        r = requests.post("http://toxiproxy:8474/proxies", data=payload)
        console_out(f"Proxy add response: {r}", "TEST RUNNER")
        return r.status_code in (201, 204, 409)
    except Exception as e:
        console_out_exception("Could not add proxy", e, "TEST RUNNER")
        return False
def open_persistent_connection(self):
    """Open a blocking connection and channel and keep them on ``self``.

    The URL comes from the broker manager (proxied or not, according to
    ``self.use_proxy``). Publisher confirms are switched on when
    ``self.use_confirms`` is set.

    Returns:
        True on success; False when anything raises (the error is logged).
    """
    url = self.broker_manager.get_url(self.use_proxy)
    console_out(f"Attempting to connect to {url}", self.get_actor())
    try:
        self.connection = pika.BlockingConnection(pika.URLParameters(url))
        self.channel = self.connection.channel()
        if self.use_confirms:
            self.channel.confirm_delivery()
    except Exception as e:
        console_out_exception("Connection failed", e, self.get_actor())
        return False
    return True
def start_random_consumer_actions(self, min_seconds_interval,
                                  max_seconds_interval, hard_close):
    """Perform consumer actions at random intervals until stopped.

    Args:
        min_seconds_interval: lower bound (inclusive) of the wait, seconds.
        max_seconds_interval: upper bound (inclusive) of the wait, seconds.
        hard_close: forwarded to ``do_consumer_action``.

    The loop ends once ``self.stop_random`` is set. A failing action is
    logged and the loop continues.
    """
    # Truthiness tests instead of `== False` (PEP 8).
    while not self.stop_random:
        wait_sec = random.randint(min_seconds_interval,
                                  max_seconds_interval)
        console_out(f"Will execute consumer action in {wait_sec} seconds",
                    self.actor)
        self.wait_for(wait_sec)

        # The stop flag may have been set during the wait; re-check it so
        # no action is started after a stop request.
        if not self.stop_random:
            try:
                self.do_consumer_action(hard_close)
            except Exception as e:
                console_out_exception("Failed performing consumer action",
                                      e, "TEST RUNNER")
def start_random_stop_starts(self, min_seconds_interval,
                             max_seconds_interval, hard_close):
    """Stop and restart consumers at random intervals until stopped.

    Args:
        min_seconds_interval: lower bound (inclusive) of the wait, seconds.
        max_seconds_interval: upper bound (inclusive) of the wait, seconds.
        hard_close: forwarded to ``stop_start_consumers``.

    The loop ends once ``self.stop_random`` is set. A failing stop/start is
    logged and the loop continues.
    """
    # Truthiness tests instead of `== False` (PEP 8).
    while not self.stop_random:
        wait_sec = random.randint(min_seconds_interval,
                                  max_seconds_interval)
        console_out(
            f"Will execute stop/start consumer action in {wait_sec} seconds",
            self.actor)
        self.wait_for(wait_sec)

        # The stop flag may have been set during the wait; re-check it so
        # no action is started after a stop request.
        if not self.stop_random:
            try:
                self.stop_start_consumers(hard_close)
            except Exception as e:
                console_out_exception("Failed stopping/starting consumers",
                                      e, self.actor)
def get_logs_of_all_configs(self, common_conf, configurations):
    """Retrieve logs for every unique configuration deployed on EC2.

    Args:
        common_conf: forwarded unchanged to ``get_logs``.
        configurations: mapping of configuration tag to a list of unique
            configurations; each entry exposes ``deployment`` and
            ``logs_volume``.

    Raises:
        Exception: when a configuration's deployment is not "ec2", "eks"
            or "gke".

    For "ec2", a retrieval failure is logged and processing continues.
    For "eks" and "gke", only a "not yet supported" message is logged.
    """
    for config_tag in configurations:
        for unique_conf in configurations[config_tag]:
            deployment = unique_conf.deployment
            if deployment == "ec2":
                try:
                    node_range = self.get_start_end_nodes_of_config(
                        unique_conf)
                    start_node, end_node = node_range
                    self.get_logs(common_conf, unique_conf.logs_volume,
                                  start_node, end_node)
                except Exception as e:
                    console_out_exception(self.actor,
                                          "Failed retrieving logs", e)
            elif deployment in ("eks", "gke"):
                console_out(self.actor,
                            "Log gathering not yet supported for EKS/GKE")
            else:
                raise Exception(
                    f"Invalid deployment type: {unique_conf.deployment}")
def publish_msg_with_new_conn(self, send_to_exchange, rk, body):
    """Publish one message over a connection opened just for this call.

    Args:
        send_to_exchange: exchange to publish to.
        rk: routing key.
        body: message body.

    When ``self.use_confirms`` is set, publisher confirms are enabled and
    the message is published as mandatory. ``self.pos_acks`` is incremented
    when the publish call returns normally, ``self.undeliverable`` on an
    unroutable error and ``self.neg_acks`` on a nack. Any other failure is
    logged; nothing is raised to the caller.
    """
    url = self.broker_manager.get_url(self.use_proxy)
    connection = None
    try:
        connection = pika.BlockingConnection(pika.URLParameters(url))
        channel = connection.channel()

        mandatory = False
        if self.use_confirms:
            channel.confirm_delivery()
            mandatory = True

        corr_id = str(uuid.uuid4())
        properties = pika.BasicProperties(content_type='text/plain',
                                          delivery_mode=2,
                                          correlation_id=corr_id)

        try:
            channel.basic_publish(exchange=send_to_exchange,
                                  routing_key=rk,
                                  body=body,
                                  mandatory=mandatory,
                                  properties=properties)
            self.pos_acks += 1
        except exceptions.UnroutableError:
            self.undeliverable += 1
            # Report only every 100th undeliverable message.
            if self.undeliverable % 100 == 0:
                console_out(
                    f"{str(self.undeliverable)} messages could not be delivered",
                    self.get_actor())
        except exceptions.NackError:
            self.neg_acks += 1
    except Exception as e:
        console_out_exception(f"Connection to {url} failed", e,
                              self.get_actor())
    finally:
        # Close on every path so a failed publish does not leak the
        # connection that was opened for this single message.
        if connection is not None and connection.is_open:
            try:
                connection.close()
            except Exception as e:
                console_out_exception(
                    f"Failed closing connection to {url}", e,
                    self.get_actor())
def connect(self):
    """Connect this consumer to the node the broker manager assigns it.

    Stores the node name, connection and channel on ``self``, and applies
    ``self.prefetch`` as the channel prefetch count when it is positive.

    Returns:
        True on success; False when anything raises (the error is logged).
    """
    try:
        manager = self.broker_manager
        self.connected_node = manager.get_current_node(self.consumer_id)
        ip = manager.get_node_ip(self.connected_node)
        console_out(f"Connecting to {self.connected_node}",
                    self.get_actor())

        credentials = pika.PlainCredentials('jack', 'jack')
        port = manager.get_consumer_port(self.connected_node,
                                         self.consumer_id)
        parameters = pika.ConnectionParameters(ip, port, '/', credentials)

        self.connection = pika.BlockingConnection(parameters)
        self.channel = self.connection.channel()

        if self.prefetch > 0:
            self.channel.basic_qos(prefetch_count=self.prefetch)
    except Exception as e:
        console_out_exception("Failed trying to connect.", e,
                              self.get_actor())
        return False
    return True
def main():
    """Run a series of randomized chaos tests against a RabbitMQ cluster.

    Reads its settings from the command line, then for each test run:
    deploys the brokers, creates the queue, starts the message monitor,
    consumers and publisher, optionally starts chaos and consumer-action
    threads, waits for the configured duration and a grace period, and
    finally compares what was confirmed with what was consumed. A summary
    of all runs is printed at the end.
    """
    print("random-test.py")
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    test_name = get_mandatory_arg(args, "--test-name")
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))

    # qq_max_length is only bound for quorum queues; it is only read in the
    # quorum branch of the queue creation below.
    if queue_type == "quorum":
        qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))

    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    log_messages = is_true(get_optional_arg(args, "--log-msgs", "false"))

    publisher_count = int(get_optional_arg(args, "--publishers", "1"))

    # With a publisher, the default print interval is five times the
    # in-flight limit; without one it defaults to 1000.
    if publisher_count > 0:
        in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
        print_mod = int(
            get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}"))
        sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    else:
        print_mod = int(get_optional_arg(args, "--print-mod", f"1000"))

    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    rmq_version = get_optional_arg_validated(args, "--rmq-version",
                                             "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])
    stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash",
                                           ["crash", "close", "cancel"])

    # "crash" routes consumers through toxiproxy, "close" hard-closes them,
    # and "cancel" leaves both flags off.
    use_toxiproxy = False
    consumer_hard_close = False
    if stop_mode == "crash":
        use_toxiproxy = True
    elif stop_mode == "close":
        consumer_hard_close = True

    include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true"))
    if include_chaos:
        chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
        chaos_min_interval = int(
            get_optional_arg(args, "--chaos-min-interval", "60"))
        chaos_max_interval = int(
            get_optional_arg(args, "--chaos-max-interval", "120"))

    include_con_actions = is_true(
        get_optional_arg(args, "--consumer-actions", "true"))
    if include_con_actions:
        con_action_min_interval = int(
            get_optional_arg(args, "--consumer-min-interval", "20"))
        con_action_max_interval = int(
            get_optional_arg(args, "--consumer-max-interval", "60"))

    # Accumulated across all runs for the final summary.
    failed_test_log = list()
    failed_tests = set()

    for test_number in range(tests):

        print("")
        subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"])
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        broker_manager = BrokerManager()
        broker_manager.deploy(cluster_size, new_cluster, rmq_version,
                              use_toxiproxy)
        initial_nodes = broker_manager.get_initial_nodes()

        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        # Retry queue creation every 5 seconds until it succeeds.
        # NOTE(review): a queue_type other than "mirrored" or "quorum"
        # never sets queue_created, so this loop would not terminate.
        queue_created = False
        while queue_created == False:
            if queue_type == "mirrored":
                if sac_enabled:
                    queue_created = broker_manager.create_standard_sac_queue(
                        mgmt_node, queue_name, cluster_size)
                else:
                    queue_created = broker_manager.create_standard_queue(
                        mgmt_node, queue_name, cluster_size)
            elif queue_type == "quorum":
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(test_name, test_number, print_mod,
                                     analyze, log_messages)
        chaos = ChaosExecutor(initial_nodes)

        # Any chaos mode other than "partitions" or "nodes" leaves the
        # executor with its default configuration.
        if include_chaos:
            if chaos_mode == "partitions":
                chaos.only_partitions()
            elif chaos_mode == "nodes":
                chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        if consumer_count > 0:
            consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                               "TEST RUNNER", use_toxiproxy)
            consumer_manager.add_consumers(consumer_count, test_number,
                                           queue_name, prefetch)
            consumer_manager.start_consumers()

        # NOTE(review): the publisher is only created when publisher_count
        # is exactly 1, yet later code reads `publisher` whenever
        # publisher_count > 0 - a value above 1 would hit an unbound name.
        if publisher_count == 1:
            publisher = RabbitPublisher(1, test_number, broker_manager,
                                        in_flight_max, 120, print_mod)
            publisher.configure_sequence_direct(queue_name, count, 0,
                                                sequence_count)

            pub_thread = threading.Thread(target=publisher.start_publishing)
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        # NOTE(review): consumer_manager is only bound when
        # consumer_count > 0; consumer actions with zero consumers would
        # hit an unbound name here.
        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval,
                      consumer_hard_close))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        # Main wait loop: one-second ticks with a progress line each minute.
        # stop_please is not assigned in this function - presumably a
        # module-level flag; verify where it is set.
        ctr = 0
        run_seconds = run_minutes * 60
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1

                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    "TEST RUNNER")
                break

        # Ask the background loops to stop, then wait up to 30 seconds for
        # each thread.
        try:
            chaos.stop_random_single_action_and_repair()

            if consumer_count > 0:
                consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join(30)

            if include_con_actions:
                consumer_action_thread.join(30)
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        if publisher_count > 0:
            publisher.stop_publishing()

        if consumer_count > 0:
            console_out("Resuming consumers", "TEST RUNNER")
            consumer_manager.resume_all_consumers()

        console_out("Starting grace period for consumer to catch up",
                    "TEST RUNNER")
        ctr = 0

        # Grace period: ends early once every message in the publisher's
        # set has also been seen by the monitor.
        try:
            while ctr < grace_period_sec:
                if publisher_count > 0 and msg_monitor.get_unique_count(
                ) >= publisher.get_pos_ack_count() and len(
                        publisher.get_msg_set().difference(
                            msg_monitor.get_msg_set())) == 0:
                    break
                time.sleep(1)
                ctr += 1
        except KeyboardInterrupt:
            console_out("Grace period ended", "TEST RUNNER")

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        if publisher_count > 0:
            confirmed_set = publisher.get_msg_set()
            not_consumed_msgs = confirmed_set.difference(
                msg_monitor.get_msg_set())
            console_out(
                f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")
        else:
            not_consumed_msgs = set()
            console_out(
                f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")

        success = True
        if consumer_count > 0:
            if len(not_consumed_msgs) > 0:
                if sac_enabled:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                else:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages."
                )
                failed_tests.add(test_number)

                # List the unconsumed messages in sorted order, stopping
                # after just over 500 lines.
                lost_ctr = 0
                sorted_msgs = list(not_consumed_msgs)
                sorted_msgs.sort()
                for msg in sorted_msgs:
                    console_out(f"Lost? {msg}", "TEST RUNNER")
                    lost_ctr += 1
                    if lost_ctr > 500:
                        console_out("More than 500, truncated list",
                                    "TEST RUNNER")
                        break

                success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out(f"FAILED TEST: Received out-of-order messages",
                        "TEST RUNNER")
            failed_test_log.append(
                f"Test {test_number} FAILURE: Received out-of-order messages"
            )
            failed_tests.add(test_number)

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        # Per-run cleanup; a failure here is logged and does not abort the
        # remaining runs.
        try:
            if consumer_count > 0:
                consumer_manager.stop_all_consumers()

            if publisher_count == 1:
                pub_thread.join(30)
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out_exception("Failed to clean up test correctly.", e,
                                  "TEST RUNNER")

        broker_manager.zip_log_files(test_name, test_number)
        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")

    console_out("", "TEST RUNNER")
    console_out("SUMMARY", "TEST RUNNER")
    console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}",
                "TEST RUNNER")
    for line in failed_test_log:
        console_out(line, "TEST RUNNER")

    console_out("TEST RUN COMPLETE", "TEST RUNNER")
def consume(self):
    """Consume from ``self.queue_name`` until ``self.terminate`` is set.

    Runs a loop that (re)connects when the connection or channel is missing
    or closed, registers ``self.callback`` as the consumer and then blocks
    in ``start_consuming``. Errors are logged and followed by a 5 second
    wait before the next attempt. After a channel error or an unexpected
    error the consumer also disconnects, and aborts the loop if that
    disconnect reports failure.
    """
    # Reset per-run state on every (re)start of the consume loop.
    self.terminate = False
    self.hard_close = False
    self.last_msg = ""

    while True:
        try:
            if self.terminate == True:
                break

            # Reconnect when either the connection or the channel is
            # unusable; on failure wait and retry from the top.
            if self.connection is None or self.connection.is_closed or self.channel is None or self.channel.is_closed:
                if self.reconnect() == False:
                    self.wait_for(5)
                    continue

            self.consumer_tag = self.channel.basic_consume(
                self.queue_name, self.callback)
            console_out(
                f"Consuming queue: {self.queue_name} with consumer tag: {self.consumer_tag}",
                self.get_actor())
            self.set_actor()
            self.channel.start_consuming()
        # Handler order matters: the specific ConnectionClosed case is
        # listed before the broader AMQP error classes and the catch-all.
        except pika.exceptions.ConnectionClosed as e:
            # An exception after a requested stop just ends the loop.
            if self.terminate == True:
                break
            console_out_exception(
                f"Connection was closed. Last msg acked: {self.last_msg}",
                e, self.get_actor())
            self.wait_for(5)
            continue
        except pika.exceptions.AMQPChannelError as e:
            if self.terminate == True:
                break
            console_out_exception(
                f"Caught a channel error. Last msg acked: {self.last_msg}",
                e, self.get_actor())
            self.wait_for(5)
            # Disconnect so the next iteration reconnects; give up when
            # the disconnect itself reports failure.
            if self.disconnect():
                self.connected_node = "none"
                continue
            else:
                self.terminate = True
                console_out("Aborting consumer", self.get_actor())
                break
        except pika.exceptions.AMQPConnectionError as e:
            if self.terminate == True:
                break
            console_out_exception(
                f"Connection error. Last msg acked: {self.last_msg}", e,
                self.get_actor())
            self.wait_for(5)
            continue
        except Exception as e:
            if self.terminate == True:
                break
            console_out_exception(
                f"Unexpected error. Last msg acked: {self.last_msg}", e,
                self.get_actor())
            self.wait_for(5)
            # Same recovery as for a channel error.
            if self.disconnect():
                self.connected_node = "none"
                continue
            else:
                self.terminate = True
                console_out("Aborting consumer", self.get_actor())
                break