示例#1
0
 def __init__(self, args, suffix):
     """Populate the broker configuration from the parsed arguments.

     Every lookup hands ``suffix`` to the argument helper together with the
     flag name. Flags read through the ``get_mandatory_*`` helpers have no
     default here; the remaining flags fall back to the defaults listed
     below. ``cluster_size`` is the only value converted to ``int``; every
     other value is stored exactly as the helper returns it.
     """
     self.suffix = suffix

     def mandatory(flag):
         return get_mandatory_arg(args, flag, suffix)

     def mandatory_choice(flag, allowed):
         return get_mandatory_arg_validated(args, flag, suffix, allowed)

     def optional(flag, default):
         return get_optional_arg(args, flag, suffix, default)

     def connect_mode(flag):
         # Both connect-to-node flags accept the same values and default.
         return get_optional_arg_validated(
             args, flag, suffix,
             ["roundrobin", "local", "non-local", "random"], "roundrobin")

     self.config_tag = mandatory("--config-tag")
     self.technology = mandatory_choice("--technology", ["rabbitmq"])

     raw_cluster_size = optional("--cluster-size", "1")
     self.cluster_size = int(raw_cluster_size)

     self.broker_version = mandatory("--version")
     # for GCP deployment only
     self.volume_size = optional("--volume-size", "50")
     self.filesystem = mandatory_choice("--filesystem", ["ext4", "xfs"])
     self.tenancy = mandatory_choice("--tenancy", ["default", "dedicated"])
     self.core_count = mandatory("--core-count")
     self.threads_per_core = mandatory("--threads-per-core")

     # The default vars file name is derived from the chosen technology.
     default_vars_file = f".variables/{self.technology}-generic-vars.yml"
     self.vars_file = optional("--vars-file", default_vars_file)

     self.no_tcp_delay = optional("--no-tcp-delay", "true")
     self.policies_file = optional("--policies-file", "none")
     self.pub_connect_to_node = connect_mode("--pub-connect-to-node")
     self.con_connect_to_node = connect_mode("--con-connect-to-node")

     # Starts at -1; nothing in this constructor assigns a real number.
     self.node_number = -1
示例#2
0
    def __init__(self, args, suffix):
        """Extend the base configuration with instance and volume settings.

        Calls the parent constructor first, then reads the instance type and
        the settings for up to three volumes (IOPS per GB, size, type and
        mountpoint), followed by which volume hosts the data, logs, quorum
        and WAL directories. All values are stored as returned by the
        argument helpers.
        """
        super().__init__(args, suffix)
        sfx = self.suffix

        def optional(flag, default):
            return get_optional_arg(args, flag, sfx, default)

        def volume_type(flag):
            # Same allowed values and default for every volume.
            allowed = [
                "ebs-io1", "ebs-st1", "ebs-sc1", "ebs-gp2", "local-nvme",
                "pd-ssd"
            ]
            return get_optional_arg_validated(args, flag, sfx, allowed,
                                              "ebs-gp2")

        self.generic_unix_url = optional("--generic-unix-url",
                                         "must-be-using-eks")
        self.instance = get_mandatory_arg(args, "--instance", sfx)

        # only applicable to io1, else ignored
        self.volume1_iops_per_gb = optional("--volume1-iops-per-gb", "50")
        self.volume2_iops_per_gb = optional("--volume2-iops-per-gb", "50")
        self.volume3_iops_per_gb = optional("--volume3-iops-per-gb", "50")

        # Only volume1 has a non-zero default size.
        self.volume1_size = optional("--volume1-size", "50")
        self.volume2_size = optional("--volume2-size", "0")
        self.volume3_size = optional("--volume3-size", "0")

        self.volume1_type = volume_type("--volume1-type")
        self.volume2_type = volume_type("--volume2-type")
        self.volume3_type = volume_type("--volume3-type")

        self.volume1_mountpoint = optional("--volume1-mountpoint", "/volume1")
        self.volume2_mountpoint = optional("--volume2-mountpoint", "/volume2")
        self.volume3_mountpoint = optional("--volume3-mountpoint", "/volume3")

        # Every directory kind defaults to living on volume1.
        self.data_volume = optional("--data-volume", "volume1")
        self.logs_volume = optional("--logs-volume", "volume1")
        self.quorum_volume = optional("--quorum-volume", "volume1")
        self.wal_volume = optional("--wal-volume", "volume1")
示例#3
0
    def __init__(self, args):
        """Build the run-level configuration from the parsed arguments.

        Generates a fresh ``run_id`` and then reads each flag in turn. Every
        lookup passes an empty string as the helper's third argument.
        Numeric flags are converted with ``int`` and boolean flags with
        ``is_true``; everything else is stored as returned by the helper.
        ``hosting`` is fixed to ``"aws"``.
        """

        def mandatory(flag):
            return get_mandatory_arg(args, flag, "")

        def optional(flag, default):
            return get_optional_arg(args, flag, "", default)

        def optional_int(flag, default):
            return int(optional(flag, default))

        def optional_bool(flag, default):
            return is_true(optional(flag, default))

        # Run identity and mode.
        self.run_id = str(uuid.uuid4())
        self.tags = mandatory("--tags")
        self.mode = get_optional_arg_validated(
            args, "--mode", "", ["logged-benchmark", "model"],
            "logged-benchmark")
        self.config_count = optional_int("--config-count", "1")
        self.new_instance_per_run = optional_bool("--new-instance-per-run",
                                                  "false")
        self.no_destroy = optional_bool("--no-destroy", "false")
        self.no_deploy = optional_bool("--no-deploy", "false")
        self.run_tag = optional("--run-tag", "none")
        self.playlist_file = mandatory("--playlist-file")

        # Background load settings.
        self.background_policies_file = optional("--bg-policies-file", "none")
        self.background_topology_file = optional("--bg-topology-file", "none")
        self.background_delay = optional_int("--bg-delay", "0")
        self.background_step_seconds = optional_int("--bg-step-seconds", "0")
        self.background_step_repeat = optional_int("--bg-step-repeat", "0")

        # Scheduling and per-step overrides.
        self.gap_seconds = int(mandatory("--gap-seconds"))
        self.repeat_count = optional_int("--repeat", "1")
        self.parallel_count = optional_int("--parallel", "1")
        self.override_step_seconds = optional_int("--override-step-seconds",
                                                  "0")
        self.override_step_repeat = optional_int("--override-step-repeat",
                                                 "0")
        self.override_step_msg_limit = optional_int(
            "--override-step-msg-limit", "0")
        self.override_broker_hosts = optional("--override-broker-hosts", "")

        # AWS resources.
        self.ami = mandatory("--ami")
        self.broker_sg = mandatory("--broker-sg")
        self.loadgen_sg = mandatory("--loadgen-sg")
        self.loadgen_instance = mandatory("--loadgen-instance")
        self.subnet = mandatory("--subnet")
        self.key_pair = mandatory("--keypair")

        # Credentials; the postgres password goes through the no-print helper.
        self.username = "******"
        self.password = mandatory("--password")
        self.postgres_url = mandatory("--postgres-jdbc-url")
        self.postgres_user = mandatory("--postgres-user")
        self.postgres_pwd = get_mandatory_arg_no_print(
            args, "--postgres-password", "")

        self.node_counter = optional_int("--start-node-num-from", "1")
        self.hosting = "aws"
        self.log_level = optional("--log-level", "info")
示例#4
0
    def __init__(self, args):
        """Build the run-level configuration from the parsed arguments.

        Generates a fresh ``run_id`` and then reads each flag in turn. Every
        lookup passes an empty string as the helper's third argument.
        Values wrapped in ``int`` or ``is_true`` below are converted; the
        rest (including ``attempts``, ``warmUpSeconds`` and
        ``grace_period_sec``) are stored as returned by the helper.
        """

        def mandatory(flag):
            return get_mandatory_arg(args, flag, "")

        def optional(flag, default):
            return get_optional_arg(args, flag, "", default)

        def optional_int(flag, default):
            return int(optional(flag, default))

        def optional_bool(flag, default):
            return is_true(optional(flag, default))

        self.run_id = str(uuid.uuid4())
        self.tags = mandatory("--tags")
        self.mode = get_optional_arg_validated(
            args, "--mode", "", ["benchmark", "model"], "benchmark")
        self.config_count = optional_int("--config-count", "1")
        self.new_instance_per_run = optional_bool("--new-instance-per-run",
                                                  "false")
        self.no_destroy = optional_bool("--no-destroy", "false")
        self.no_deploy = optional_bool("--no-deploy", "false")
        self.restart_brokers = optional_bool("--restart-brokers", "true")
        self.run_tag = optional("--run-tag", "none")
        self.playlist_file = mandatory("--playlist-file")

        # note that for AWS, background load has been moved to playlists.
        # TODO: do same for GCP. The five background_* settings are GCP only.
        self.background_policies_file = optional("--bg-policies-file", "none")
        self.background_topology_file = optional("--bg-topology-file", "none")
        self.background_delay = optional_int("--bg-delay-seconds", "0")
        self.background_step_seconds = optional_int("--bg-step-seconds", "0")
        self.background_step_repeat = optional_int("--bg-step-repeat", "0")

        self.gap_seconds = int(mandatory("--gap-seconds"))
        # NOTE(review): the attribute is named *_ms but the flag is
        # --start-allowance-seconds; confirm which unit consumers expect.
        self.start_allowance_ms = optional_int("--start-allowance-seconds",
                                               "60")
        self.repeat_count = optional_int("--repeat", "1")
        self.parallel_count = optional_int("--parallel", "1")
        self.override_step_seconds = optional_int("--override-step-seconds",
                                                  "0")
        self.override_step_repeat = optional_int("--override-step-repeat",
                                                 "0")
        self.override_step_msg_limit = optional_int(
            "--override-step-msg-limit", "0")
        self.override_broker_hosts = optional("--override-broker-hosts", "")
        self.federation_enabled = optional_bool("--federation-enabled",
                                                "false")
        self.attempts = optional("--attempts", "1")
        self.warmUpSeconds = optional("--warm-up-seconds", "0")

        # model mode only. Value: dataloss,duplicates,ordering,consumption,
        # connectivity. Don't use ordering unless one consumer per queue.
        self.checks = optional("--checks", "dataloss,duplicates,connectivity")
        self.grace_period_sec = optional("--grace-period-sec", "60")

        # Credentials; the postgres password goes through the no-print helper.
        self.username = "******"
        self.password = mandatory("--password")
        self.postgres_url = mandatory("--postgres-jdbc-url")
        self.postgres_user = mandatory("--postgres-user")
        self.postgres_pwd = get_mandatory_arg_no_print(
            args, "--postgres-password", "")

        self.node_counter = optional_int("--start-node-num-from", "1")
        self.log_level = optional("--log-level", "info")
        self.influx_subpath = mandatory("--influx-subpath")
def main():
    """Run repeated chaos tests against a quorum queue and report the results.

    For each of ``--tests`` runs this deploys a cluster, creates the queue
    (single-active-consumer or plain quorum, per ``--sac``), starts one
    publisher and one consumer, executes ``--actions`` chaos actions at
    random intervals, repairs the cluster, waits for the consumer to catch
    up, and then prints whether any confirmed messages were lost or
    received out of order.
    """
    print("quorum-queue-test.py")
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    # NOTE(review): kept as the raw argument value (not converted to int),
    # exactly as before; confirm BrokerManager accepts that.
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])

    for test_number in range(1, tests + 1):

        print("")
        console_out(
            f"TEST RUN: {test_number} of {tests}--------------------------",
            "TEST RUNNER")
        setup_complete = False

        # Redeploy from scratch until the queue can be created; each
        # deployment gets up to 20 creation attempts, 5 seconds apart.
        while not setup_complete:
            broker_manager = BrokerManager()
            broker_manager.deploy(cluster_size, True, rmq_version, False)
            initial_nodes = broker_manager.get_initial_nodes()

            console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

            print_mod = in_flight_max * 5
            queue_name = queue + "_" + str(test_number)

            mgmt_node = broker_manager.get_random_init_node()
            queue_created = False
            qc_ctr = 0
            while not queue_created and qc_ctr < 20:
                qc_ctr += 1
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, 0)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, 0)

                if queue_created:
                    setup_complete = True
                else:
                    time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor("qqt", test_number, print_mod, True,
                                     False)
        publisher = RabbitPublisher(1, test_number, broker_manager,
                                    in_flight_max, 120, print_mod)
        publisher.configure_sequence_direct(queue_name, count, 0, 1)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", False)
        consumer_manager.add_consumers(1, test_number, queue_name, prefetch)

        chaos = ChaosExecutor(initial_nodes)

        # Any other chaos mode leaves the executor's default behaviour.
        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.start_publishing)
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(1, actions + 1):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action",
                        "TEST RUNNER")
            time.sleep(wait_sec)

            console_out(
                f"execute chaos action {action_num}/{actions} of test {test_number}",
                "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop_publishing()

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")
        ctr = 0

        # Stop waiting once every confirmed message has been received, or
        # once the grace period has elapsed and no message has arrived for
        # more than 15 seconds.
        while True:
            since_last_msg = (datetime.datetime.now() -
                              msg_monitor.get_last_msg_time())
            all_received = (
                msg_monitor.get_unique_count() >=
                publisher.get_pos_ack_count()
                and not publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set()))
            if all_received:
                break
            if ctr > grace_period_sec and since_last_msg.total_seconds() > 15:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")

        if lost_msgs:
            console_out(f"Lost messages count: {len(lost_msgs)}",
                        "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")
        success = True

        if msg_monitor.get_out_of_order():
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if lost_msgs:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        # Best-effort teardown: a failure here is reported but must not
        # abort the remaining test runs.
        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            # Stop the monitor as well so its thread does not outlive the
            # test; without this one monitor thread leaks per test run.
            msg_monitor.stop_consuming()
            monitor_thread.join(10)
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {test_number} COMPLETE", "TEST RUNNER")
示例#6
0
def main():
    """Interactively publish to and consume from a single queue.

    Deploys (or reuses) a cluster, creates the requested mirrored or quorum
    queue, starts the configured publishers and consumers, and then reads
    commands from stdin until interrupted with Ctrl-C. After the interrupt
    it stops the publishers, gives consumers up to 300 seconds to catch up,
    prints the confirmed/received counts and, when ``--analyze`` is true,
    whether messages were lost or received out of order.
    """
    print("publish-consume.py")
    args = get_args(sys.argv)

    # cluster
    new_cluster = is_true(
        get_optional_arg_validated(args, "--new-cluster", "false",
                                   ["true", "false"]))
    if new_cluster:
        cluster_size = int(get_mandatory_arg(args, "--cluster-size"))
    else:
        cluster_size = int(get_optional_arg(args, "--cluster-size", "3"))

    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])

    # queues and exchanges
    exchanges = as_list(get_optional_arg(args, "--exchanges", ""))
    queue_name = get_mandatory_arg(args, "--queue")
    queue_type = get_optional_arg_validated(args, "--queue-type", "mirrored",
                                            ["mirrored", "quorum"])
    qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))
    rep_factor = int(get_optional_arg(args, "--rep-factor", str(cluster_size)))
    sac_enabled = is_true(
        get_optional_arg_validated(args, "--sac", "false", ["true", "false"]))

    if rmq_version == "3.7":
        if sac_enabled:
            console_out("Cannot use SAC mode with RabbitMQ 3.7", "TEST RUNNER")
            sys.exit(1)

        if queue_type == "quorum":
            console_out("Cannot use quorum queues with RabbitMQ 3.7",
                        "TEST RUNNER")
            sys.exit(1)

    # publisher
    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    pub_mode = get_optional_arg_validated(args, "--pub-mode", "direct",
                                          ["direct", "exchange"])
    msg_mode = get_optional_arg_validated(
        args, "--msg-mode", "sequence",
        ["sequence", "partitioned-sequence", "large-msgs", "hello"])
    count = int(get_mandatory_arg(args, "--msgs"))
    dup_rate = float(get_optional_arg(args, "--dup-rate", "0"))
    sequence_count = int(get_optional_arg(args, "--sequences", 1))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))

    # consumers
    consumer_count = int(get_optional_arg(args, "--consumers", "1"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    analyze = is_true(
        get_optional_arg_validated(args, "--analyze", "true",
                                   ["true", "false"]))

    # Convert to int so a value supplied on the command line has the same
    # type as the computed default.
    print_mod = int(get_optional_arg(args, "--print-mod", in_flight_max * 5))

    broker_manager = BrokerManager()
    broker_manager.deploy(cluster_size, new_cluster, rmq_version, False)

    # Retry queue creation every 5 seconds until it succeeds.
    mgmt_node = broker_manager.get_random_init_node()
    queue_created = False
    while not queue_created:
        if queue_type == "mirrored":
            if sac_enabled:
                queue_created = broker_manager.create_standard_sac_queue(
                    mgmt_node, queue_name, rep_factor)
            else:
                queue_created = broker_manager.create_standard_queue(
                    mgmt_node, queue_name, rep_factor)
        elif queue_type == "quorum":
            if sac_enabled:
                queue_created = broker_manager.create_quorum_sac_queue(
                    mgmt_node, queue_name, rep_factor, qq_max_length)
            else:
                queue_created = broker_manager.create_quorum_queue(
                    mgmt_node, queue_name, rep_factor, qq_max_length)

        if not queue_created:
            time.sleep(5)

    broker_manager.declare_exchanges(queue_name, exchanges)

    time.sleep(10)

    if consumer_count > 0:
        msg_monitor = MessageMonitor("pub-con", 1, print_mod, analyze, False)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", False)
        consumer_manager.add_consumers(consumer_count, 1, queue_name, prefetch)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

    if publisher_count > 0:
        pub_manager = PublisherManager(broker_manager, 1, "TEST RUNNER",
                                       publisher_count, in_flight_max,
                                       print_mod)

        if pub_mode == "direct":
            # Dispatch on the message mode. These branches previously
            # compared pub_mode, which is always "direct" here, so
            # partitioned-sequence and large-msgs silently published
            # hello messages instead.
            if msg_mode == "sequence":
                pub_manager.add_sequence_direct_publishers(
                    queue_name, count, dup_rate, sequence_count)
            elif msg_mode == "partitioned-sequence":
                print("Cannot use partitioned sequence mode with direct mode")
                sys.exit(1)
            elif msg_mode == "large-msgs":
                msg_size = int(get_mandatory_arg(args, "--msg-size"))
                pub_manager.add_large_msgs_direct_publishers(
                    queue_name, count, dup_rate, msg_size)
            else:
                pub_manager.add_hello_msgs_direct_publishers(
                    queue_name, count, dup_rate)
        elif pub_mode == "exchange":
            if len(exchanges) == 0:
                console_out("No exchanges provided", "TEST RUNNER")
                sys.exit(1)

            if msg_mode == "sequence":
                pub_manager.add_sequence_to_exchanges_publishers(
                    exchanges, "", count, dup_rate, sequence_count)
            elif msg_mode == "partitioned-sequence":
                pub_manager.add_partitioned_sequence_to_exchanges_publishers(
                    exchanges, count, dup_rate, sequence_count)
            elif msg_mode == "large-msgs":
                msg_size = int(get_mandatory_arg(args, "--msg-size"))
                pub_manager.add_large_msgs_to_exchanges_publishers(
                    exchanges, "", count, dup_rate, msg_size)
            else:
                pub_manager.add_hello_msgs_to_exchanges_publishers(
                    exchanges, "", count, dup_rate)

        pub_manager.start_publishers()

    # Interactive loop; Ctrl-C ends it and starts the shutdown sequence.
    # NOTE(review): consumer_manager only exists when consumer_count > 0, so
    # entering a command with --consumers 0 raises NameError.
    while True:
        try:
            console_out(
                "Press + to add a consumer, - to remove a consumer, ! to remove the active consumer (SAC only)",
                "TEST_RUNNER")
            input_str = input()
            if input_str == "+":
                consumer_manager.add_consumer_and_start_consumer(
                    1, queue_name, prefetch)
            elif input_str == "-":
                consumer_manager.stop_and_remove_oldest_consumer()
            else:
                # Any other input is forwarded as-is.
                consumer_manager.stop_and_remove_specfic_consumer(input_str)
        except KeyboardInterrupt:
            if publisher_count > 0:
                console_out(
                    "Stopping publishers. Starting grace period for consumers to catch up.",
                    "TEST_RUNNER")
                pub_manager.stop_all_publishers()
            break

    if publisher_count > 0 and consumer_count > 0:
        # Wait up to 300 seconds for every confirmed message to be received;
        # a second Ctrl-C cuts the grace period short.
        try:
            ctr = 0
            while ctr < 300:
                all_received = (
                    msg_monitor.get_unique_count() >=
                    pub_manager.get_total_pos_ack_count()
                    and not pub_manager.get_total_msg_set().difference(
                        msg_monitor.get_msg_set()))
                if all_received:
                    break
                time.sleep(1)
                ctr += 1
        except KeyboardInterrupt:
            console_out("Grace period ended", "TEST RUNNER")

        confirmed_set = pub_manager.get_total_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {pub_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        if analyze:
            success = True
            if lost_msgs:
                console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}",
                            "TEST RUNNER")
                success = False

            if msg_monitor.get_out_of_order():
                success = False
                console_out("FAILED TEST: Received out-of-order messages",
                            "TEST RUNNER")

            if success:
                console_out("TEST OK", "TEST RUNNER")

    elif publisher_count > 0:
        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {pub_manager.get_total_pos_ack_count()}",
            "TEST RUNNER")
    elif consumer_count > 0:
        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

    console_out("RESULTS END------------------------------------",
                "TEST RUNNER")

    # Best-effort teardown: report a failure rather than crash on exit.
    try:
        if consumer_count > 0:
            consumer_manager.stop_all_consumers()
            msg_monitor.stop_consuming()
            monitor_thread.join(10)
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e),
                    "TEST RUNNER")

    console_out("TEST 1 COMPLETE", "TEST RUNNER")
示例#7
0
def main():
    """Run a series of randomized chaos / consumer-action tests.

    Command-line options read (via ``get_args(sys.argv)``):

    Mandatory: ``--test-name``, ``--tests``, ``--run-minutes``,
    ``--consumers``, ``--grace-period-sec``, ``--queue``,
    ``--queue-type`` (``mirrored`` or ``quorum``), ``--sac``.

    Optional (default): ``--pre-fetch`` (10), ``--analyze`` (true),
    ``--qq-max-length`` (0, quorum queues only), ``--log-msgs`` (false),
    ``--publishers`` (1; only 0 or 1 is supported), ``--in-flight-max`` (10),
    ``--print-mod`` (5 * in-flight-max, or 1000 without a publisher),
    ``--sequences`` (1), ``--new-cluster`` (true), ``--cluster`` (3),
    ``--rmq-version`` (3.8-beta), ``--stop-mode`` (crash),
    ``--chaos-actions`` (true), ``--chaos-mode`` (mixed),
    ``--chaos-min-interval`` (60), ``--chaos-max-interval`` (120),
    ``--consumer-actions`` (true), ``--consumer-min-interval`` (20),
    ``--consumer-max-interval`` (60).

    For every test run the function deploys the brokers, creates the queue,
    starts the message monitor, consumers and publisher, optionally starts
    the chaos and consumer-action threads, waits ``--run-minutes``, stops
    everything, gives the consumers a grace period to catch up and then
    reports confirmed messages that were never consumed and out-of-order
    deliveries. A summary of failed runs is printed at the end.

    Exits with status 1 when ``--queue-type`` is unsupported or more than
    one publisher is requested, because the rest of the function cannot
    handle either case.
    """
    print("random-test.py")
    # NOTE: the SIGINT handler registration below is disabled; `stop_please`
    # is not assigned in this function and is read from module scope.
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    test_name = get_mandatory_arg(args, "--test-name")
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))

    # Fail fast: any other value would never set `queue_created` below and
    # the queue creation loop would spin forever.
    if queue_type not in ("mirrored", "quorum"):
        console_out(
            f"Unsupported --queue-type '{queue_type}', expected mirrored or quorum",
            "TEST RUNNER")
        sys.exit(1)

    if queue_type == "quorum":
        qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))

    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    log_messages = is_true(get_optional_arg(args, "--log-msgs", "false"))

    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    # Only a single publisher object is ever created by this test. Reject
    # larger counts up front instead of failing with a NameError at the end
    # of the run when the (never created) publisher is stopped.
    if publisher_count > 1:
        console_out(
            f"Unsupported --publishers {publisher_count}, this test supports 0 or 1 publisher",
            "TEST RUNNER")
        sys.exit(1)
    has_publisher = publisher_count > 0

    if has_publisher:
        in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
        print_mod = int(
            get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}"))
        sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    else:
        print_mod = int(get_optional_arg(args, "--print-mod", "1000"))

    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])
    stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash",
                                           ["crash", "close", "cancel"])

    # "crash" routes traffic through toxiproxy, "close" hard-closes
    # consumers, "cancel" uses neither flag.
    use_toxiproxy = stop_mode == "crash"
    consumer_hard_close = stop_mode == "close"

    include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true"))
    if include_chaos:
        chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
        chaos_min_interval = int(
            get_optional_arg(args, "--chaos-min-interval", "60"))
        chaos_max_interval = int(
            get_optional_arg(args, "--chaos-max-interval", "120"))

    include_con_actions = is_true(
        get_optional_arg(args, "--consumer-actions", "true"))
    if include_con_actions and consumer_count <= 0:
        # The consumer manager only exists when there are consumers, so
        # consumer actions cannot be started without them.
        console_out("No consumers configured, consumer actions disabled",
                    "TEST RUNNER")
        include_con_actions = False

    if include_con_actions:
        con_action_min_interval = int(
            get_optional_arg(args, "--consumer-min-interval", "20"))
        con_action_max_interval = int(
            get_optional_arg(args, "--consumer-max-interval", "60"))

    failed_test_log = []
    failed_tests = set()

    for test_number in range(tests):

        print("")
        subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"])
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        broker_manager = BrokerManager()
        broker_manager.deploy(cluster_size, new_cluster, rmq_version,
                              use_toxiproxy)
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        queue_created = False

        # Retry queue creation every 5 seconds until the broker manager
        # reports something other than False. The explicit `== False`
        # comparison is kept on purpose: a None result also ends the loop.
        while queue_created == False:
            if queue_type == "mirrored":
                if sac_enabled:
                    queue_created = broker_manager.create_standard_sac_queue(
                        mgmt_node, queue_name, cluster_size)
                else:
                    queue_created = broker_manager.create_standard_queue(
                        mgmt_node, queue_name, cluster_size)
            else:  # "quorum", guaranteed by the validation above
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(test_name, test_number, print_mod,
                                     analyze, log_messages)
        chaos = ChaosExecutor(initial_nodes)

        if include_chaos:
            # Any other chaos mode (e.g. the default "mixed") leaves the
            # executor with its default configuration.
            if chaos_mode == "partitions":
                chaos.only_partitions()
            elif chaos_mode == "nodes":
                chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        if consumer_count > 0:
            consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                               "TEST RUNNER", use_toxiproxy)
            consumer_manager.add_consumers(consumer_count, test_number,
                                           queue_name, prefetch)
            consumer_manager.start_consumers()

        if has_publisher:
            publisher = RabbitPublisher(1, test_number, broker_manager,
                                        in_flight_max, 120, print_mod)
            publisher.configure_sequence_direct(queue_name, count, 0,
                                                sequence_count)

            pub_thread = threading.Thread(target=publisher.start_publishing)
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval,
                      consumer_hard_close))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        # Main wait loop: one tick per second, progress logged every minute.
        ctr = 0
        run_seconds = run_minutes * 60
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1

                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    "TEST RUNNER")
                break

        # Stop the background actions first so the cluster and consumers can
        # settle before the results are evaluated.
        try:
            chaos.stop_random_single_action_and_repair()

            if consumer_count > 0:
                consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join(30)

            if include_con_actions:
                consumer_action_thread.join(30)
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        if has_publisher:
            publisher.stop_publishing()

        if consumer_count > 0:
            console_out("Resuming consumers", "TEST RUNNER")
            consumer_manager.resume_all_consumers()

            console_out("Starting grace period for consumer to catch up",
                        "TEST RUNNER")
            ctr = 0

            try:
                while ctr < grace_period_sec:
                    # End the grace period early once every confirmed
                    # message has been seen by the monitor.
                    if (has_publisher
                            and msg_monitor.get_unique_count() >=
                            publisher.get_pos_ack_count()
                            and len(publisher.get_msg_set().difference(
                                msg_monitor.get_msg_set())) == 0):
                        break
                    time.sleep(1)
                    ctr += 1
            except KeyboardInterrupt:
                console_out("Grace period ended", "TEST RUNNER")

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        if has_publisher:
            confirmed_set = publisher.get_msg_set()
            not_consumed_msgs = confirmed_set.difference(
                msg_monitor.get_msg_set())
            console_out(
                f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")
        else:
            not_consumed_msgs = set()
            console_out(
                f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")

        success = True
        if consumer_count > 0:
            if len(not_consumed_msgs) > 0:
                if sac_enabled:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                else:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages."
                )
                failed_tests.add(test_number)

                # List the missing messages in sorted order, cutting the
                # listing off once more than 500 have been printed.
                for lost_ctr, msg in enumerate(sorted(not_consumed_msgs),
                                               start=1):
                    console_out(f"Lost? {msg}", "TEST RUNNER")
                    if lost_ctr > 500:
                        console_out("More than 500, truncated list",
                                    "TEST RUNNER")
                        break

                success = False

            if msg_monitor.get_out_of_order() == True:
                success = False
                console_out("FAILED TEST: Received out-of-order messages",
                            "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Received out-of-order messages"
                )
                failed_tests.add(test_number)

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        # Best-effort teardown: a failure here is logged but must not stop
        # the remaining test runs.
        try:
            if consumer_count > 0:
                consumer_manager.stop_all_consumers()

            if has_publisher:
                pub_thread.join(30)
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out_exception("Failed to clean up test correctly.", e,
                                  "TEST RUNNER")

        broker_manager.zip_log_files(test_name, test_number)
        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")

    console_out("", "TEST RUNNER")
    console_out("SUMMARY", "TEST RUNNER")
    console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}",
                "TEST RUNNER")
    for line in failed_test_log:
        console_out(line, "TEST RUNNER")

    console_out("TEST RUN COMPLETE", "TEST RUNNER")