class TestPfcwdWb(SetupPfcwdFunc):
    """ Test PFCwd warm-reboot function and supporting methods """
    def storm_detect_path(self, port, queue, first_detect_after_wb=False):
        """
        Storm detection action and associated verifications

        Args:
            port(string) : DUT port
            queue(int): queue on the port that will be stormed
            first_detect_after_wb(bool): first detect iteration after warm reboot (default: False)
        """
        # for the first iteration after wb, do not write a marker to the log but specify the start msg from
        # where to search the logs
        start_marker = None
        if first_detect_after_wb:
            start_marker = "NOTICE swss#orchagent: :- setWarmStartState: orchagent warm start state changed to initialized"
        self.loganalyzer = LogAnalyzer(ansible_host=self.dut,
                                       marker_prefix="pfcwd_wb_storm_detect_port_{}_queue_{}".format(port, queue),
                                       start_marker=start_marker)
        marker = self.loganalyzer.init()
        # NOTE(review): short settle delay after writing the marker — presumably lets syslog flush; confirm
        time.sleep(5)
        ignore_file = os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages")
        reg_exp = self.loganalyzer.parse_regexp_file(src=ignore_file)
        self.loganalyzer.ignore_regex.extend(reg_exp)
        self.loganalyzer.expect_regex = []
        self.loganalyzer.expect_regex.extend([EXPECT_PFC_WD_DETECT_RE])
        self.loganalyzer.match_regex = []

        # ongoing storm. no need to start a new one
        if not first_detect_after_wb:
            if not self.pfc_wd['fake_storm']:
                # real storm: start PFC frames from the fanout and give each queue time to trigger detection
                self.storm_handle[port][queue].start_storm()
                time.sleep(15 * len(self.pfc_wd['queue_indices']))
            else:
                # fake storm: flip the queue's storm status directly on the DUT via its OID
                PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "enabled")
                time.sleep(5)
        else:
            # for the first iteration after wb, check the log for detect msgs for the ongoing storms
            # (one detect message expected per stormed port/queue combination)
            self.loganalyzer.expected_matches_target = len(self.ports) * len(self.pfc_wd['queue_indices'])
            time.sleep(20)

        # storm detect check
        logger.info("Verify if PFC storm is detected on port {} queue {}".format(port, queue))
        self.loganalyzer.analyze(marker)

    def storm_restore_path(self, port, queue):
        """
        Storm restoration action and associated verifications

        Args:
            port(string) : DUT port
            queue(int): queue on the port where storm would be restored
        """
        # reuse the loganalyzer created by the preceding detect step; only move the marker forward
        marker = self.loganalyzer.update_marker_prefix("pfcwd_wb_storm_restore_port_{}_queue_{}".format(port, queue))
        time.sleep(5)
        ignore_file = os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages")
        reg_exp = self.loganalyzer.parse_regexp_file(src=ignore_file)
        self.loganalyzer.ignore_regex.extend(reg_exp)
        self.loganalyzer.expect_regex = []
        self.loganalyzer.expect_regex.extend([EXPECT_PFC_WD_RESTORE_RE])
        self.loganalyzer.match_regex = []
        # NOTE(review): target of 0 with a restore expect_regex — presumably means "at least one match
        # is not required / any count accepted" in this LogAnalyzer; confirm against its API
        self.loganalyzer.expected_matches_target = 0
        if not self.pfc_wd['fake_storm']:
            self.storm_handle[port][queue].stop_storm()
            time.sleep(15)
        else:
            PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "disabled")
            time.sleep(5)

        # storm restore check
        logger.info("Verify if PFC storm is restored on port {}".format(port))
        self.loganalyzer.analyze(marker)

    def defer_fake_storm(self, port, queue, start_defer, stop_defer):
        """
        Enable and later disable a fake storm on a queue after the given delays.

        Runs in a background thread (see run_test). Waits on DUT_ACTIVE before each
        toggle — presumably an event that is set only while the DUT is up, so the
        toggles do not race with the warm reboot; confirm against DUT_ACTIVE's definition.

        Args:
            port(string): DUT port
            queue(int): queue on the port
            start_defer(int): seconds to wait before enabling the fake storm
            stop_defer(int): seconds the fake storm stays enabled before being disabled
        """
        time.sleep(start_defer)
        DUT_ACTIVE.wait()
        PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "enabled")
        time.sleep(stop_defer)
        DUT_ACTIVE.wait()
        PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "disabled")

    def run_test(self, port, queue, detect=True, storm_start=True,
                 first_detect_after_wb=False, storm_defer=False):
        """
        Test method that invokes the storm detection and restoration path
        which includes the traffic test

        Args:
            port(string) : DUT port
            queue(int): queue on the port which would be stormed/restored
            detect(bool): if the detect logic needs to be called (default: True)
            storm_start(bool): used to decide certain actions in the detect logic (default: True)
            first_detect_after_wb(bool): used to decide certain actions in the detect logic (default: False)
            storm_defer(bool): use the storm defer logic or not (default: False)
        """
        # for deferred storm, return to main loop for next action which is warm boot
        if storm_defer:
            if not self.pfc_wd['fake_storm']:
                self.storm_handle[port][queue].start_storm()
                self.storm_handle[port][queue].stop_storm()
            else:
                # fake storm is deferred via a daemon thread so the warm boot can proceed in parallel
                thread = InterruptableThread(
                    target=self.defer_fake_storm,
                    args=(port, queue, self.pfc_wd['storm_start_defer'],
                          self.pfc_wd['storm_stop_defer']))
                thread.daemon = True
                thread.start()
                self.storm_threads.append(thread)
            return

        if detect:
            if storm_start or first_detect_after_wb:
                logger.info("--- Storm detection path for port {} queue {} ---".format(port, queue))
                self.storm_detect_path(port, queue, first_detect_after_wb=first_detect_after_wb)
        else:
            logger.info("--- Storm restoration path for port {} queue {} ---".format(port, queue))
            self.storm_restore_path(port, queue)

        # test pfcwd functionality on a storm/restore
        self.traffic_inst.verify_wd_func(detect=detect)

    @pytest.fixture(autouse=True)
    def pfcwd_wb_test_cleanup(self):
        """
        Cleanup method: stops storms/threads and disables PFCwd after every test in this class.
        """
        yield

        # stop all threads that might stuck in wait
        DUT_ACTIVE.set()
        for thread in self.storm_threads:
            # short join: collect any exception the thread raised without blocking cleanup
            thread_exception = thread.join(timeout=0.1,
                                           suppress_exception=True)
            if thread_exception:
                logger.debug("Exception in thread %r:", thread)
                logger.debug(
                    "".join(traceback.format_exception(*thread_exception))
                )
        self.stop_all_storm()
        time.sleep(5)
        logger.info("--- Stop PFC WD ---")
        self.dut.command("pfcwd stop")

    def stop_all_storm(self):
        """
        Stop all the storms after each test run

        A storm_handle entry of None marks a fake storm, which is disabled
        via PfcCmd instead of a storm-generator handle.
        """
        if self.storm_handle:
            logger.info("--- Stopping storm on all ports ---")
            for port in self.storm_handle:
                for queue in self.storm_handle[port]:
                    if self.storm_handle[port][queue]:
                        logger.info("--- Stop pfc storm on port {} queue {}".format(port, queue))
                        self.storm_handle[port][queue].stop_storm()
                    else:
                        logger.info("--- Disabling fake storm on port {} queue {}".format(port, queue))
                        PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "disabled")

    def pfcwd_wb_helper(self, fake_storm, testcase_actions, setup_pfc_test, fanout_graph_facts, ptfhost,
                        duthost, localhost, fanouthosts, two_queues):
        """
        Helper method that initializes the vars and starts the test execution

        Args:
            fake_storm(bool): if fake storm is enabled or disabled
            testcase_actions(list): list of actions that the test will go through
            setup_pfc_test(fixture): module scoped autouse fixture
            fanout_graph_facts(fixture): fanout info
            ptfhost(AnsibleHost): PTF instance
            duthost(AnsibleHost): DUT instance
            localhost(AnsibleHost): local instance
            fanouthosts(AnsibleHost): fanout instance
        """
        setup_info = setup_pfc_test
        self.fanout_info = fanout_graph_facts
        self.ptf = ptfhost
        self.dut = duthost
        self.fanout = fanouthosts
        self.timers = setup_info['pfc_timers']
        self.ports = setup_info['selected_test_ports']
        self.neighbors = setup_info['neighbors']
        dut_facts = self.dut.facts
        self.peer_dev_list = dict()
        # deterministic per-day seed — same day reruns pick the same randomized choices
        self.seed = int(datetime.datetime.today().day)
        self.two_queues = two_queues
        self.storm_handle = dict()
        # bitmask records the PREVIOUS action: bitmask = 1 << ACTIONS[action].
        # From its uses below, bit 0 -> detect, bit 1 -> restore, bit 2 -> storm defer —
        # exact mapping depends on the ACTIONS table defined elsewhere in this module; confirm there.
        bitmask = 0
        storm_deferred = 0
        storm_restored = 0
        self.max_wait = 0
        self.fake_storm = fake_storm
        self.oid_map = dict()
        self.storm_threads = []

        for t_idx, test_action in enumerate(testcase_actions):
            if 'warm-reboot' in test_action:
                reboot(self.dut, localhost, reboot_type="warm")
                continue

            # one of the factors to decide if the storm needs to be started
            storm_restored = bitmask and (bitmask & 2)
            # if the action prior to the warm-reboot was a 'storm_defer', ensure that all the storms are
            # stopped
            storm_deferred = bitmask and (bitmask & 4)
            if storm_deferred:
                logger.info("Wait for all the deferred storms to start and stop ...")
                join_all(self.storm_threads, self.max_wait)
                self.storm_threads = []
                self.storm_handle = dict()

            bitmask = (1 << ACTIONS[test_action])
            for p_idx, port in enumerate(self.ports):
                logger.info("")
                logger.info("--- Testing on {} ---".format(port))
                self.setup_test_params(port, setup_info['vlan'], p_idx)
                for q_idx, queue in enumerate(self.pfc_wd['queue_indices']):
                    # (re)create storm state on the first action, or after deferred storms were joined
                    if not t_idx or storm_deferred:
                        if not q_idx:
                            self.storm_handle[port] = dict()
                        self.storm_handle[port][queue] = None

                        # setup the defer parameters if the storm is deferred currently
                        if (bitmask & 4):
                            self.storm_defer_setup()

                        if not self.pfc_wd['fake_storm']:
                            self.storm_setup(port, queue, storm_defer=(bitmask & 4))
                        else:
                            self.oid_map[(port, queue)] = PfcCmd.get_queue_oid(self.dut, port, queue)

                    self.traffic_inst = SendVerifyTraffic(self.ptf, dut_facts['router_mac'], self.pfc_wd, queue)
                    # t_idx == 2: presumably the first non-reboot action after the warm reboot in every
                    # TESTCASE_INFO sequence — verify against the sequences defined elsewhere in this module
                    self.run_test(port, queue, detect=(bitmask & 1),
                                  storm_start=not t_idx or storm_deferred or storm_restored,
                                  first_detect_after_wb=(t_idx == 2 and not p_idx and not q_idx and not storm_deferred),
                                  storm_defer=(bitmask & 4))

    @pytest.fixture(params=['no_storm', 'storm', 'async_storm'])
    def testcase_action(self, request):
        """
        Parameters to invoke the pfcwd warm boot test

        Args:
            request(pytest) : pytest request object

        Yields:
            testcase_action(string) : testcase to execute
        """
        yield request.param

    def test_pfcwd_wb(self, fake_storm, testcase_action, setup_pfc_test, fanout_graph_facts, ptfhost,
                      duthosts, rand_one_dut_hostname, localhost, fanouthosts, two_queues):
        """
        Tests PFCwd warm reboot with various testcase actions

        Args:
            fake_storm(fixture): fake storm status
            testcase_action(fixture): testcase to execute (values: 'no_storm', 'storm', 'async_storm')
                'no_storm' : PFCwd storm detection/restore before and after warm reboot
                'storm' : PFC storm started and detected before warm-reboot. Storm is ongoing
                          during warm boot and lasts past the warm boot finish. Verifies if the
                          storm is detected after warm-reboot. PFC storm is stopped and restored
                          after warm boot
                'async_storm': PFC storm asynchronously starts at a random time and lasts a random
                               period at fanout. Warm reboot is done. Wait for all the storms to
                               finish and then verify the storm detect/restore logic
            setup_pfc_test(fixture) : Module scoped autouse fixture for PFCwd
            fanout_graph_facts(fixture) : fanout graph info
            ptfhost(AnsibleHost) : ptf host instance
            duthost(AnsibleHost) : DUT instance
            localhost(AnsibleHost) : localhost instance
            fanouthosts(AnsibleHost): fanout instance
        """
        duthost = duthosts[rand_one_dut_hostname]
        logger.info("--- {} ---".format(TESTCASE_INFO[testcase_action]['desc']))
        self.pfcwd_wb_helper(fake_storm, TESTCASE_INFO[testcase_action]['test_sequence'], setup_pfc_test,
                             fanout_graph_facts, ptfhost, duthost, localhost, fanouthosts, two_queues)
def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_syntax_enabled):
    """Stress a container's memory past the threshold and verify Monit's restart behavior.

    Phase 1: asynchronously run the 'stress' utility inside the container, then use
    Loganalyzer to confirm the expected stop/start alerting messages appear in syslog
    and wait until the container is actually restarted.

    Phase 2: stress the container again and poll its memory usage for ~3 minutes. With
    the new Monit syntax the container must be restarted a second time (all alerting
    messages found); without it Monit cannot reset its counter, so no alerting messages
    are expected.

    Args:
        duthost: The AnsibleHost object of DuT.
        container_name: Name of container.
        vm_workers: Number of workers which does the spinning on malloc()/free()
          to consume memory.
        new_syntax_enabled: Checks to make sure container will be restarted if it is
          set to be `True`.

    Returns:
        None.
    """
    expected_alerting_messages = [
        ".*restart_service.*Restarting service 'telemetry'.*",
        ".*Stopping Telemetry container.*",
        ".*Stopped Telemetry container.*",
        ".*Starting Telemetry container.*",
        ".*Started Telemetry container.*",
    ]

    loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="test_memory_checker")
    loganalyzer.expect_regex = []
    loganalyzer.expect_regex.extend(expected_alerting_messages)
    marker = loganalyzer.init()

    # Kick off the memory hog in the background; the main thread watches syslog.
    worker_pool = ThreadPool()
    worker_pool.apply_async(consume_memory, (duthost, container_name, vm_workers))

    logger.info("Sleep '{}' seconds to wait for the alerting messages from syslog ...".format(WAITING_SYSLOG_MSG_SECS))
    time.sleep(WAITING_SYSLOG_MSG_SECS)

    logger.info("Checking the alerting messages related to container restart ...")
    loganalyzer.analyze(marker)
    logger.info("Found all the expected alerting messages from syslog!")

    logger.info("Waiting for '{}' container to be restarted ...".format(container_name))
    restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
                           CONTAINER_CHECK_INTERVAL_SECS,
                           0,
                           check_container_state, duthost, container_name, True)
    pytest_assert(restarted, "Failed to restart '{}' container!".format(container_name))
    logger.info("'{}' container is restarted.".format(container_name))

    # Second round: stress again and see whether Monit reacts within its counter window.
    logger.info("Running 'stress' utility again in '{}' ...".format(container_name))
    worker_pool.apply_async(consume_memory, (duthost, container_name, vm_workers))

    marker = loganalyzer.update_marker_prefix("test_monit_counter")
    logger.info("Checking memory usage of '{}' every 30 seconds for 6 times ...".format(container_name))
    for _ in range(6):
        mem_usage = get_container_mem_usage(duthost, container_name)
        logger.info("Memory usage of '{}' is '{}'".format(container_name, mem_usage))
        time.sleep(30)

    logger.info("Analyzing syslog messages to verify whether '{}' is restarted ...".format(container_name))
    analyzing_result = loganalyzer.analyze(marker, fail=False)
    if new_syntax_enabled:
        pytest_assert(analyzing_result["total"]["expected_match"] == len(expected_alerting_messages),
                      "Monit still can not restart '{}' with the help of new syntax!".format(container_name))
        logger.info("Monit was able to restart '{}' with the help of new syntax!".format(container_name))
    else:
        pytest_assert(analyzing_result["total"]["expected_match"] == 0,
                      "Monit can reset counter and restart '{}'!".format(container_name))
        logger.info("Monit was unable to reset its counter and '{}' can not be restarted!".format(container_name))