Exemplo n.º 1
0
def check_not_implemented_warnings(duthosts, enum_rand_one_per_hwsku_hostname):
    """Scan the DUT log for platform API "not implemented" warnings.

    Generator with a setup half and a teardown half split by ``yield``:
    a log marker is placed before yielding; after control returns, the
    "not implemented" warning pattern is added to the analyzer's match
    list and the log written since the marker is analyzed.

    Args:
        duthosts: Collection of DUT hosts, indexed by hostname.
        enum_rand_one_per_hwsku_hostname: Hostname of the selected DUT.
    """
    dut = duthosts[enum_rand_one_per_hwsku_hostname]
    analyzer = LogAnalyzer(ansible_host=dut, marker_prefix="platformapi_test")
    start_marker = analyzer.init()

    yield

    # The pattern is registered only now, right before the analysis runs.
    not_implemented_patterns = ['WARNING pmon#platform_api_server.py: API.+not implemented']
    analyzer.match_regex.extend(not_implemented_patterns)
    analyzer.analyze(start_marker)
Exemplo n.º 2
0
def disable_analyzer_for_mellanox(duthost):
    """Wrap the test in a dedicated log analysis on Mellanox DUTs.

    Generator with a setup half and a teardown half split by ``yield``.
    When the DUT's ``asic_type`` fact is ``mellanox``, a LogAnalyzer with
    the ``sfp_cfg`` marker prefix is created, the common configuration is
    loaded, the kernel "Eeprom query failed" pattern is added to the ignore
    list and a start marker is written. After the yield the log is analyzed
    from that marker. On any other ASIC nothing is done on either side.

    Args:
        duthost: DUT host object exposing a ``facts`` mapping.
    """
    # Evaluate the ASIC check exactly once: teardown must mirror setup, and
    # re-reading the facts after the yield could reference an analyzer that
    # was never created.
    is_mellanox = duthost.facts["asic_type"] in ["mellanox"]
    loganalyzer = None
    marker = None

    if is_mellanox:
        loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix='sfp_cfg')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("kernel.*Eeprom query failed*")
        marker = loganalyzer.init()

    yield

    if is_mellanox:
        loganalyzer.analyze(marker)
def ignore_particular_error_log(request, duthosts, rand_one_dut_hostname):
    """Analyze the DUT log around a test while ignoring caller-given patterns.

    Generator with a setup half and a teardown half split by ``yield``.
    The ignore patterns are taken from ``request.param`` and added on top of
    the analyzer's common configuration; the log between the start marker
    and the end of the test is then analyzed.

    Args:
        request: Object whose ``param`` attribute holds the regexes to ignore.
        duthosts: Collection of DUT hosts, indexed by hostname.
        rand_one_dut_hostname: Hostname of the selected DUT.
    """
    dut = duthosts[rand_one_dut_hostname]
    analyzer = LogAnalyzer(
        ansible_host=dut,
        marker_prefix='turn_on_off_psu_and_check_psustatus')
    analyzer.load_common_config()

    extra_ignores = request.param
    analyzer.ignore_regex.extend(extra_ignores)
    start_marker = analyzer.init()

    yield

    analyzer.analyze(start_marker)
def test_monitoring_critical_processes(duthosts, rand_one_dut_hostname, tbinfo):
    """Verify that stopped critical processes are reported in syslog.

    The critical processes of the selected containers are stopped, the test
    waits for the alerting messages and checks them with the log analyzer.
    Afterwards the configuration is reloaded and the test confirms that the
    critical processes are running again and that the post-check (which is
    given the BGP neighbors that were established beforehand) passes.

    Args:
        duthosts: list of DUTs.
        rand_one_dut_hostname: hostname of DUT.
        tbinfo: Testbed information.

    Returns:
        None.
    """
    duthost = duthosts[rand_one_dut_hostname]
    analyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="monitoring_critical_processes")
    analyzer.expect_regex = []

    # Remember which BGP sessions were up so the post-check can compare.
    neighbors = duthost.get_bgp_neighbors()
    up_bgp_neighbors = [
        address.lower()
        for address, info in neighbors.items()
        if info["state"] == "established"
    ]

    skip_containers = ["database", "gbsyncd"]
    # Skip 'radv' container on devices whose role is not T0.
    if tbinfo["topo"]["type"] != "t0":
        skip_containers.append("radv")

    containers_in_namespaces = get_containers_namespace_ids(duthost, skip_containers)

    alert_patterns = get_expected_alerting_messages(duthost, containers_in_namespaces)
    analyzer.expect_regex.extend(alert_patterns)
    start_marker = analyzer.init()

    stop_critical_processes(duthost, containers_in_namespaces)

    # Wait for 70 seconds such that Supervisord has a chance to write alerting message into syslog.
    logger.info("Sleep 70 seconds to wait for the alerting message...")
    time.sleep(70)

    logger.info("Checking the alerting messages from syslog...")
    analyzer.analyze(start_marker)
    logger.info("Found all the expected alerting messages from syslog!")

    logger.info("Executing the config reload...")
    config_reload(duthost)
    logger.info("Executing the config reload was done!")

    ensure_all_critical_processes_running(duthost, containers_in_namespaces)

    post_check_ok = postcheck_critical_processes_status(duthost, up_bgp_neighbors)
    if not post_check_ok:
        pytest.fail("Post-check failed after testing the container checker!")
    logger.info("Post-checking status of critical processes and BGP sessions was done!")
Exemplo n.º 5
0
def ignore_log_analyzer_by_vendor(request, duthosts, enum_rand_one_per_hwsku_hostname):
    """Run log analysis around a test unless the DUT's ASIC is exempt.

    Generator with a setup half and a teardown half split by ``yield``.
    ``request.param`` lists the ASIC types for which no analysis is done.
    For every other ASIC a LogAnalyzer (marker prefix = test node name) is
    started before the yield and the log is analyzed afterwards.

    Args:
        request: Object providing ``param`` (exempt ASIC types) and
            ``node.name`` (used as marker prefix).
        duthosts: Collection of DUT hosts, indexed by hostname.
        enum_rand_one_per_hwsku_hostname: Hostname of the selected DUT.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_hostname]
    asic_type = duthost.facts["asic_type"]
    exempt_asics = request.param

    # Exempt vendor: hand control to the test without any log analysis.
    if asic_type in exempt_asics:
        yield
        return

    analyzer = LogAnalyzer(ansible_host=duthost, marker_prefix=request.node.name)
    analyzer.load_common_config()
    start_marker = analyzer.init()
    yield
    analyzer.analyze(start_marker)
Exemplo n.º 6
0
    def storm_detect_path(self, dut, port, action):
        """
        Start the watchdog and a PFC storm, then verify the storm is detected

        Args:
            dut(AnsibleHost) : DUT instance
            port(string) : DUT port
            action(string) : PTF test action

        Returns:
            loganalyzer(Loganalyzer) : instance used for the detection check
        """
        restore_time = self.timers['pfc_wd_restore_time_large']
        detect_time = self.timers['pfc_wd_detect_time']

        analyzer = LogAnalyzer(
            ansible_host=self.dut,
            marker_prefix="pfc_function_storm_detect_{}_port_{}".format(action, port))
        start_marker = analyzer.init()
        ignore_patterns = analyzer.parse_regexp_file(
            src=os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages"))
        analyzer.ignore_regex.extend(ignore_patterns)
        analyzer.expect_regex = [EXPECT_PFC_WD_DETECT_RE]
        analyzer.match_regex = []

        is_dontcare = action == "dontcare"

        # For every explicit action the watchdog is started before the storm.
        if not is_dontcare:
            start_wd_on_ports(dut, port, restore_time, detect_time, action)

        if not self.pfc_wd['fake_storm']:
            self.storm_hndle.start_storm()

        # For 'dontcare' the buffer is filled first and the watchdog is
        # started with the "drop" action only after the storm has begun.
        if is_dontcare:
            self.traffic_inst.fill_buffer()
            start_wd_on_ports(dut, port, restore_time, detect_time, "drop")

        # placing this here to cover all action types. for 'dontcare' action, wd is started much later after the pfc storm is started
        if self.pfc_wd['fake_storm']:
            PfcCmd.set_storm_status(dut, self.queue_oid, "enabled")

        time.sleep(5)

        # storm detect
        logger.info("Verify if PFC storm is detected on port {}".format(port))
        analyzer.analyze(start_marker)
        self.stats.get_pkt_cnts(self.queue_oid, begin=True)

        # test pfcwd functionality on a storm
        effective_action = "drop" if is_dontcare else action
        self.traffic_inst.verify_wd_func(effective_action)
        return analyzer
Exemplo n.º 7
0
def disable_analyzer_for_mellanox(duthost):
    """Wrap the test in a dedicated log analysis on Mellanox DUTs.

    Generator with a setup half and a teardown half split by ``yield``.
    When the DUT's ``asic_type`` fact is ``mellanox``, a LogAnalyzer with
    the ``sfp_cfg`` marker prefix is created, the common configuration is
    loaded, two patterns (kernel "Eeprom query failed" and the xcvrd PMPE
    error event) are added to the ignore list and a start marker is written.
    After the yield the log is analyzed from that marker. On any other ASIC
    nothing is done on either side.

    Args:
        duthost: DUT host object exposing a ``facts`` mapping.
    """
    # Evaluate the ASIC check exactly once: teardown must mirror setup, and
    # re-reading the facts after the yield could reference an analyzer that
    # was never created.
    is_mellanox = duthost.facts["asic_type"] in ["mellanox"]
    loganalyzer = None
    marker = None

    if is_mellanox:
        loganalyzer = LogAnalyzer(ansible_host=duthost,
                                  marker_prefix='sfp_cfg')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("kernel.*Eeprom query failed*")
        # Ignore PMPE error https://github.com/Azure/sonic-buildimage/issues/7163
        loganalyzer.ignore_regex.append(
            r".*ERR pmon#xcvrd: Receive PMPE error event on module.*")
        marker = loganalyzer.init()

    yield

    if is_mellanox:
        loganalyzer.analyze(marker)
Exemplo n.º 8
0
def check_syslog(duthost, prefix, trigger_action, expected_log,
                 restore_action):
    """Run a command on the DUT and require a log line to appear in syslog.

    The restore command is always executed, whether or not the expected
    log line was found.

    Args:
        duthost: DUT host object used to run the commands.
        prefix: Marker prefix for the log analyzer.
        trigger_action: Command expected to produce the log line.
        expected_log: Regex the log analyzer must find.
        restore_action: Command executed afterwards in all cases.

    Raises:
        LogAnalyzerError: re-raised after logging when the analysis fails.
    """
    analyzer = LogAnalyzer(ansible_host=duthost, marker_prefix=prefix)
    analyzer.expect_regex = [expected_log]

    try:
        start_marker = analyzer.init()
        duthost.command(trigger_action)
        logger.info("Check for expected log {} in syslog".format(expected_log))
        analyzer.analyze(start_marker)
    except LogAnalyzerError as analyzer_error:
        logger.error("Unable to find expected log in syslog")
        raise analyzer_error
    finally:
        duthost.command(restore_action)
Exemplo n.º 9
0
def test_container_checker(duthosts, rand_one_dut_hostname, tbinfo):
    """Verify that stopped containers are reported in the alerting messages.

    Containers (except the skipped ones) are stopped, then the test waits
    and checks through the log analyzer that the expected alerting message
    for every stopped container shows up in syslog.

    Args:
        duthosts: list of DUTs.
        rand_one_dut_hostname: hostname of DUT.
        tbinfo: Testbed information.

    Returns:
        None.
    """
    duthost = duthosts[rand_one_dut_hostname]
    analyzer = LogAnalyzer(ansible_host=duthost,
                           marker_prefix="container_checker")
    analyzer.expect_regex = []

    autorestart_states = duthost.get_container_autorestart_states()
    disabled_containers = get_disabled_container_list(duthost)

    # Work on a copy so the list of disabled containers stays untouched.
    skip_containers = disabled_containers[:]
    skip_containers.append("gbsyncd")
    # Skip 'radv' container on devices whose role is not T0.
    if tbinfo["topo"]["type"] != "t0":
        skip_containers.append("radv")

    stopped_container_list = stop_containers(
        duthost, autorestart_states, skip_containers)
    pytest_assert(
        len(stopped_container_list) > 0, "None of containers was stopped!")

    alert_patterns = get_expected_alerting_messages(stopped_container_list)
    analyzer.expect_regex.extend(alert_patterns)
    start_marker = analyzer.init()

    # Wait for 2 minutes such that Monit has a chance to write alerting message into syslog.
    logger.info("Sleep 2 minutes to wait for the alerting message...")
    time.sleep(130)

    logger.info("Checking the alerting messages from syslog...")
    analyzer.analyze(start_marker)
    logger.info("Found all the expected alerting messages from syslog!")
Exemplo n.º 10
0
def consumes_memory_and_checks_container_restart(duthost, container_name, vm_workers):
    """Invokes the 'stress' utility to consume memory more than the threshold asynchronously
    and checks whether the container can be stopped and restarted. Loganalyzer is leveraged
    to check whether the log messages related to container stopped were generated.

    Args:
        duthost: The AnsibleHost object of DuT.
        container_name: A string represents the name of container.
        vm_workers: Number of workers which does the spinning on malloc()/free()
          to consume memory.

    Returns:
        None.
    """
    # NOTE(review): the expected messages are hard-coded for the telemetry
    # container regardless of `container_name` - confirm this is intended.
    expected_alerting_messages = [
        ".*restart_service.*Restarting service 'telemetry'.*",
        ".*Stopping Telemetry container.*",
        ".*Stopped Telemetry container.*",
        ".*Starting Telemetry container.*",
        ".*Started Telemetry container.*",
    ]

    loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="container_restart_due_to_memory")
    loganalyzer.expect_regex = []
    loganalyzer.expect_regex.extend(expected_alerting_messages)
    marker = loganalyzer.init()

    thread_pool = ThreadPool()
    thread_pool.apply_async(consume_memory, (duthost, container_name, vm_workers))
    # No further work is submitted: close the pool so its worker threads
    # exit once the memory-consuming task finishes instead of lingering.
    # The task itself is not awaited; it keeps running in the background.
    thread_pool.close()

    logger.info("Sleep '{}' seconds to wait for the alerting messages from syslog ...".format(WAITING_SYSLOG_MSG_SECS))
    time.sleep(WAITING_SYSLOG_MSG_SECS)

    logger.info("Checking the alerting messages related to container stopped ...")
    loganalyzer.analyze(marker)
    logger.info("Found all the expected alerting messages from syslog!")

    logger.info("Waiting for '{}' container to be restarted ...".format(container_name))
    restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
                           CONTAINER_CHECK_INTERVAL_SECS,
                           0,
                           check_container_state, duthost, container_name, True)
    pytest_assert(restarted, "Failed to restart '{}' container!".format(container_name))
    logger.info("'{}' container is restarted.".format(container_name))
Exemplo n.º 11
0
    def test_pfcwd_port_toggle(self, request, fake_storm, setup_pfc_test,
                               fanout_graph_facts, tbinfo, ptfhost, duthosts,
                               rand_one_dut_hostname, fanouthosts):
        """
        Test PFCWD functionality after toggling port

        Test verifies the following:
            1. Select the port and lossless queue
            2. Start PFCWD on selected test port
            3. Start PFC storm on selected test port and lossless queue
            4. Verify that PFC storm is detected
            5. Stop PFC storm on selected test port and lossless queue
            6. Verify that PFC storm is restored
            7. Toggle test port (put administratively down and then up)
            8. Verify that PFC storm is not detected

        Args:
            request(object) : pytest request object
            fake_storm(fixture) : Module scoped fixture for enable/disable fake storm
            setup_pfc_test(fixture) : Module scoped autouse fixture for PFCWD
            fanout_graph_facts(fixture) : Fanout graph info
            tbinfo(fixture) : Testbed info
            ptfhost(AnsibleHost) : PTF host instance
            duthosts(list) : DUT instances, indexed by hostname
            rand_one_dut_hostname(string) : hostname of the selected DUT
            fanouthosts(AnsibleHost): Fanout instance
        """
        duthost = duthosts[rand_one_dut_hostname]
        setup_info = setup_pfc_test
        self.fanout_info = fanout_graph_facts
        self.ptf = ptfhost
        self.dut = duthost
        self.fanout = fanouthosts
        self.timers = setup_info['pfc_timers']
        self.ports = setup_info['selected_test_ports']
        self.neighbors = setup_info['neighbors']
        dut_facts = self.dut.facts
        self.peer_dev_list = dict()
        self.fake_storm = fake_storm
        self.storm_hndle = None
        action = "dontcare"

        for idx, port in enumerate(self.ports):
            logger.info("")
            logger.info("--- Testing port toggling with PFCWD enabled on {} ---".format(port))
            # Only the first port (idx == 0) is set up with init=True.
            self.setup_test_params(port, setup_info['vlan'], init=not idx)
            self.traffic_inst = SendVerifyTraffic(self.ptf, dut_facts['router_mac'], self.pfc_wd)
            pfc_wd_restore_time_large = request.config.getoption("--restore-time")
            # wait time before we check the logs for the 'restore' signature. 'pfc_wd_restore_time_large' is in ms.
            self.timers['pfc_wd_wait_for_restore_time'] = int(pfc_wd_restore_time_large / 1000 * 2)

            try:
                # Verify that PFC storm is detected and restored
                self.stats = PfcPktCntrs(self.dut, action)
                logger.info("{} on port {}".format(WD_ACTION_MSG_PFX[action], port))
                self.run_test(self.dut, port, action)

                # Toggle test port and verify that PFC storm is not detected
                analyzer = LogAnalyzer(
                    ansible_host=self.dut,
                    marker_prefix="pfc_function_storm_detect_{}_port_{}".format(action, port))
                start_marker = analyzer.init()
                ignore_patterns = analyzer.parse_regexp_file(
                    src=os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages"))
                analyzer.ignore_regex.extend(ignore_patterns)
                analyzer.expect_regex = [EXPECT_PFC_WD_DETECT_RE]
                analyzer.match_regex = []

                port_toggle(self.dut, tbinfo, ports=[port])

                logger.info("Verify that PFC storm is not detected on port {}".format(port))
                result = analyzer.analyze(start_marker, fail=False)
                # The detect message is "expected" by the analyzer, so zero
                # missing matches means a storm WAS detected - a failure here.
                if result["total"]["expected_missing_match"] == 0:
                    pytest.fail(result)

            except Exception as e:
                pytest.fail(str(e))

            finally:
                if self.storm_hndle:
                    logger.info("--- Stop PFC storm on port {}".format(port))
                    self.storm_hndle.stop_storm()
                else:
                    logger.info("--- Disabling fake storm on port {} queue {}".format(port, self.queue_oid))
                    PfcCmd.set_storm_status(self.dut, self.queue_oid, "disabled")
                logger.info("--- Stop PFCWD ---")
                self.dut.command("pfcwd stop")
Exemplo n.º 12
0
def advanceboot_loganalyzer(duthosts, rand_one_dut_hostname, request):
    """
    Advance reboot log analysis.
    Generator split by ``yield``: log analysis is started before the test
    body runs; afterwards the collected expect messages are analyzed, the
    offset of each entry from the reboot start is computed, and a report
    plus a summary are logged and written as JSON files.

    Args:
        duthosts : List of DUT hosts
        rand_one_dut_hostname: hostname of a randomly selected DUT
        request: pytest request object (test name, markers)
    """
    duthost = duthosts[rand_one_dut_hostname]
    test_name = request.node.name

    # The reboot type is derived from the test name.
    reboot_type = "unknown"
    if "warm" in test_name:
        reboot_type = "warm"
    elif "fast" in test_name:
        reboot_type = "fast"

    # Currently, advanced reboot test would skip for kvm platform if the test has no device_type marker for vs.
    # Doing the same skip logic in this fixture to avoid running loganalyzer without the test executed
    if duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0':
        device_marks = []
        for mark in request.node.iter_markers(name='device_type'):
            device_marks.extend(mark.args)
        if 'vs' not in device_marks:
            pytest.skip('Testcase not supported for kvm')

    extra_log_files = {
        '/var/log/swss/sairedis.rec':
        'recording on: /var/log/swss/sairedis.rec',
        '/var/log/frr/bgpd.log': ''
    }
    analyzer = LogAnalyzer(
        ansible_host=duthost,
        marker_prefix="test_advanced_reboot_{}".format(test_name),
        additional_files=extra_log_files)
    start_marker = analyzer.init()
    analyzer.load_common_config()

    ignore_file = os.path.join(TEMPLATES_DIR, "ignore_boot_messages")
    expect_file = os.path.join(TEMPLATES_DIR, "expect_boot_messages")
    ignore_reg_exp = analyzer.parse_regexp_file(src=ignore_file)
    expect_reg_exp = analyzer.parse_regexp_file(src=expect_file)

    analyzer.ignore_regex.extend(ignore_reg_exp)
    analyzer.expect_regex = []
    analyzer.expect_regex.extend(expect_reg_exp)
    analyzer.match_regex = []

    yield

    result = analyzer.analyze(start_marker, fail=False)
    analyze_result = {"time_span": dict(), "offset_from_kexec": dict()}
    offset_from_kexec = dict()

    # Dispatch each analyzed file's messages to the matching parser.
    for key, messages in result["expect_messages"].items():
        if "syslog" in key or "bgpd.log" in key:
            analyze_log_file(duthost, messages, analyze_result,
                             offset_from_kexec)
        elif "sairedis.rec" in key:
            analyze_sairedis_rec(messages, analyze_result, offset_from_kexec)

    # Express every recorded start time as seconds since the reboot start.
    for time_data in analyze_result["offset_from_kexec"].values():
        marker_start_time = time_data.get("timestamp", {}).get("Start")
        reboot_info = analyze_result.get("reboot_time", {})
        reboot_start_time = reboot_info.get("timestamp", {}).get("Start")
        have_both = (reboot_start_time and reboot_start_time != "N/A"
                     and marker_start_time)
        if have_both:
            delta = (datetime.strptime(marker_start_time, FMT)
                     - datetime.strptime(reboot_start_time, FMT))
            time_data["time_taken"] = delta.total_seconds()
        else:
            time_data["time_taken"] = "N/A"

    get_data_plane_report(analyze_result, reboot_type)
    result_summary = get_report_summary(analyze_result, reboot_type)
    logging.info(json.dumps(analyze_result, indent=4))
    logging.info(json.dumps(result_summary, indent=4))

    report_file_name = request.node.name + "_report.json"
    summary_file_name = request.node.name + "_summary.json"
    report_file_dir = os.path.realpath(
        os.path.join(os.path.dirname(__file__), "../logs/platform_tests/"))
    report_file_path = report_file_dir + "/" + report_file_name
    summary_file_path = report_file_dir + "/" + summary_file_name
    if not os.path.exists(report_file_dir):
        os.makedirs(report_file_dir)

    outputs = ((report_file_path, analyze_result),
               (summary_file_path, result_summary))
    for path, payload in outputs:
        with open(path, 'w') as fp:
            json.dump(payload, fp, indent=4)
Exemplo n.º 13
0
def advanceboot_loganalyzer(duthosts, rand_one_dut_hostname):
    """
    Advance reboot log analysis.
    Generator split by ``yield``: log analysis is started before the test
    body runs; afterwards the collected expect messages are scanned for
    service Stopping/Stopped/Starting/Started lines, per-service durations
    are computed and the result is logged as JSON.

    Args:
        duthosts : List of DUT hosts
        rand_one_dut_hostname: hostname of a randomly selected DUT
    """
    duthost = duthosts[rand_one_dut_hostname]
    loganalyzer = LogAnalyzer(ansible_host=duthost,
                              marker_prefix="test_advanced_reboot")
    marker = loganalyzer.init()
    loganalyzer.load_common_config()

    ignore_file = os.path.join(TEMPLATES_DIR, "ignore_boot_messages")
    expect_file = os.path.join(TEMPLATES_DIR, "expect_boot_messages")
    ignore_reg_exp = loganalyzer.parse_regexp_file(src=ignore_file)
    expect_reg_exp = loganalyzer.parse_regexp_file(src=expect_file)

    loganalyzer.ignore_regex.extend(ignore_reg_exp)
    loganalyzer.expect_regex = []
    loganalyzer.expect_regex.extend(expect_reg_exp)
    loganalyzer.match_regex = []

    yield

    result = loganalyzer.analyze(marker, fail=False)
    # dict.values() is a non-indexable view on Python 3; materialize it so
    # the first entry can be taken on both Python 2 and Python 3.
    all_messages = list(result["expect_messages"].values())
    if not all_messages:
        logging.error("Expected messages not found in syslog")
        return
    messages = all_messages[0]

    service_restart_times = dict()
    service_patterns = {
        "Stopping": re.compile(r'.*Stopping.*service.*'),
        "Stopped": re.compile(r'.*Stopped.*service.*'),
        "Starting": re.compile(r'.*Starting.*service.*'),
        "Started": re.compile(r'.*Started.*service.*')
    }

    def service_time_check(message, status):
        """Record the timestamp of `status` for the service named in `message`."""
        # The timestamp is whatever precedes the DUT hostname in the line;
        # the service name is the first word after "<status> ".
        timestamp = message.split(duthost.hostname)[0].strip()
        service_name = message.split(status + " ")[1].split()[0]
        service_dict = service_restart_times.get(service_name,
                                                 {"timestamp": {}})
        timestamps = service_dict.get("timestamp")
        if status in timestamps:
            # Seen before: count the repetition (first repeat yields 2).
            count_key = status + " count"
            service_dict[count_key] = service_dict.get(count_key, 1) + 1
        timestamps[status] = timestamp
        service_restart_times.update({service_name: service_dict})

    for message in messages:
        for status, pattern in service_patterns.items():
            if pattern.search(message):
                service_time_check(message, status)

    loganalyzer.save_extracted_log(dest="/tmp/log/syslog")
    logging.info(json.dumps(service_restart_times, indent=4))

    FMT = "%b %d %H:%M:%S.%f"

    def elapsed_seconds(timestamps, begin, end):
        """Return seconds from `begin` to `end`, or None if either is missing."""
        if begin not in timestamps or end not in timestamps:
            return None
        delta = (datetime.strptime(timestamps[end], FMT)
                 - datetime.strptime(timestamps[begin], FMT))
        return delta.total_seconds()

    for timings in service_restart_times.values():
        timestamps = timings["timestamp"]
        timings["stop_time"] = elapsed_seconds(timestamps, "Stopping", "Stopped")
        timings["start_time"] = elapsed_seconds(timestamps, "Starting", "Started")
        timings["reboot_time"] = elapsed_seconds(timestamps, "Stopped", "Started")

    # Merge the first matching report found under /tmp, if any.
    files = glob.glob('/tmp/*-report.json')
    if files:
        filepath = files[0]
        with open(filepath) as json_file:
            report = json.load(json_file)
        service_restart_times.update(report)
    logging.info(json.dumps(service_restart_times, indent=4))
Exemplo n.º 14
0
class TestPfcwdWb(SetupPfcwdFunc):
    """ Test PFCwd warm-reboot function and supporting methods """
    def storm_detect_path(self, port, queue, first_detect_after_wb=False):
        """
        Start (or pick up) a PFC storm and verify that it is detected

        Args:
            port(string) : DUT port
            queue(int): queue on the port that will be stormed
            first_detect_after_wb(bool): first detect iteration after warm reboot (default: False)
        """
        # for the first iteration after wb, do not write a marker to the log but specify the start msg from
        # where to search the logs
        start_marker = (
            "NOTICE swss#orchagent: :- setWarmStartState: orchagent warm start state changed to initialized"
            if first_detect_after_wb else None
        )
        analyzer = LogAnalyzer(
            ansible_host=self.dut,
            marker_prefix="pfcwd_wb_storm_detect_port_{}_queue_{}".format(port, queue),
            start_marker=start_marker)
        # Kept on the instance so the restore path can reuse the analyzer.
        self.loganalyzer = analyzer
        marker = analyzer.init()
        time.sleep(5)
        ignore_patterns = analyzer.parse_regexp_file(
            src=os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages"))
        analyzer.ignore_regex.extend(ignore_patterns)
        analyzer.expect_regex = [EXPECT_PFC_WD_DETECT_RE]
        analyzer.match_regex = []

        if first_detect_after_wb:
            # for the first iteration after wb, check the log for detect msgs for the ongoing storms
            # (ongoing storm. no need to start a new one)
            analyzer.expected_matches_target = len(self.ports) * len(self.pfc_wd['queue_indices'])
            time.sleep(20)
        elif self.pfc_wd['fake_storm']:
            PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "enabled")
            time.sleep(5)
        else:
            self.storm_handle[port][queue].start_storm()
            time.sleep(15 * len(self.pfc_wd['queue_indices']))

        # storm detect check
        logger.info("Verify if PFC storm is detected on port {} queue {}".format(port, queue))
        analyzer.analyze(marker)

    def storm_restore_path(self, port, queue):
        """
        Stop the PFC storm and verify that the restore message is logged

        Reuses the analyzer stored in self.loganalyzer with a new marker
        prefix.

        Args:
            port(string) : DUT port
            queue(int): queue on the port where storm would be restored
        """
        analyzer = self.loganalyzer
        marker = analyzer.update_marker_prefix(
            "pfcwd_wb_storm_restore_port_{}_queue_{}".format(port, queue))
        time.sleep(5)
        ignore_patterns = analyzer.parse_regexp_file(
            src=os.path.join(TEMPLATES_DIR, "ignore_pfc_wd_messages"))
        analyzer.ignore_regex.extend(ignore_patterns)
        analyzer.expect_regex = [EXPECT_PFC_WD_RESTORE_RE]
        analyzer.match_regex = []
        analyzer.expected_matches_target = 0

        if self.pfc_wd['fake_storm']:
            PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "disabled")
            time.sleep(5)
        else:
            self.storm_handle[port][queue].stop_storm()
            time.sleep(15)

        # storm restore check
        logger.info("Verify if PFC storm is restored on port {}".format(port))
        analyzer.analyze(marker)

    def defer_fake_storm(self, port, queue, start_defer, stop_defer):
        """
        Enable and later disable a fake storm, each after a delay

        Before each status change the method sleeps for the given delay and
        then waits on DUT_ACTIVE.

        Args:
            port(string) : DUT port
            queue(int): queue on the port for the fake storm
            start_defer: seconds to sleep before enabling the storm
            stop_defer: seconds to sleep before disabling the storm
        """
        schedule = ((start_defer, "enabled"), (stop_defer, "disabled"))
        for delay, status in schedule:
            time.sleep(delay)
            DUT_ACTIVE.wait()
            PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], status)

    def run_test(self, port, queue, detect=True, storm_start=True, first_detect_after_wb=False,
                 storm_defer=False):
        """
        Run the storm detection or restoration path followed by the traffic test,
        or only schedule a deferred storm

        Args:
            port(string) : DUT port
            queue(int): queue on the port which would be stormed/restored
            detect(bool): if the detect logic needs to be called (default: True)
            storm_start(bool): used to decide certain actions in the detect logic (default: True)
            first_detect_after_wb(bool): used to decide certain actions in the detect logic (default: False)
            storm_defer(bool): use the storm defer logic or not (default: False)
        """
        # for deferred storm, return to main loop for next action which is warm boot
        if storm_defer:
            if self.pfc_wd['fake_storm']:
                # Fake storms are toggled from a background thread that is
                # tracked in self.storm_threads.
                deferred = InterruptableThread(
                    target=self.defer_fake_storm,
                    args=(port, queue, self.pfc_wd['storm_start_defer'],
                          self.pfc_wd['storm_stop_defer']))
                deferred.daemon = True
                deferred.start()
                self.storm_threads.append(deferred)
            else:
                handle = self.storm_handle[port][queue]
                handle.start_storm()
                handle.stop_storm()
            return

        if not detect:
            logger.info("--- Storm restoration path for port {} queue {} ---".format(port, queue))
            self.storm_restore_path(port, queue)
        elif storm_start or first_detect_after_wb:
            logger.info("--- Storm detection path for port {} queue {} ---".format(port, queue))
            self.storm_detect_path(port, queue, first_detect_after_wb=first_detect_after_wb)

        # test pfcwd functionality on a storm/restore
        self.traffic_inst.verify_wd_func(detect=detect)

    @pytest.fixture(autouse=True)
    def pfcwd_wb_test_cleanup(self):
        """
        Autouse cleanup: after the test, release waiting storm threads,
        stop all storms and stop the PFC watchdog
        """
        yield

        # stop all threads that might stuck in wait
        DUT_ACTIVE.set()
        for storm_thread in self.storm_threads:
            exc_info = storm_thread.join(timeout=0.1, suppress_exception=True)
            if not exc_info:
                continue
            # A thread failure is only logged; cleanup continues regardless.
            logger.debug("Exception in thread %r:", storm_thread)
            formatted = "".join(traceback.format_exception(*exc_info))
            logger.debug(formatted)

        self.stop_all_storm()
        time.sleep(5)
        logger.info("--- Stop PFC WD ---")
        self.dut.command("pfcwd stop")

    def stop_all_storm(self):
        """
        Stop every storm recorded in self.storm_handle after each test run

        Entries with a storm handle are stopped through the handle; entries
        without one are treated as fake storms and disabled on the DUT.
        """
        if not self.storm_handle:
            return

        logger.info("--- Stopping storm on all ports ---")
        for port, queue_handles in self.storm_handle.items():
            for queue, handle in queue_handles.items():
                if handle:
                    logger.info("--- Stop pfc storm on port {} queue {}".format(port, queue))
                    handle.stop_storm()
                else:
                    logger.info("--- Disabling fake storm on port {} queue {}".format(port, queue))
                    PfcCmd.set_storm_status(self.dut, self.oid_map[(port, queue)], "disabled")

    def pfcwd_wb_helper(self, fake_storm, testcase_actions, setup_pfc_test, fanout_graph_facts, ptfhost,
                        duthost, localhost, fanouthosts, two_queues):
        """
        Helper method that initializes the vars and starts the test execution

        The actions in testcase_actions are processed in order. A 'warm-reboot' action warm reboots
        the DUT and moves on to the next action. Every other action is turned into a single-bit
        mask through ACTIONS and then run on every selected port and every queue of that port.

        Args:
            fake_storm(bool): if fake storm is enabled or disabled
            testcase_actions(list): list of actions that the test will go through
            setup_pfc_test(fixture): module scoped autouse fixture
            fanout_graph_facts(fixture): fanout info
            ptfhost(AnsibleHost): PTF instance
            duthost(AnsibleHost): DUT instance
            localhost(AnsibleHost): local instance
            fanouthosts(AnsibleHost): fanout instance
            two_queues: stored as self.two_queues; not otherwise read in this method
        """
        setup_info = setup_pfc_test
        self.fanout_info = fanout_graph_facts
        self.ptf = ptfhost
        self.dut = duthost
        self.fanout = fanouthosts
        self.timers = setup_info['pfc_timers']
        self.ports = setup_info['selected_test_ports']
        self.neighbors = setup_info['neighbors']
        dut_facts = self.dut.facts
        self.peer_dev_list = dict()
        # seed is the current day of the month
        self.seed = int(datetime.datetime.today().day)
        self.two_queues = two_queues
        # storm_handle[port][queue] holds the storm handler, or None when no handler was created
        self.storm_handle = dict()
        # bitmask of the previously executed (non warm-reboot) action; 0 until the first action has run
        bitmask = 0
        storm_deferred = 0
        storm_restored = 0
        self.max_wait = 0
        self.fake_storm = fake_storm
        # (port, queue) -> queue oid, filled in only for fake storms
        self.oid_map = dict()
        self.storm_threads = []

        for t_idx, test_action in enumerate(testcase_actions):
            if 'warm-reboot' in test_action:
                # bitmask is deliberately left untouched so the next action can see what ran before the reboot
                reboot(self.dut, localhost, reboot_type="warm")
                continue

            # NOTE(review): the checks below read value 2 of the previous action's mask as "restore" and
            # value 4 as "storm defer" - confirm against the ACTIONS mapping
            # one of the factors to decide if the storm needs to be started
            storm_restored = bitmask and (bitmask & 2)
            # if the action prior to the warm-reboot was a 'storm_defer', ensure that all the storms are
            # stopped
            storm_deferred = bitmask and (bitmask & 4)
            if storm_deferred:
                logger.info("Wait for all the deferred storms to start and stop ...")
                join_all(self.storm_threads, self.max_wait)
                # drop the bookkeeping of the finished storms; it is rebuilt in the per-queue loop below
                self.storm_threads = []
                self.storm_handle = dict()

            # from here on bitmask describes the current action
            bitmask = (1 << ACTIONS[test_action])
            for p_idx, port in enumerate(self.ports):
                logger.info("")
                logger.info("--- Testing on {} ---".format(port))
                self.setup_test_params(port, setup_info['vlan'], p_idx)
                for q_idx, queue in enumerate(self.pfc_wd['queue_indices']):
                    # storm bookkeeping is (re)created on the first action and after deferred storms were flushed
                    if not t_idx or storm_deferred:
                        if not q_idx:
                            self.storm_handle[port] = dict()
                        # placeholder; presumably replaced by storm_setup for real storms - verify
                        self.storm_handle[port][queue] = None

                        # setup the defer parameters if the storm is deferred currently
                        if (bitmask & 4):
                            self.storm_defer_setup()

                        # self.pfc_wd is not assigned in this method; presumably populated by
                        # setup_test_params - verify
                        if not self.pfc_wd['fake_storm']:
                            self.storm_setup(port, queue, storm_defer=(bitmask & 4))
                        else:
                            self.oid_map[(port, queue)] = PfcCmd.get_queue_oid(self.dut, port, queue)

                    self.traffic_inst = SendVerifyTraffic(self.ptf, dut_facts['router_mac'], self.pfc_wd, queue)
                    # first_detect_after_wb is only set for the first port/queue of the action at index 2
                    # NOTE(review): presumably the first action after the warm-reboot entry - confirm
                    # against the test sequences in TESTCASE_INFO
                    self.run_test(port, queue, detect=(bitmask & 1),
                                  storm_start=not t_idx or storm_deferred or storm_restored,
                                  first_detect_after_wb=(t_idx == 2 and not p_idx and not q_idx and not storm_deferred),
                                  storm_defer=(bitmask & 4))

    @pytest.fixture(params=['no_storm', 'storm', 'async_storm'])
    def testcase_action(self, request):
        """
        Parametrized fixture selecting which pfcwd warm boot scenario is run

        Args:
            request(pytest) : pytest request object carrying the current parameter

        Yields:
            testcase_action(string) : one of 'no_storm', 'storm', 'async_storm'
        """
        selected_action = request.param
        yield selected_action

    def test_pfcwd_wb(self, fake_storm, testcase_action, setup_pfc_test, fanout_graph_facts, ptfhost, duthosts,
                      rand_one_dut_hostname, localhost, fanouthosts, two_queues):
        """
        Runs the PFCwd warm reboot test for the selected testcase action

        The description and the action sequence of the testcase are looked up in TESTCASE_INFO and
        the sequence is handed to pfcwd_wb_helper together with the fixtures.

        Args:
            fake_storm(fixture): fake storm status
            testcase_action(fixture): testcase to execute (values: 'no_storm', 'storm', 'async_storm')

                'no_storm' : PFCwd storm detection/restore before and after warm reboot
                'storm' : PFC storm started and detected before warm-reboot. Storm is ongoing during warm boot
                          and lasts past the warm boot finish. Verifies if the storm is detected after
                          warm-reboot. PFC storm is stopped and restored after warm boot
                'async_storm': PFC storm asynchronously starts at a random time and lasts a random period at
                               fanout. Warm reboot is done. Wait for all the storms to finish and then verify
                               the storm detect/restore logic

            setup_pfc_test(fixture) : Module scoped autouse fixture for PFCwd
            fanout_graph_facts(fixture) : fanout graph info
            ptfhost(AnsibleHost) : ptf host instance
            duthosts(fixture) : DUT instances, indexed by hostname
            rand_one_dut_hostname(fixture) : hostname of the DUT picked for this test
            localhost(AnsibleHost) : localhost instance
            fanouthosts(AnsibleHost): fanout instance
            two_queues(fixture) : passed through to pfcwd_wb_helper
        """
        selected_dut = duthosts[rand_one_dut_hostname]
        testcase = TESTCASE_INFO[testcase_action]
        logger.info("--- {} ---".format(testcase['desc']))
        self.pfcwd_wb_helper(fake_storm, testcase['test_sequence'], setup_pfc_test, fanout_graph_facts,
                             ptfhost, selected_dut, localhost, fanouthosts, two_queues)
Exemplo n.º 15
0
def test_check_sfp_status_and_configure_sfp(duthost, conn_graph_facts):
    """
    @summary: Validate SFP status reporting and the SFP reset operation

    Uses the sfputil tool and the show command to check SFP status and to configure SFP. Resetting
    the SFP is the only configuration exercised. Commands covered:
    * sfputil show presence
    * show interface transceiver presence
    * sfputil show eeprom
    * show interface transceiver eeprom
    * sfputil reset <interface name>

    On Mellanox ASICs the whole run is bracketed by a LogAnalyzer which additionally ignores the
    "kernel.*Eeprom query failed*" pattern.
    """
    cmd_sfp_presence = "sudo sfputil show presence"
    cmd_xcvr_presence = "show interface transceiver presence"
    cmd_sfp_eeprom = "sudo sfputil show eeprom"
    cmd_xcvr_eeprom = "show interface transceiver eeprom"
    cmd_sfp_reset = "sudo sfputil reset"

    def _verify_presence(presence_cmd):
        # every connected interface must be listed and reported as 'Present';
        # the first two output lines are skipped before parsing
        cmd_output = duthost.command(presence_cmd)
        presence_by_intf = parse_output(cmd_output["stdout_lines"][2:])
        for port_name in conn_graph_facts["device_conn"]:
            assert port_name in presence_by_intf, "Interface is not in output of '%s'" % presence_cmd
            assert presence_by_intf[port_name] == "Present", "Interface presence is not 'Present'"

    def _verify_eeprom(eeprom_cmd, missing_msg):
        # every connected interface must be listed with a detected EEPROM
        cmd_output = duthost.command(eeprom_cmd)
        eeprom_by_intf = parse_eeprom(cmd_output["stdout_lines"])
        for port_name in conn_graph_facts["device_conn"]:
            assert port_name in eeprom_by_intf, missing_msg
            assert eeprom_by_intf[port_name] == "SFP EEPROM detected"

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix='sfp_cfg')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("kernel.*Eeprom query failed*")
        marker = loganalyzer.init()

    portmap = get_port_map(duthost)
    logging.info("Got portmap {}".format(portmap))

    logging.info("Check output of '%s'" % cmd_sfp_presence)
    _verify_presence(cmd_sfp_presence)

    logging.info("Check output of '%s'" % cmd_xcvr_presence)
    _verify_presence(cmd_xcvr_presence)

    logging.info("Check output of '%s'" % cmd_sfp_eeprom)
    _verify_eeprom(cmd_sfp_eeprom, "Interface is not in output of 'sfputil show eeprom'")

    logging.info("Check output of '%s'" % cmd_xcvr_eeprom)
    _verify_eeprom(cmd_xcvr_eeprom, "Interface is not in output of '%s'" % cmd_xcvr_eeprom)

    logging.info("Test '%s <interface name>'" % cmd_sfp_reset)
    # several interfaces can map to one physical port; reset each physical port only once
    already_reset = set()
    for port_name in conn_graph_facts["device_conn"]:
        phy_intf = portmap[port_name][0]
        if phy_intf in already_reset:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
        else:
            already_reset.add(phy_intf)
            logging.info("resetting {} physical interface {}".format(port_name, phy_intf))
            reset_result = duthost.command("%s %s" % (cmd_sfp_reset, port_name))
            assert reset_result["rc"] == 0, "'%s %s' failed" % (cmd_sfp_reset, port_name)
            time.sleep(5)
    logging.info("Wait some time for SFP to fully recover after reset")
    time.sleep(60)

    logging.info("Check sfp presence again after reset")
    _verify_presence(cmd_sfp_presence)

    logging.info("Check interface status")
    mg_facts = duthost.minigraph_facts(host=duthost.hostname)["ansible_facts"]
    intf_facts = duthost.interface_facts(up_ports=mg_facts["minigraph_ports"])["ansible_facts"]
    down_ports = intf_facts["ansible_interface_link_down_ports"]
    assert len(down_ports) == 0, \
        "Some interfaces are down: %s" % str(down_ports)

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer.analyze(marker)
Exemplo n.º 16
0
def test_nhop(request, duthost, tbinfo):
    """
    Test next hop group resource count. Steps:
    - Add test IP address to an active IP interface
    - Add static ARPs
    - Create unique next hop groups
    - Add IP route and nexthop
    - check CRM resource
    - clean up
    - Verify no errors and crash

    Args:
        request: pytest request object (not used in the body)
        duthost: DUT host instance
        tbinfo: testbed information, used to look up the active IP interfaces
    """
    skip_release(duthost, ["201811", "201911"])

    default_max_nhop_paths = 32
    nhop_group_limit = 1024
    # program more than the advertised limit
    extra_nhops = 10

    asic = duthost.asic_instance()

    # find out MAX NHOP group count supported on the platform
    result = asic.run_redis_cmd(
        argv=["redis-cli", "-n", 6, "HGETALL", "SWITCH_CAPABILITY|switch"])
    # HGETALL output is a flat [field, value, field, value, ...] list; pair it up into a dict
    it = iter(result)
    switch_capability = dict(zip(it, it))
    max_nhop = switch_capability.get("MAX_NEXTHOP_GROUP_COUNT")
    # fall back to the test's own limit when the platform does not advertise a count
    max_nhop = nhop_group_limit if max_nhop is None else int(max_nhop)
    nhop_group_count = min(max_nhop, nhop_group_limit) + extra_nhops

    # find out an active IP port
    # materialize the keys: a dict view cannot be indexed on Python 3
    ip_ifaces = list(asic.get_active_ip_interfaces(tbinfo).keys())
    pytest_assert(len(ip_ifaces), "No IP interfaces found")
    eth_if = ip_ifaces[0]

    # Generate ARP entries
    arp_count = 40
    arplist = Arp(duthost, asic, arp_count, eth_if)
    arplist.arps_add()

    # every combination of default_max_nhop_paths ARP indices forms one unique next hop group
    indices = range(arp_count)
    ip_indices = combinations(indices, default_max_nhop_paths)

    # initialize log analyzer
    marker = "NHOP TEST PATH COUNT {} {}".format(nhop_group_count, eth_if)
    loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix=marker)
    marker = loganalyzer.init()
    loganalyzer.load_common_config()
    loganalyzer.expect_regex = []
    loganalyzer.ignore_regex.extend(loganalyzer_ignore_regex_list())

    ip_prefix = ipaddr.IPAddress("192.168.0.0")

    # list of all IPs available to generate a nexthop group
    ip_list = arplist.ip_mac_list

    # remember the current CRM settings so the polling interval can be restored afterwards
    crm_before = get_crm_info(duthost, asic)

    # increase CRM polling time
    asic.command("crm config polling interval 10")

    logging.info("Adding {} next hops on {}".format(nhop_group_count, eth_if))

    # create nexthop group
    nhop = IPRoutes(duthost, asic)
    try:
        for i, indx_list in zip(range(nhop_group_count), ip_indices):
            # get a list of unique group of next hop IPs
            ips = [arplist.ip_mac_list[x].ip for x in indx_list]

            # consecutive /31 prefixes, one per next hop group
            ip_route = "{}/31".format(ip_prefix + (2 * i))

            # add IP route with the next hop group created
            nhop.add_ip_route(ip_route, ips)

        nhop.program_routes()
        # wait for routes to be synced and programmed
        time.sleep(120)
        crm_after = get_crm_info(duthost, asic)

    finally:
        # always undo the DUT changes, even when programming the routes failed
        nhop.delete_routes()
        arplist.clean_up()
        asic.command("crm config polling interval {}".format(
            crm_before["polling"]))

    # check for any errors or crash
    loganalyzer.analyze(marker)

    # verify the test used up all the NHOP group resources
    # skip this check on Mellanox as ASIC resources are shared
    if not is_mellanox_device(duthost):
        pytest_assert(
            crm_after["available"] == 0,
            "Unused NHOP group resource: {}, used:{}".format(
                crm_after["available"], crm_after["used"]))
Exemplo n.º 17
0
def test_check_sfp_status_and_configure_sfp(duthosts, rand_one_dut_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo):
    """
    @summary: Validate SFP status reporting and the SFP reset operation

    Uses the sfputil tool and the show command to check SFP status and to configure SFP. Resetting
    the SFP is the only configuration exercised. Commands covered:
    * sfputil show presence
    * show interface transceiver presence
    * sfputil show eeprom
    * show interface transceiver eeprom
    * sfputil reset <interface name>

    When enum_frontend_asic_index is not None, only the interfaces of that ASIC's port map which also
    appear in the connection graph of the DUT are checked. On Mellanox ASICs the run is bracketed by
    a LogAnalyzer which additionally ignores the "kernel.*Eeprom query failed*" pattern.
    """
    global ans_host

    cmd_sfp_presence = "sudo sfputil show presence"
    cmd_xcvr_presence = "show interface transceiver presence"
    cmd_sfp_eeprom = "sudo sfputil show eeprom"
    cmd_xcvr_eeprom = "show interface transceiver eeprom"
    cmd_sfp_reset = "sudo sfputil reset"

    def _verify_presence(presence_cmd):
        # every interface under test must be listed and reported as 'Present';
        # the first two output lines are skipped before parsing
        cmd_output = duthost.command(presence_cmd)
        presence_by_intf = parse_output(cmd_output["stdout_lines"][2:])
        for port_name in dev_conn:
            assert port_name in presence_by_intf, "Interface is not in output of '%s'" % presence_cmd
            assert presence_by_intf[port_name] == "Present", "Interface presence is not 'Present'"

    def _verify_eeprom(eeprom_cmd, missing_msg):
        # every interface under test must be listed with a detected EEPROM
        cmd_output = duthost.command(eeprom_cmd)
        eeprom_by_intf = parse_eeprom(cmd_output["stdout_lines"])
        for port_name in dev_conn:
            assert port_name in eeprom_by_intf, missing_msg
            assert eeprom_by_intf[port_name] == "SFP EEPROM detected"

    duthost = duthosts[rand_one_dut_hostname]
    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix='sfp_cfg')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("kernel.*Eeprom query failed*")
        marker = loganalyzer.init()

    host_conn = conn_graph_facts["device_conn"][duthost.hostname]
    dev_conn = host_conn

    # Get the interfaces pertaining to that asic
    portmap = get_port_map(duthost, enum_frontend_asic_index)
    logging.info("Got portmap {}".format(portmap))

    if enum_frontend_asic_index is not None:
        # Keep only the interfaces of this ASIC that are present in conn_graph_facts
        dev_conn = {}
        for port_name, port_info in portmap.items():
            if port_name in host_conn:
                dev_conn[port_name] = port_info
        logging.info("ASIC {} interface_list {}".format(enum_frontend_asic_index, dev_conn))

    ans_host = duthost

    logging.info("Check output of '%s'" % cmd_sfp_presence)
    _verify_presence(cmd_sfp_presence)

    logging.info("Check output of '%s'" % cmd_xcvr_presence)
    _verify_presence(cmd_xcvr_presence)

    logging.info("Check output of '%s'" % cmd_sfp_eeprom)
    _verify_eeprom(cmd_sfp_eeprom, "Interface is not in output of 'sfputil show eeprom'")

    logging.info("Check output of '%s'" % cmd_xcvr_eeprom)
    _verify_eeprom(cmd_xcvr_eeprom, "Interface is not in output of '%s'" % cmd_xcvr_eeprom)

    logging.info("Test '%s <interface name>'" % cmd_sfp_reset)
    # several interfaces can map to one physical port; reset each physical port only once
    already_reset = set()
    for port_name in dev_conn:
        phy_intf = portmap[port_name][0]
        if phy_intf in already_reset:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
        else:
            already_reset.add(phy_intf)
            logging.info("resetting {} physical interface {}".format(port_name, phy_intf))
            reset_result = duthost.command("%s %s" % (cmd_sfp_reset, port_name))
            assert reset_result["rc"] == 0, "'%s %s' failed" % (cmd_sfp_reset, port_name)
            time.sleep(5)
    logging.info("Wait some time for SFP to fully recover after reset")
    time.sleep(60)

    logging.info("Check sfp presence again after reset")
    _verify_presence(cmd_sfp_presence)

    logging.info("Check interface status")
    namespace = duthost.get_namespace_from_asic_id(enum_frontend_asic_index)
    mg_facts = duthost.get_extended_minigraph_facts(tbinfo)
    # TODO Remove this logic when minigraph facts supports namespace in multi_asic
    minigraph_ports = mg_facts["minigraph_ports"]
    if enum_frontend_asic_index is None:
        up_ports = minigraph_ports
    else:
        # Keep only the interfaces of this ASIC that are present in the minigraph ports
        up_ports = {}
        for port_name, port_info in portmap.items():
            if port_name in minigraph_ports:
                up_ports[port_name] = port_info
    intf_facts = duthost.interface_facts(namespace=namespace, up_ports=up_ports)["ansible_facts"]
    down_ports = intf_facts["ansible_interface_link_down_ports"]
    assert len(down_ports) == 0, \
        "Some interfaces are down: %s" % str(down_ports)

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer.analyze(marker)
Exemplo n.º 18
0
def test_check_sfp_low_power_mode(duthosts, rand_one_dut_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo):
    """
    @summary: Toggle SFP low power mode and restore it

    Uses the sfputil tool command to check and set SFP low power mode
    * sfputil show lpmode
    * sfputil lpmode off
    * sfputil lpmode on

    Ports whose transceiver type does not contain "QSFP", or whose ext_identifier contains
    "Power Class 1", are left alone. The test is skipped when no port is left to toggle.
    On Mellanox ASICs the run is bracketed by a LogAnalyzer which additionally ignores
    "Eeprom query failed".
    """
    global ans_host

    cmd_sfp_presence = "sudo sfputil show presence"
    cmd_sfp_show_lpmode = "sudo sfputil show lpmode"
    cmd_sfp_set_lpmode = "sudo sfputil lpmode"

    def _show_lpmode():
        # the first two output lines are skipped before parsing
        show_result = duthost.command(cmd_sfp_show_lpmode)
        return parse_output(show_result["stdout_lines"][2:])

    def _assert_lpmode_valid(lpmode_by_intf):
        # every interface under test must be listed with an lpmode of either "on" or "off"
        for port_name in dev_conn:
            assert port_name in lpmode_by_intf, "Interface is not in output of '%s'" % cmd_sfp_show_lpmode
            assert lpmode_by_intf[port_name].lower() in ("on", "off"), "Unexpected SFP lpmode"

    def _set_lpmode(mode, port_name):
        set_result = duthost.command("%s %s %s" % (cmd_sfp_set_lpmode, mode, port_name))
        assert set_result["rc"] == 0, "'%s %s %s' failed" % (cmd_sfp_set_lpmode, mode, port_name)

    duthost = duthosts[rand_one_dut_hostname]
    asichost = duthost.get_asic(enum_frontend_asic_index)
    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix='sfp_lpm')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("Eeprom query failed")
        marker = loganalyzer.init()

    host_conn = conn_graph_facts["device_conn"][duthost.hostname]
    dev_conn = host_conn

    # Get the interfaces pertaining to that asic
    portmap = get_port_map(duthost, enum_frontend_asic_index)
    logging.info("Got portmap {}".format(portmap))

    if enum_frontend_asic_index is not None:
        # Keep only the interfaces of this ASIC that are present in conn_graph_facts
        dev_conn = {}
        for port_name, port_info in portmap.items():
            if port_name in host_conn:
                dev_conn[port_name] = port_info
        logging.info("ASIC {} interface_list {}".format(enum_frontend_asic_index, dev_conn))

    ans_host = duthost

    logging.info("Check output of '%s'" % cmd_sfp_show_lpmode)
    current_lpmode = _show_lpmode()
    # snapshot of the initial modes, used both for toggling and for restoring
    original_lpmode = copy.deepcopy(current_lpmode)
    _assert_lpmode_valid(current_lpmode)

    logging.info("Try to change SFP lpmode")
    toggled_phy_ports = set()

    unsupported_phy_ports = set()
    for port_name in dev_conn:
        phy_intf = portmap[port_name][0]
        if phy_intf in toggled_phy_ports:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
            continue

        type_query = 'redis-cli -n 6 hget "TRANSCEIVER_INFO|{}" type'.format(port_name)
        sfp_type = duthost.command(asichost.get_docker_cmd(type_query, "database"))["stdout"]

        power_class_query = 'redis-cli -n 6 hget "TRANSCEIVER_INFO|{}" ext_identifier'.format(port_name)
        power_class = duthost.command(asichost.get_docker_cmd(power_class_query, "database"))["stdout"]

        if "QSFP" not in sfp_type or "Power Class 1" in power_class:
            logging.info("skip testing port {} which doesn't support LPM".format(port_name))
            unsupported_phy_ports.add(phy_intf)
            continue
        toggled_phy_ports.add(phy_intf)
        logging.info("setting {} physical interface {}".format(port_name, phy_intf))
        if original_lpmode[port_name].lower() == "on":
            flipped_mode = "off"
        else:
            flipped_mode = "on"
        _set_lpmode(flipped_mode, port_name)
    time.sleep(10)

    if len(toggled_phy_ports) == 0:
        pytest.skip("None of the ports supporting LPM, skip the test")

    logging.info("Check SFP lower power mode again after changing SFP lpmode")
    _assert_lpmode_valid(_show_lpmode())

    logging.info("Try to change SFP lpmode")
    restored_phy_ports = set()
    for port_name in dev_conn:
        phy_intf = portmap[port_name][0]
        if phy_intf in unsupported_phy_ports:
            logging.info("skip testing port {} which doesn't support LPM".format(port_name))
            continue
        if phy_intf in restored_phy_ports:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
            continue
        restored_phy_ports.add(phy_intf)
        logging.info("restoring {} physical interface {}".format(port_name, phy_intf))
        _set_lpmode(original_lpmode[port_name].lower(), port_name)
    time.sleep(10)

    logging.info("Check SFP lower power mode again after changing SFP lpmode")
    _assert_lpmode_valid(_show_lpmode())

    logging.info("Check sfp presence again after setting lpmode")
    presence_result = duthost.command(cmd_sfp_presence)
    presence_by_intf = parse_output(presence_result["stdout_lines"][2:])
    for port_name in dev_conn:
        assert port_name in presence_by_intf, "Interface is not in output of '%s'" % cmd_sfp_presence
        assert presence_by_intf[port_name] == "Present", "Interface presence is not 'Present'"

    logging.info("Check interface status")
    namespace = duthost.get_namespace_from_asic_id(enum_frontend_asic_index)
    mg_facts = duthost.get_extended_minigraph_facts(tbinfo)
    # TODO Remove this logic when minigraph facts supports namespace in multi_asic
    minigraph_ports = mg_facts["minigraph_ports"]
    if enum_frontend_asic_index is None:
        up_ports = minigraph_ports
    else:
        # Keep only the interfaces of this ASIC that are present in the minigraph ports
        up_ports = {}
        for port_name, port_info in portmap.items():
            if port_name in minigraph_ports:
                up_ports[port_name] = port_info
    intf_facts = duthost.interface_facts(namespace=namespace, up_ports=up_ports)["ansible_facts"]
    down_ports = intf_facts["ansible_interface_link_down_ports"]
    assert len(down_ports) == 0, \
        "Some interfaces are down: %s" % str(down_ports)

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer.analyze(marker)
Exemplo n.º 19
0
def test_monitoring_critical_processes(duthosts, rand_one_dut_hostname, tbinfo,
                                       skip_vendor_specific_container):
    """Checks that stopped critical processes are reported in the syslog.

    The critical processes of the non-skipped containers are stopped and the
    syslog is then analyzed for the expected alerting messages. Afterwards a
    config reload is executed and the critical processes and the BGP sessions
    that were established before the test are post-checked.

    Args:
        duthosts: list of DUTs.
        rand_one_dut_hostname: hostname of DUT.
        tbinfo: Testbed information.
        skip_vendor_specific_container: containers to skip in addition to the
            ones listed in this function.

    Returns:
        None.
    """
    duthost = duthosts[rand_one_dut_hostname]

    loganalyzer = LogAnalyzer(ansible_host=duthost,
                              marker_prefix="monitoring_critical_processes")
    loganalyzer.expect_regex = []

    # Remember which BGP sessions are established now; they are post-checked at the end.
    up_bgp_neighbors = []
    for neighbor, neighbor_info in duthost.get_bgp_neighbors().items():
        if neighbor_info["state"] == "established":
            up_bgp_neighbors.append(neighbor.lower())

    skip_containers = [
        "database",
        "gbsyncd",
        # 'restapi' service will be restarted immediately after exited, which will not trigger alarm message.
        "restapi",
        # 'acms' process is not running on lab devices and another process
        # `cert_converter.py' is set to auto-restart if exited.
        "acms",
    ]
    # Skip 'radv' container on devices whose role is not T0.
    if tbinfo["topo"]["type"] != "t0":
        skip_containers.append("radv")
    skip_containers = skip_containers + skip_vendor_specific_container

    containers_in_namespaces = get_containers_namespace_ids(
        duthost, skip_containers)

    # The 20191130 image gets the Monit flavour of the expected messages,
    # every other image the Supervisord one.
    if "20191130" in duthost.os_version:
        build_expected_messages = get_expected_alerting_messages_monit
    else:
        build_expected_messages = get_expected_alerting_messages_supervisor
    expected_alerting_messages = build_expected_messages(
        duthost, containers_in_namespaces)

    loganalyzer.expect_regex.extend(expected_alerting_messages)
    marker = loganalyzer.init()

    stop_critical_processes(duthost, containers_in_namespaces)

    # Wait for 70 seconds such that Supervisord/Monit has a chance to write alerting message into syslog.
    logger.info(
        "Sleep 70 seconds to wait for the alerting messages in syslog...")
    time.sleep(70)

    logger.info("Checking the alerting messages from syslog...")
    loganalyzer.analyze(marker)
    logger.info("Found all the expected alerting messages from syslog!")

    logger.info("Executing the config reload...")
    config_reload(duthost)
    logger.info("Executing the config reload was done!")

    ensure_all_critical_processes_running(duthost, containers_in_namespaces)

    post_check_ok = postcheck_critical_processes_status(duthost, up_bgp_neighbors)
    if not post_check_ok:
        pytest.fail("Post-check failed after testing the process monitoring!")
    logger.info(
        "Post-checking status of critical processes and BGP sessions was done!"
    )
Exemplo n.º 20
0
def test_check_sfp_low_power_mode(duthost, conn_graph_facts):
    """
    @summary: Toggle SFP low power mode and restore it

    Uses the sfputil tool command to check and set SFP low power mode
    * sfputil show lpmode
    * sfputil lpmode off
    * sfputil lpmode on

    On Mellanox ASICs the run is bracketed by a LogAnalyzer which additionally ignores
    "Eeprom query failed".
    """
    cmd_sfp_presence = "sudo sfputil show presence"
    cmd_sfp_show_lpmode = "sudo sfputil show lpmode"
    cmd_sfp_set_lpmode = "sudo sfputil lpmode"

    def _show_lpmode():
        # the first two output lines are skipped before parsing
        show_result = duthost.command(cmd_sfp_show_lpmode)
        return parse_output(show_result["stdout_lines"][2:])

    def _assert_lpmode_valid(lpmode_by_intf):
        # every connected interface must be listed with an lpmode of either "on" or "off"
        for port_name in conn_graph_facts["device_conn"]:
            assert port_name in lpmode_by_intf, "Interface is not in output of '%s'" % cmd_sfp_show_lpmode
            assert lpmode_by_intf[port_name].lower() in ("on", "off"), "Unexpected SFP lpmode"

    def _set_lpmode(mode, port_name):
        set_result = duthost.command("%s %s %s" % (cmd_sfp_set_lpmode, mode, port_name))
        assert set_result["rc"] == 0, "'%s %s %s' failed" % (cmd_sfp_set_lpmode, mode, port_name)

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix='sfp_lpm')
        loganalyzer.load_common_config()

        loganalyzer.ignore_regex.append("Eeprom query failed")
        marker = loganalyzer.init()

    portmap = get_port_map(duthost)
    logging.info("Got portmap {}".format(portmap))

    logging.info("Check output of '%s'" % cmd_sfp_show_lpmode)
    current_lpmode = _show_lpmode()
    # snapshot of the initial modes, used both for toggling and for restoring
    original_lpmode = copy.deepcopy(current_lpmode)
    _assert_lpmode_valid(current_lpmode)

    logging.info("Try to change SFP lpmode")
    # several interfaces can map to one physical port; toggle each physical port only once
    toggled_phy_ports = set()
    for port_name in conn_graph_facts["device_conn"]:
        phy_intf = portmap[port_name][0]
        if phy_intf in toggled_phy_ports:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
            continue
        toggled_phy_ports.add(phy_intf)
        logging.info("setting {} physical interface {}".format(port_name, phy_intf))
        if original_lpmode[port_name].lower() == "on":
            flipped_mode = "off"
        else:
            flipped_mode = "on"
        _set_lpmode(flipped_mode, port_name)
    time.sleep(10)

    logging.info("Check SFP lower power mode again after changing SFP lpmode")
    _assert_lpmode_valid(_show_lpmode())

    logging.info("Try to change SFP lpmode")
    restored_phy_ports = set()
    for port_name in conn_graph_facts["device_conn"]:
        phy_intf = portmap[port_name][0]
        if phy_intf in restored_phy_ports:
            logging.info("skip tested SFPs {} to avoid repeating operating physical interface {}".format(port_name, phy_intf))
            continue
        restored_phy_ports.add(phy_intf)
        logging.info("restoring {} physical interface {}".format(port_name, phy_intf))
        _set_lpmode(original_lpmode[port_name].lower(), port_name)
    time.sleep(10)

    logging.info("Check SFP lower power mode again after changing SFP lpmode")
    _assert_lpmode_valid(_show_lpmode())

    logging.info("Check sfp presence again after setting lpmode")
    presence_result = duthost.command(cmd_sfp_presence)
    presence_by_intf = parse_output(presence_result["stdout_lines"][2:])
    for port_name in conn_graph_facts["device_conn"]:
        assert port_name in presence_by_intf, "Interface is not in output of '%s'" % cmd_sfp_presence
        assert presence_by_intf[port_name] == "Present", "Interface presence is not 'Present'"

    logging.info("Check interface status")
    mg_facts = duthost.minigraph_facts(host=duthost.hostname)["ansible_facts"]
    intf_facts = duthost.interface_facts(up_ports=mg_facts["minigraph_ports"])["ansible_facts"]
    down_ports = intf_facts["ansible_interface_link_down_ports"]
    assert len(down_ports) == 0, \
        "Some interfaces are down: %s" % str(down_ports)

    if duthost.facts["asic_type"] == "mellanox":
        loganalyzer.analyze(marker)
Exemplo n.º 21
0
def test_turn_on_off_psu_and_check_psustatus(duthost, psu_controller):
    """
    @summary: Turn off/on PSU and check PSU status using 'show platform psustatus'

    For every PSU reported by the PSU controller, the PSU is turned off, the
    output of CMD_PLATFORM_PSUSTATUS is checked for a PSU that is not "OK",
    the PSU is turned back on and the output is checked again. The PSU is
    turned back on even when a check performed while it is off fails, so a
    failing run does not leave the DUT with a PSU powered down.

    The test is skipped when fewer than 2 PSUs are reported, when no PSU
    controller is available, or when not all PSUs can be brought up first.

    @param duthost: DUT host object used to run commands on the device.
    @param psu_controller: PSU controller object; None means no controller
                           is available for this DUT.
    """
    loganalyzer = LogAnalyzer(
        ansible_host=duthost,
        marker_prefix='turn_on_off_psu_and_check_psustatus')
    loganalyzer.load_common_config()

    # Messages matching this pattern are excluded from the log analysis.
    loganalyzer.ignore_regex.append(
        "Error getting sensor data: dps460.*Kernel interface error")
    marker = loganalyzer.init()

    psu_line_pattern = re.compile(r"PSU\s+\d+\s+(OK|NOT OK|NOT PRESENT)")

    psu_num = get_psu_num(duthost)
    if psu_num < 2:
        pytest.skip(
            "At least 2 PSUs required for rest of the testing in this case")

    logging.info("Create PSU controller for testing")
    psu_ctrl = psu_controller
    if psu_ctrl is None:
        pytest.skip(
            "No PSU controller for %s, skip rest of the testing in this case" %
            duthost.hostname)

    logging.info(
        "To avoid DUT being shutdown, need to turn on PSUs that are not powered"
    )
    turn_all_psu_on(psu_ctrl)

    logging.info("Initialize test results")
    psu_test_results = {}
    if not check_all_psu_on(duthost, psu_test_results):
        pytest.skip(
            "Some PSU are still down, skip rest of the testing in this case")

    assert len(psu_test_results) == psu_num, \
        "Inconsistent PSU number output by '%s' and '%s'" % (CMD_PLATFORM_PSUSTATUS, "sudo psuutil numpsus")

    logging.info("Start testing turn off/on PSUs")
    all_psu_status = psu_ctrl.get_psu_status()
    for psu in all_psu_status:
        psu_id = psu["psu_id"]
        psu_under_test = None

        logging.info("Turn off PSU %s", psu_id)
        psu_ctrl.turn_off_psu(psu_id)
        try:
            time.sleep(5)

            cli_psu_status = duthost.command(CMD_PLATFORM_PSUSTATUS)
            # The first two output lines are skipped (presumably the table
            # header and separator -- confirm against the CLI output).
            for line in cli_psu_status["stdout_lines"][2:]:
                assert psu_line_pattern.match(line), "Unexpected PSU status output"
                fields = line.split()
                # fields[1] is the PSU number; fields[2] is "OK" only for a
                # healthy PSU ("NOT OK"/"NOT PRESENT" both start with "NOT").
                if fields[2] != "OK":
                    psu_under_test = fields[1]
                check_vendor_specific_psustatus(duthost, line)
            assert psu_under_test is not None, "No PSU is turned off"
        finally:
            # Always restore power, even if a check above failed, so the DUT
            # is not left running with this PSU switched off.
            logging.info("Turn on PSU %s", psu_id)
            psu_ctrl.turn_on_psu(psu_id)
        time.sleep(5)

        cli_psu_status = duthost.command(CMD_PLATFORM_PSUSTATUS)
        for line in cli_psu_status["stdout_lines"][2:]:
            assert psu_line_pattern.match(line), "Unexpected PSU status output"
            fields = line.split()
            if fields[1] == psu_under_test:
                assert fields[2] == "OK", "Unexpected PSU status after turned it on"
            check_vendor_specific_psustatus(duthost, line)

        psu_test_results[psu_under_test] = True

    for psu, passed in psu_test_results.items():
        assert passed, "Test psu status of PSU %s failed" % psu

    loganalyzer.analyze(marker)
Exemplo n.º 22
0
def consumes_memory_and_checks_monit(duthost, container_name, vm_workers, new_syntax_enabled):
    """Drives memory consumption in a container twice and verifies the restart behaviour.

    First round: `consume_memory` is started asynchronously, the function
    sleeps for `WAITING_SYSLOG_MSG_SECS` and then uses Loganalyzer to require
    that the container stop/start messages showed up in syslog, after which it
    waits for the container to be reported as restarted.

    Second round: `consume_memory` is started again, the container memory
    usage is logged 6 times at 30 second intervals, and the syslog is analyzed
    without failing on a mismatch. The number of expected messages found is
    then compared against `new_syntax_enabled`.

    Args:
        duthost: The AnsibleHost object of DuT.
        container_name: Name of container.
        vm_workers: Number of workers handed to `consume_memory`.
        new_syntax_enabled: When true, all expected restart messages must be
          found in the second round; when false, none of them may be found.

    Returns:
        None.
    """
    expected_alerting_messages = [
        ".*restart_service.*Restarting service 'telemetry'.*",
        ".*Stopping Telemetry container.*",
        ".*Stopped Telemetry container.*",
        ".*Starting Telemetry container.*",
        ".*Started Telemetry container.*",
    ]

    loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix="test_memory_checker")
    loganalyzer.expect_regex = list(expected_alerting_messages)
    marker = loganalyzer.init()

    # Same argument tuple is used for both memory consumption rounds.
    stress_args = (duthost, container_name, vm_workers)

    thread_pool = ThreadPool()
    thread_pool.apply_async(consume_memory, stress_args)

    logger.info("Sleep '{}' seconds to wait for the alerting messages from syslog ...".format(WAITING_SYSLOG_MSG_SECS))
    time.sleep(WAITING_SYSLOG_MSG_SECS)

    logger.info("Checking the alerting messages related to container restart ...")
    loganalyzer.analyze(marker)
    logger.info("Found all the expected alerting messages from syslog!")

    logger.info("Waiting for '{}' container to be restarted ...".format(container_name))
    restarted = wait_until(
        CONTAINER_RESTART_THRESHOLD_SECS,
        CONTAINER_CHECK_INTERVAL_SECS,
        0,
        check_container_state,
        duthost,
        container_name,
        True,
    )
    pytest_assert(restarted, "Failed to restart '{}' container!".format(container_name))
    logger.info("'{}' container is restarted.".format(container_name))

    logger.info("Running 'stress' utility again in '{}' ...".format(container_name))
    thread_pool.apply_async(consume_memory, stress_args)

    # Start a fresh marker so the second analysis only covers the second round.
    marker = loganalyzer.update_marker_prefix("test_monit_counter")
    logger.info("Checking memory usage of '{}' every 30 seconds for 6 times ...".format(container_name))
    for _ in range(6):
        mem_usage = get_container_mem_usage(duthost, container_name)
        logger.info("Memory usage of '{}' is '{}'".format(container_name, mem_usage))
        time.sleep(30)

    logger.info("Analyzing syslog messages to verify whether '{}' is restarted ...".format(container_name))
    analyzing_result = loganalyzer.analyze(marker, fail=False)
    matched_count = analyzing_result["total"]["expected_match"]
    if new_syntax_enabled:
        pytest_assert(matched_count == len(expected_alerting_messages),
                      "Monit still can not restart '{}' with the help of new syntax!".format(container_name))
        logger.info("Monit was able to restart '{}' with the help of new syntax!".format(container_name))
    else:
        pytest_assert(matched_count == 0,
                      "Monit can reset counter and restart '{}'!".format(container_name))
        logger.info("Monit was unable to reset its counter and '{}' can not be restarted!".format(container_name))