def check_psu_sysfs(dut, psu_id, psu_state):
    """
    @summary: Check psu related sysfs under /var/run/hw-management/thermal against psu_state

    For "NOT PRESENT" only the presence node (psu<id>_status) is checked and must
    read "0". For any other state the presence node must read "1" when the
    platform reports hot swappable PSUs, and the power node
    (psu<id>_pwr_status) must read "1" for "OK" or "0" for "NOT OK".

    @param dut: Host object providing command()
    @param psu_id: PSU index used to build the sysfs file names
    @param psu_state: PSU state string reported by the CLI
    """

    def read_node(path):
        # Every sysfs read is logged together with the CLI state it is compared to.
        value = dut.command("cat {}".format(path))["stdout"]
        logging.info("PSU state {} file {} read {}".format(psu_state, path, value))
        return value

    presence_node = "/var/run/hw-management/thermal/psu{}_status".format(psu_id)

    if psu_state == "NOT PRESENT":
        presence = read_node(presence_node)
        assert presence == "0", "CLI returns NOT PRESENT while {} contains {}".format(
            presence_node, presence)
        return

    # The presence node is only checked on platforms with hot swappable PSUs.
    if get_platform_data(dut)["psus"]["hot_swappable"]:
        presence = read_node(presence_node)
        assert presence == "1", "CLI returns {} while {} contains {}".format(
            psu_state, presence_node, presence)

    power_node = "/var/run/hw-management/thermal/psu{}_pwr_status".format(psu_id)
    power = read_node(power_node)
    accepted_pairs = (("1", "OK"), ("0", "NOT OK"))
    assert (power, psu_state) in accepted_pairs, \
        "sysfs content {} mismatches with psu_state {}".format(power, psu_state)
def _extract_num_of_fans_and_fan_drawers(self):
    """
    Get FAN number and Fan number for each FAN drawer of this DUT.

    Runs only once: MockerHelper.INIT_FAN_NUM is set on the first call and any
    later call returns immediately. The counts come from listing
    fan*_status (drawers) and fan*_speed_get (fans) under
    MockerHelper.THERMAL_PATH. Results are stored in MockerHelper.FAN_NUM and
    MockerHelper.FAN_NUM_PER_DRAWER.
    :return:
    """
    if MockerHelper.INIT_FAN_NUM:
        return
    MockerHelper.INIT_FAN_NUM = True

    drawer_count_text = self.dut.shell(
        'ls {}fan*_status | wc -l'.format(MockerHelper.THERMAL_PATH))['stdout'].strip()
    if not drawer_count_text:
        return
    drawer_total = int(drawer_count_text)

    fan_count_text = self.dut.shell(
        'ls {}fan*_speed_get | wc -l'.format(MockerHelper.THERMAL_PATH))['stdout'].strip()
    if not fan_count_text:
        return
    MockerHelper.FAN_NUM = int(fan_count_text)

    if not get_platform_data(self.dut)['fans']['hot_swappable']:
        # For non swappable fan, there is no drawer. We put them in a "virtual" drawer.
        MockerHelper.FAN_NUM_PER_DRAWER = MockerHelper.FAN_NUM
    else:
        # More fans than drawers is treated as two fans per drawer.
        MockerHelper.FAN_NUM_PER_DRAWER = 2 if MockerHelper.FAN_NUM > drawer_total else 1
def mock_fan_presence(self, status):
    """
    Mock the presence of the fan drawer held by this mocker.

    :param status: Truthy to mock the drawer as present, falsy as absent.
    :return: (False, None) when the platform's fans are not hot swappable and
             nothing was mocked; otherwise (True, name of self.fan_data).
    """
    fans_info = get_platform_data(self.mock_helper.dut)['fans']
    if not fans_info['hot_swappable']:
        # Fans that cannot be removed are always present; there is nothing to mock.
        return False, None

    self.fan_drawer_data.mock_presence(1 if status else 0)
    return True, self.fan_data.name
def test_set_psu_fan_speed(duthost, mocker_factory):
    """
    Compare PSU fan speeds at cooling state 10 with speeds after recovery.

    A single fan is mocked absent and the PSU fan speeds are sampled once the
    cooling state reaches 10. The fan is then mocked present again, the speeds
    are re-sampled, and the ratio full/current is checked against
    10 / cooling_state (cooling state floored at 6) within PSU_SPEED_TOLERANCE.
    The test is skipped when the platform's PSUs are not hot swappable.
    """
    psu_info = get_platform_data(duthost)['psus']
    psu_num = psu_info['number']
    if not psu_info['hot_swappable']:
        pytest.skip('The platform {} does not support this test case.'.format(
            duthost.facts["platform"]))

    logging.info('Create mocker, it may take a few seconds...')
    single_fan_mocker = mocker_factory(duthost, 'SingleFanMocker')

    logging.info('Mock FAN absence...')
    single_fan_mocker.mock_absence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                      THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                      check_cooling_cur_state, duthost, 10, operator.eq), \
        'Current cooling state is {}'.format(get_cooling_cur_state(duthost))

    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_CHECK_INTERVAL))
    time.sleep(THERMAL_CONTROL_TEST_CHECK_INTERVAL)
    full_speeds = [get_psu_speed(duthost, psu_index) for psu_index in range(psu_num)]
    logging.info('Full speed={}'.format(full_speeds))

    logging.info('Mock FAN presence...')
    single_fan_mocker.mock_presence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                      THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                      check_cooling_cur_state, duthost, 10, operator.ne), \
        'Current cooling state is {}'.format(get_cooling_cur_state(duthost))

    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_CHECK_INTERVAL))
    time.sleep(THERMAL_CONTROL_TEST_CHECK_INTERVAL)
    cooling_cur_state = get_cooling_cur_state(duthost)
    logging.info('Cooling level changed to {}'.format(cooling_cur_state))
    current_speeds = [get_psu_speed(duthost, psu_index) for psu_index in range(psu_num)]
    logging.info('Current speed={}'.format(current_speeds))

    # Cooling states below 6 are treated as 6 when computing the expected ratio.
    effective_state = max(cooling_cur_state, 6)
    expect_multiple = float(10) / effective_state

    for full_speed, current_speed in zip(full_speeds, current_speeds):
        if not (full_speed and current_speed):
            # A missing or zero reading cannot be compared; skip this PSU.
            continue
        actual_multiple = float(full_speed) / current_speed
        if actual_multiple < expect_multiple:
            assert actual_multiple > expect_multiple * (1 - PSU_SPEED_TOLERANCE)
        elif actual_multiple > expect_multiple:
            assert actual_multiple < expect_multiple * (1 + PSU_SPEED_TOLERANCE)
def test_set_psu_fan_speed(duthosts, rand_one_dut_hostname, mocker_factory):
    """
    Verify PSU fan speed follows the mocked presence of a single fan.

    With one fan mocked absent, the PSU fan speed is expected to match the
    maximum PSU fan speed (checked via check_psu_fan_speed with operator.eq).
    After the fan is mocked present again the speed is expected to differ from
    the maximum. If it does not and the cooling level is still
    MAX_COOLING_LEVEL, the test is skipped and the current
    'show platform temperature' output is attached to the skip reason;
    otherwise the test fails.

    The test is skipped when the platform's PSUs are not hot swappable.
    """
    duthost = duthosts[rand_one_dut_hostname]
    platform_data = get_platform_data(duthost)
    psu_num = platform_data['psus']['number']
    hot_swappable = platform_data['psus']['hot_swappable']
    if not hot_swappable:
        pytest.skip('The platform {} does not support this test case.'.format(duthost.facts["platform"]))

    psu_max_speed = get_psu_max_speed(duthost)
    logger.info('Create mocker, it may take a few seconds...')
    single_fan_mocker = mocker_factory(duthost, 'SingleFanMocker')

    logger.info('Mock FAN absence...')
    single_fan_mocker.mock_absence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME * 2, THERMAL_CONTROL_TEST_CHECK_INTERVAL, 0,
                      check_psu_fan_speed, duthost, psu_num, psu_max_speed, operator.eq), \
        'Wait for PSU fan speed change to full speed failed'

    logger.info('Mock FAN presence...')
    single_fan_mocker.mock_presence()
    wait_result = wait_until(THERMAL_CONTROL_TEST_WAIT_TIME * 2, THERMAL_CONTROL_TEST_CHECK_INTERVAL, 0,
                             check_psu_fan_speed, duthost, psu_num, psu_max_speed, operator.ne)
    if wait_result:
        return

    cooling_cur_state = get_cooling_cur_state(duthost)
    assert cooling_cur_state == MAX_COOLING_LEVEL, 'Wait for PSU fan speed change to normal failed'

    # Build a table of the current temperatures for the skip message.
    cmd_output = str(duthost.command('show platform temperature')['stdout_lines'])
    cmd_output = cmd_output.replace("u'", "").replace(',', " ")
    tokens = re.split(r' +', cmd_output)
    tokens.pop(0)
    # Group tokens into rows of 8 by slicing. Indexing fixed groups of 8 raised
    # IndexError whenever the token count was not a multiple of 8, which turned
    # the intended skip into an error; slicing just leaves a short last row.
    columns = 8
    table = [tokens[start:start + columns] for start in range(0, len(tokens), columns)]
    pytest.skip('Cooling level is still 10, ignore the rest test.\nIt might happen because the asic temperature is still high.\nCurrent system temperature:\n{}'.format(tabulate(table)))
def __init__(self, mock_helper, naming_rule, index):
    """
    Constructor of FAN drawer data.

    Picks the fan direction mocking method based on whether "201911" appears
    in the DUT os_version, names the drawer 'drawer<index>' on platforms with
    hot swappable fans ('N/A' otherwise), and resolves the sysfs file names
    from naming_rule. A file attribute is None when naming_rule has no entry
    for it.

    :param mock_helper: Instance of MockHelper.
    :param naming_rule: Naming rule of this kind of Fan drawer.
    :param index: Fan drawer index.
    """

    def resolve(rule_key):
        # Format the naming pattern with the drawer index, or None if the rule is absent.
        return naming_rule[rule_key].format(index) if rule_key in naming_rule else None

    self.index = index
    self.helper = mock_helper
    self.platform_data = get_platform_data(self.helper.dut)

    self.mock_fan_direction = (
        self.mock_fan_direction_fan_dir_for_all_fans
        if "201911" in self.helper.dut.os_version
        else self.mock_fan_direction_fan_dir_per_fan
    )

    fans_swappable = self.platform_data['fans']['hot_swappable']
    self.name = 'drawer{}'.format(index) if fans_swappable else 'N/A'

    self.fan_data_list = []
    self.mocked_presence = None
    self.mocked_direction = None

    self.presence_file = resolve('presence')
    self.led_capability_file = resolve('led_capability')
    self.led_green_file = resolve('led_green')
    self.led_red_file = resolve('led_red')
    self.led_orange_file = resolve('led_orange')
def mock_data(self):
    """
    Mock random data for all Thermals in this DUT.

    Walks every thermal category listed in the platform data. A category with
    a 'start' entry is mocked once per index in [start, start + number); a
    category without it is mocked once with index None.
    :return:
    """
    thermals = get_platform_data(self.mock_helper.dut)["thermals"]
    for category, content in thermals.items():
        count = int(content['number'])
        rule = THERMAL_NAMING_RULE[category]
        if 'start' in content:
            first = int(content['start'])
            indexes = range(first, first + count)
        else:
            # non index-able thermal
            indexes = [None]
        for thermal_index in indexes:
            thermal = TemperatureData(self.mock_helper, rule, thermal_index)
            self._do_mock(thermal)
def test_psu_absence_policy(duthosts, rand_one_dut_hostname, mocker_factory):
    """
    Mock one randomly chosen PSU as absent and expect PWM and fan speed to reach MAX_PWM.

    The test is skipped when the platform's PSUs are not hot swappable.
    """
    duthost = duthosts[rand_one_dut_hostname]
    psu_info = get_platform_data(duthost)['psus']
    if not psu_info['hot_swappable']:
        pytest.skip('The platform {} does not support this test case.'.format(
            duthost.facts["platform"]))

    psu_total = psu_info['number']
    psu_mocker = mocker_factory(duthost, 'PsuMocker')
    # PSU indexes passed to the mocker are 1-based: randint(1, psu_total) is inclusive.
    absent_psu = random.randint(1, psu_total)
    psu_mocker.mock_psu_status(absent_psu, False)

    pwm_reached_max = wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                                 THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                                 0,
                                 check_pwm,
                                 duthost,
                                 MAX_PWM,
                                 operator.eq)
    assert pwm_reached_max, 'PSU is absent, but PWM value is not turned to {}'.format(
        MAX_PWM)
    assert check_fan_speed(
        duthost, MAX_PWM), 'Fan speed is not turn to {}'.format(MAX_PWM)
def test_set_psu_fan_speed(duthosts, rand_one_dut_hostname, mocker_factory):
    """
    Check PSU fan speeds against the cooling level while a fan is mocked absent, then present.

    With a single fan mocked absent the cooling state is expected to reach 10
    and every PSU fan speed is range-checked for level 10. After the fan is
    mocked present the cooling state is expected to leave 10 and the speeds
    are range-checked for the current level (floored at 6).
    The test is skipped when the platform's PSUs are not hot swappable.
    """
    duthost = duthosts[rand_one_dut_hostname]
    psu_info = get_platform_data(duthost)['psus']
    psu_num = psu_info['number']
    if not psu_info['hot_swappable']:
        pytest.skip('The platform {} does not support this test case.'.format(
            duthost.facts["platform"]))

    def verify_psu_fan_speeds(max_speed, cooling_level):
        # Log and range-check the fan speed of every PSU for the given cooling level.
        for psu_index in range(psu_num):
            fan_speed = get_psu_speed(duthost, psu_index)
            logging.info('Speed for PSU {} fan is {}'.format(psu_index, fan_speed))
            _check_psu_fan_speed_in_range(fan_speed, max_speed, cooling_level)

    logging.info('Create mocker, it may take a few seconds...')
    single_fan_mocker = mocker_factory(duthost, 'SingleFanMocker')

    logging.info('Mock FAN absence...')
    single_fan_mocker.mock_absence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                      THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                      check_cooling_cur_state, duthost, 10, operator.eq), \
        'Current cooling state is {}'.format(get_cooling_cur_state(duthost))

    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_WAIT_TIME))
    time.sleep(THERMAL_CONTROL_TEST_WAIT_TIME)
    psu_max_speed = get_psu_max_speed(duthost)
    logging.info('Max PSU fan speed is {}'.format(psu_max_speed))
    verify_psu_fan_speeds(psu_max_speed, 10)

    logging.info('Mock FAN presence...')
    single_fan_mocker.mock_presence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                      THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                      check_cooling_cur_state, duthost, 10, operator.ne), \
        'Current cooling state is {}'.format(get_cooling_cur_state(duthost))

    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_CHECK_INTERVAL))
    time.sleep(THERMAL_CONTROL_TEST_CHECK_INTERVAL)
    cooling_cur_state = get_cooling_cur_state(duthost)
    logging.info('Cooling level changed to {}'.format(cooling_cur_state))

    # PSU fan speed will never be less than 60%
    effective_level = max(cooling_cur_state, 6)
    verify_psu_fan_speeds(psu_max_speed, effective_level)
def is_fan_removable(self):
    """
    Report whether the platform data marks fans as hot swappable.

    :return: True if FAN is removable else False
    """
    fans_info = get_platform_data(self.mock_helper.dut)['fans']
    return fans_info['hot_swappable']
def mock_data(self):
    """
    Mock random data for all FANs in this DUT.

    Chassis fans are grouped into drawers of MockerHelper.FAN_NUM_PER_DRAWER
    fans. Each drawer gets a random presence and direction; each fan in a
    present drawer gets a random status and a random speed in [60, 100].
    The expected row for every mocked fan is stored in self.expected_data
    keyed by fan name. PSU fans are then mocked with a random speed only.

    A SysfsNotExistError for one fan or drawer is logged and that item is
    skipped; it never aborts the remaining fans.
    :return:
    """
    drawer_data = None
    drawer_ready = False
    presence = 0
    naming_rule = FAN_NAMING_RULE['fan']

    # A for loop guarantees progress: the former while loop only advanced the
    # index after the drawer/fan objects were built, so an error raised before
    # that point retried the same fan forever.
    for fan_index in range(1, MockerHelper.FAN_NUM + 1):
        try:
            if (fan_index - 1) % MockerHelper.FAN_NUM_PER_DRAWER == 0:
                # First fan of a drawer: build and mock the drawer itself.
                drawer_ready = False
                drawer_index = (fan_index - 1) // MockerHelper.FAN_NUM_PER_DRAWER + 1
                drawer_data = FanDrawerData(self.mock_helper, naming_rule, drawer_index)
                self.drawer_list.append(drawer_data)
                presence = random.randint(0, 1)
                drawer_data.mock_presence(presence)
                drawer_data.mock_fan_direction(random.randint(0, 1))
                if drawer_data.mocked_presence == 'Present':
                    presence = 1
                drawer_ready = True

            if not drawer_ready:
                # The drawer of this fan could not be mocked, so there is no
                # consistent drawer state to derive the fan expectation from.
                logging.info('Skip fan {} because its drawer could not be mocked'.format(fan_index))
                continue

            fan_data = FanData(self.mock_helper, naming_rule, fan_index)
            drawer_data.fan_data_list.append(fan_data)
            if presence == 1:
                fan_data.mock_status(random.randint(0, 1))
                speed = random.randint(60, 100)
                fan_data.mock_speed(speed)
                fan_data.mock_target_speed(speed)
                self.expected_data[fan_data.name] = [
                    drawer_data.name,
                    'N/A',  # update this value later
                    fan_data.name,
                    '{}%'.format(fan_data.mocked_speed),
                    drawer_data.mocked_direction,
                    drawer_data.mocked_presence,
                    fan_data.mocked_status
                ]
            else:
                self.expected_data[fan_data.name] = [
                    drawer_data.name,
                    'red',
                    fan_data.name,
                    'N/A',
                    'N/A',
                    'Not Present',
                    'N/A'
                ]
        except SysfsNotExistError as e:
            logging.info('Failed to mock fan data: {}'.format(e))

    # update led color here
    for drawer_data in self.drawer_list:
        if drawer_data.mocked_presence != 'Present':
            continue
        for fan_data in drawer_data.fan_data_list:
            expected_data = self.expected_data.get(fan_data.name)
            if expected_data is None:
                # The fan was attached to the drawer but mocking it failed
                # before an expectation was recorded; nothing to update.
                continue
            expected_data[1] = drawer_data.get_expect_led_color()

    platform_data = get_platform_data(self.mock_helper.dut)
    psu_count = platform_data["psus"]["number"]
    naming_rule = FAN_NAMING_RULE['psu_fan']
    for index in range(1, psu_count + 1):
        # Reset per iteration so the error log never names a fan from a previous iteration.
        fan_data = None
        try:
            fan_data = FanData(self.mock_helper, naming_rule, index)
            speed = random.randint(60, 100)
            fan_data.mock_speed(speed)
            self.expected_data[fan_data.name] = [
                'N/A',
                '',
                fan_data.name,
                '{}%'.format(fan_data.mocked_speed),
                NOT_AVAILABLE,
                'Present',
                'OK'
            ]
        except SysfsNotExistError as e:
            fan_name = fan_data.name if fan_data is not None else 'PSU {} fan'.format(index)
            logging.info('Failed to mock fan data for {} - {}'.format(fan_name, e))
def check_sysfs(dut):
    """
    @summary: Check various hw-management related sysfs under /var/run/hw-management

    Collects sysfs facts from the DUT and asserts, in order:
      - there are no broken symbolic links
      - the ASIC temperature is within (0, 105) after dividing the raw value by 1000
      - every fan reports status '1' (hot swappable platforms only) and fault '0',
        and fan speeds are in range (re-checked for up to 30 seconds if not)
      - no CPU pack/core temperature has reached its critical temperature
      - every present, powered PSU is below its max temperature, has no
        temperature alarm and a fan speed above 1000 (hot swappable platforms only)
      - no SFP reports a temperature fault or exceeds its critical temperature

    @param dut: Host object providing sysfs_facts()
    """
    platform_data = get_platform_data(dut)
    sysfs_config = generate_sysfs_config(platform_data)
    logging.info("Collect mellanox sysfs facts")
    sysfs_facts = dut.sysfs_facts(config=sysfs_config)['ansible_facts']

    logging.info("Check broken symbolinks")
    broken_symbolinks = sysfs_facts['symbolink_info']['broken_links']
    assert len(broken_symbolinks) == 0, \
        "Found some broken symbolinks: {}".format(str(broken_symbolinks))

    logging.info("Check ASIC related sysfs")
    try:
        asic_temp = float(sysfs_facts['asic_info']['temp']) / 1000
        assert 0 < asic_temp < 105, "Abnormal ASIC temperature: {}".format(
            sysfs_facts['asic_info']['temp'])
    except Exception as e:
        # Any failure here (missing key, bad number, out of range) is reported
        # as bad ASIC sysfs content.
        assert False, "Bad content in /var/run/hw-management/thermal/asic: {}".format(
            repr(e))

    logging.info("Check fan related sysfs")
    fans_hot_swappable = platform_data["fans"]["hot_swappable"]
    for fan_id, fan_info in sysfs_facts['fan_info'].items():
        if fans_hot_swappable:
            assert fan_info['status'] == '1', "Fan {} status {} is not 1".format(
                fan_id, fan_info['status'])
        assert fan_info['fault'] == '0', "Fan {} fault status {} is not 0".format(
            fan_id, fan_info['fault'])

    if not _is_fan_speed_in_range(sysfs_facts):
        # Give the fans some time to settle before declaring a failure.
        sysfs_fan_config = [generate_sysfs_fan_config(platform_data)]
        assert wait_until(30, 5, _check_fan_speed_in_range, dut,
                          sysfs_fan_config), "Fan speed not in range"

    logging.info("Check CPU related sysfs")
    cpu_temp_high_counter = 0
    cpu_temp_list = []
    cpu_crit_temp_list = []
    cpu_pack_count = platform_data["cpu_pack"]["number"]
    if cpu_pack_count > 0:
        cpu_pack_temp = float(sysfs_facts['cpu_pack_info']['temp']) / 1000
        cpu_pack_max_temp = float(sysfs_facts['cpu_pack_info']['max_temp']) / 1000
        cpu_pack_crit_temp = float(sysfs_facts['cpu_pack_info']['crit_temp']) / 1000
        assert cpu_pack_max_temp <= cpu_pack_crit_temp, "Bad CPU pack max temp or critical temp, {}, {} ".format(
            str(cpu_pack_max_temp), str(cpu_pack_crit_temp))
        if cpu_pack_temp >= cpu_pack_crit_temp:
            cpu_temp_high_counter += 1
        cpu_temp_list.append(cpu_pack_temp)
        cpu_crit_temp_list.append(cpu_pack_crit_temp)

    for core_id, cpu_info in sysfs_facts['cpu_core_info'].items():
        cpu_core_temp = float(cpu_info["temp"]) / 1000
        cpu_core_max_temp = float(cpu_info["max_temp"]) / 1000
        cpu_core_crit_temp = float(cpu_info["crit_temp"]) / 1000
        assert cpu_core_max_temp <= cpu_core_crit_temp, "Bad CPU core{} max temp or critical temp, {}, {} ".format(
            core_id, str(cpu_core_max_temp), str(cpu_core_crit_temp))
        if cpu_core_temp >= cpu_core_crit_temp:
            cpu_temp_high_counter += 1
        cpu_temp_list.append(cpu_core_temp)
        cpu_crit_temp_list.append(cpu_core_crit_temp)

    if cpu_temp_high_counter > 0:
        # Log every reading first so the failure can be diagnosed from the log.
        logging.info("CPU temperatures {}".format(cpu_temp_list))
        logging.info("CPU critical temperatures {}".format(cpu_crit_temp_list))
        assert False, "At least {} of the CPU cores or pack is overheated".format(
            cpu_temp_high_counter)

    logging.info("Check PSU related sysfs")
    if platform_data["psus"]["hot_swappable"]:
        for psu_id, psu_info in sysfs_facts['psu_info'].items():
            psu_id = int(psu_id)
            psu_status = int(psu_info["status"])
            if not psu_status:
                logging.info("PSU {} doesn't exist, skipped".format(psu_id))
                continue
            psu_pwr_status = int(psu_info["pwr_status"])
            if not psu_pwr_status:
                logging.info("PSU {} isn't power on, skipped".format(psu_id))
                continue
            psu_temp = float(psu_info["temp"]) / 1000
            psu_max_temp = float(psu_info["max_temp"]) / 1000
            assert psu_temp < psu_max_temp, "PSU{} overheated, temp: {}".format(
                psu_id, str(psu_temp))
            assert psu_info["max_temp_alarm"] == '0', "PSU{} temp alarm set".format(psu_id)
            # Only the conversion is guarded. The threshold assertion sits
            # outside the try so a low fan speed actually fails the check
            # instead of being swallowed by the exception handler.
            try:
                psu_fan_speed = int(psu_info["fan_speed"])
            except (TypeError, ValueError) as e:
                assert False, "Invalid PSU fan speed value {} for PSU {}, exception: {}".format(
                    psu_info["fan_speed"], psu_id, e)
            assert psu_fan_speed > 1000, "Bad fan speed: {}".format(
                str(psu_fan_speed))

    logging.info("Check SFP related sysfs")
    for sfp_id, sfp_info in sysfs_facts['sfp_info'].items():
        # sfp_id is formatted with {} rather than %d so the message can be
        # built even when the id is not an integer.
        assert sfp_info["temp_fault"] == '0', "SFP{} temp fault".format(sfp_id)
        # A raw value of '0' is mapped to 0 without conversion.
        sfp_temp = float(sfp_info['temp']) if sfp_info['temp'] != '0' else 0
        sfp_temp_crit = float(
            sfp_info['crit_temp']) if sfp_info['crit_temp'] != '0' else 0
        sfp_temp_emergency = float(
            sfp_info['emergency_temp']) if sfp_info['emergency_temp'] != '0' else 0
        if sfp_temp_crit != 0:
            assert sfp_temp < sfp_temp_crit, "SFP{} overheated, temp{}".format(
                sfp_id, str(sfp_temp))
            assert sfp_temp_crit < sfp_temp_emergency, "Wrong SFP critical temp or emergency temp, " \
                "critical temp: {} emergency temp: {}".format(
                    str(sfp_temp_crit), str(sfp_temp_emergency))

    logging.info("Finish checking sysfs")
def test_set_psu_fan_speed(duthosts, rand_one_dut_hostname, mocker_factory):
    """
    Check PSU fan speeds against the cooling level while a fan is mocked absent, then present.

    With a single fan mocked absent the cooling state is expected to reach
    MAX_COOLING_LEVEL and every PSU fan speed is range-checked for that level.
    After the fan is mocked present the test waits for the cooling state to
    drop. If it is still MAX_COOLING_LEVEL the test is skipped with the
    current 'show platform temperature' output attached; otherwise the PSU fan
    speeds are range-checked for the current level (floored at 6).

    The test is skipped when the platform's PSUs are not hot swappable.
    """
    duthost = duthosts[rand_one_dut_hostname]
    platform_data = get_platform_data(duthost)
    psu_num = platform_data['psus']['number']
    hot_swappable = platform_data['psus']['hot_swappable']
    if not hot_swappable:
        pytest.skip('The platform {} does not support this test case.'.format(
            duthost.facts["platform"]))

    logging.info('Create mocker, it may take a few seconds...')
    single_fan_mocker = mocker_factory(duthost, 'SingleFanMocker')

    logging.info('Mock FAN absence...')
    single_fan_mocker.mock_absence()
    assert wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
                      THERMAL_CONTROL_TEST_CHECK_INTERVAL,
                      check_cooling_cur_state, duthost, MAX_COOLING_LEVEL, operator.eq), \
        'Current cooling state is {}'.format(get_cooling_cur_state(duthost))

    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_WAIT_TIME))
    time.sleep(THERMAL_CONTROL_TEST_WAIT_TIME)
    psu_max_speed = get_psu_max_speed(duthost)
    logging.info('Max PSU fan speed is {}'.format(psu_max_speed))
    for index in range(psu_num):
        speed = get_psu_speed(duthost, index)
        logging.info('Speed for PSU {} fan is {}'.format(index, speed))
        _check_psu_fan_speed_in_range(speed, psu_max_speed, MAX_COOLING_LEVEL)

    logging.info('Mock FAN presence...')
    single_fan_mocker.mock_presence()
    # The result is deliberately not asserted: a cooling level that stays at
    # MAX_COOLING_LEVEL is handled by the skip below.
    wait_until(THERMAL_CONTROL_TEST_WAIT_TIME,
               THERMAL_CONTROL_TEST_CHECK_INTERVAL,
               check_cooling_cur_state, duthost, MAX_COOLING_LEVEL, operator.ne)
    logging.info('Wait {} seconds for the policy to take effect...'.format(
        THERMAL_CONTROL_TEST_WAIT_TIME * 2))
    # We have to wait THERMAL_CONTROL_TEST_WAIT_TIME * 2 seconds here because:
    # usually waiting THERMAL_CONTROL_TEST_WAIT_TIME seconds is enough for the
    # thermal control daemon to change the cooling level to a proper value.
    # However, there is a chance that the kernel changes the cooling state back
    # to MAX_COOLING_LEVEL after user space thermal control adjusted it to the
    # dynamic minimum value. So we have to wait longer for user space thermal
    # control to set the fan speed to the dynamic minimum value again, which
    # means we might need to wait up to 2 thermal loops here.
    time.sleep(THERMAL_CONTROL_TEST_WAIT_TIME * 2)

    cooling_cur_state = get_cooling_cur_state(duthost)
    if cooling_cur_state == MAX_COOLING_LEVEL:
        # Build a table of the current temperatures for the skip message.
        cmd_output = str(
            duthost.command('show platform temperature')['stdout_lines'])
        cmd_output = cmd_output.replace("u'", "").replace(',', " ")
        tokens = re.split(r' +', cmd_output)
        tokens.pop(0)
        # Group tokens into rows of 8 by slicing. Indexing fixed groups of 8
        # raised IndexError whenever the token count was not a multiple of 8,
        # which turned the intended skip into an error; slicing just leaves a
        # short last row.
        columns = 8
        table = [tokens[start:start + columns]
                 for start in range(0, len(tokens), columns)]
        pytest.skip(
            'Cooling level is still 10, ignore the rest test.\nIt might happen because the asic temperature is still high.\nCurrent system temperature:\n{}'
            .format(tabulate(table)))

    logging.info('Cooling level changed to {}'.format(cooling_cur_state))
    if cooling_cur_state < 6:
        # PSU fan speed will never be less than 60%
        cooling_cur_state = 6
    for index in range(psu_num):
        speed = get_psu_speed(duthost, index)
        logging.info('Speed for PSU {} fan is {}'.format(index, speed))
        _check_psu_fan_speed_in_range(speed, psu_max_speed, cooling_cur_state)