def __init__(self, config=RAMFS_CONFIG, zone_config=CONFIG_DIR, log_level="warning"):
    """Start logging, load the FSC configuration and seed every tunable.

    Args:
        config: value handed to ``get_fsc_config`` to load the settings.
        zone_config: stored unchanged on ``self.zone_config``.
        log_level: level name handed to ``Logger.start``.
    """
    Logger.start("fscd", log_level)
    Logger.info("Starting fscd")

    self.zone_config = zone_config
    # json dump from config
    self.fsc_config = self.get_fsc_config(config)

    # Values that start from the class-level defaults.
    self.boost = self.DEFAULT_BOOST
    self.boost_type = self.DEFAULT_BOOST_TYPE
    self.transitional = self.DEFAULT_TRANSITIONAL
    self.ramp_rate = self.DEFAULT_RAMP_RATE

    # Boost related options, unset until a configuration provides them.
    self.non_fanfail_limited_boost = None
    self.pwm_sensor_boost_value = None
    self.output_max_boost_pwm = False

    # Sensor handling options.
    self.sensor_fail = None
    self.sensor_valid_check = None
    self.fail_sensor_type = None
    self.ssd_progressive_algorithm = None
    self.sensor_filter_all = False

    # Fan failure / recovery bookkeeping.
    self.fan_fail = None
    self.fan_dead_boost = None
    self.fan_recovery_pending = False
    self.fan_recovery_time = None

    # Optional PWM clamps.
    self.fan_limit_upper_pwm = None
    self.fan_limit_lower_pwm = None
def get_config_params(self):
    """Copy the tunables held in ``self.fsc_config`` onto the instance.

    ``pwm_transition_value``, ``pwm_boost_value``, ``watchdog`` and
    ``sample_interval_ms`` are read unconditionally, so a configuration
    without them raises ``KeyError``. The remaining keys are optional:
    some fall back to ``False``, the others leave the attribute untouched
    when the key is absent.
    """
    cfg = self.fsc_config

    self.transitional = cfg["pwm_transition_value"]
    self.boost = cfg["pwm_boost_value"]

    # Optional PWM limits: only overwritten when configured.
    if "fan_limit_upper_pwm" in cfg:
        self.fan_limit_upper_pwm = cfg["fan_limit_upper_pwm"]
    if "fan_limit_lower_pwm" in cfg:
        self.fan_limit_lower_pwm = cfg["fan_limit_lower_pwm"]
    if "non_fanfail_limited_boost_value" in cfg:
        self.non_fanfail_limited_boost = cfg["non_fanfail_limited_boost_value"]
    self.sensor_filter_all = cfg.get("sensor_filter_all", False)

    # The "boost" section is optional; an empty mapping simply makes the
    # membership tests below fail.
    boost_cfg = cfg.get("boost", {})
    if "fan_fail" in boost_cfg:
        self.fan_fail = boost_cfg["fan_fail"]
    if "progressive" in boost_cfg and boost_cfg["progressive"]:
        self.boost_type = "progressive"

    if "fan_dead_boost" in cfg:
        self.fan_dead_boost = cfg["fan_dead_boost"]
        self.all_fan_fail_counter = 0
    if "output_max_boost_pwm" in cfg:
        self.output_max_boost_pwm = cfg["output_max_boost_pwm"]

    if "sensor_fail" in boost_cfg:
        self.sensor_fail = boost_cfg["sensor_fail"]
        if self.sensor_fail:
            # These options are only consulted when sensor-fail boost is on.
            if "pwm_sensor_boost_value" in cfg:
                self.pwm_sensor_boost_value = cfg["pwm_sensor_boost_value"]
            if "fail_sensor_type" in cfg:
                self.fail_sensor_type = cfg["fail_sensor_type"]
            if "ssd_progressive_algorithm" in cfg:
                self.ssd_progressive_algorithm = cfg["ssd_progressive_algorithm"]
    if "sensor_valid_check" in cfg:
        self.sensor_valid_check = cfg["sensor_valid_check"]

    self.watchdog = cfg["watchdog"]
    self.fanpower = cfg.get("fanpower", False)
    self.chassis_intrusion = cfg.get("chassis_intrusion", False)
    self.enable_fsc_sensor_check = cfg.get("enable_fsc_sensor_check", False)
    if "ramp_rate" in cfg:
        self.ramp_rate = cfg["ramp_rate"]

    if self.watchdog:
        Logger.info("watchdog pinging enabled")
        kick_watchdog()

    # Configuration is in milliseconds; the attribute is kept in seconds.
    self.interval = cfg["sample_interval_ms"] / 1000.0
    if "fan_recovery_time" in cfg:
        self.fan_recovery_time = cfg["fan_recovery_time"]
def build_zones(self):
    """Compile every configured zone expression into a ``Zone`` object.

    For each entry under ``self.fsc_config["zones"]`` the expression file
    is read from ``self.zone_config``, compiled with
    ``fsc_expr.make_eval_tree`` and wrapped in a ``Zone``. The boards
    referenced by the expression are registered on ``self.machine``.
    """
    self.zones = []
    for counter, data in enumerate(self.fsc_config["zones"].values()):
        expr_path = os.path.join(self.zone_config, data["expr_file"])
        with open(expr_path, "r") as exf:
            source = exf.read()
        Logger.info("Compiling FSC expression for zone:")
        Logger.info(source)
        (expr, inf) = fsc_expr.make_eval_tree(source, self.profiles)

        for ext_var in inf["ext_vars"]:
            # ext_var is colon separated; the first field is the board and
            # an optional third field is appended to the board's nums list.
            fields = ext_var.split(":")
            board = fields[0]
            if board not in self.machine.frus:
                self.machine.nums[board] = []
                self.machine.frus.add(board)
            if len(fields) == 3:
                self.machine.nums[board].append(fields[2])

        self.zones.append(
            Zone(
                data["pwm_output"],
                expr,
                inf,
                self.transitional,
                counter,
                self.boost,
                self.sensor_fail,
                self.sensor_valid_check,
                self.fail_sensor_type,
                self.ssd_progressive_algorithm,
            )
        )
def handle_term(signum, frame):
    """Signal handler: reset the fans, log the signal and exit.

    The fans are first driven to ``DEFAULT_INIT_TRANSITIONAL`` through the
    ``init_fans`` callout. When the signal is ``SIGQUIT`` and ``wdfile`` is
    truthy the watchdog is stopped before the process exits.

    Args:
        signum: number of the received signal.
        frame: current stack frame (unused).
    """
    global wdfile
    board_callout(callout='init_fans', boost=DEFAULT_INIT_TRANSITIONAL)
    Logger.warn("killed by signal %d" % (signum, ))

    if signum == signal.SIGQUIT:
        if wdfile:
            Logger.info("Killed with SIGQUIT - stopping watchdog.")
            stop_watchdog()

    sys.exit('killed')
def board_callout(callout="None", **kwargs):
    """Run a board specific callout.

    Only ``init_fans`` is implemented: every PWM is set to the ``boost``
    keyword argument (100 when not given) via ``yamp_set_all_pwm``. Any
    other callout is logged and ignored.

    Returns:
        The result of ``yamp_set_all_pwm`` for ``init_fans``, else ``None``.
    """
    if "init_fans" not in callout:
        Logger.warn("Need to perform callout action %s" % callout)
        return None

    boost = kwargs.get("boost", 100)
    Logger.info("FSC init fans to boost=%s " % str(boost))
    return yamp_set_all_pwm(boost)
def host_shutdown():
    """Power off the userver, wait five seconds, then cut main power.

    Each step writes ``0`` to a syscpld sysfs attribute through the shell.

    Returns:
        The bytes the second (main power) shell command wrote to stdout.
    """
    MAIN_POWER = "/sys/bus/i2c/drivers/syscpld/12-0031/pwr_main_n"
    USERVER_POWER = "/sys/bus/i2c/drivers/syscpld/12-0031/pwr_usrv_en"

    cmd = 'echo 0 > ' + USERVER_POWER
    Logger.info("host_shutdown() executing {}".format(cmd))
    # communicate() drains stdout AND waits for the child, so the process
    # is reaped and its pipe closed (a bare stdout.read() does neither).
    Popen(cmd, shell=True, stdout=PIPE).communicate()

    time.sleep(5)

    cmd = 'echo 0 > ' + MAIN_POWER
    Logger.info("host_shutdown() executing {}".format(cmd))
    response = Popen(cmd, shell=True, stdout=PIPE).communicate()[0]
    return response
def stop_watchdog():
    """Stop the watchdog device.

    Runs ``WDTCLI_CMD + " stop"`` through the shell. Any output on stderr
    is treated as a failure and logged as an error; otherwise success is
    logged. Nothing is returned.
    """
    proc = subprocess.Popen(WDTCLI_CMD + " stop",
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # communicate() waits for the command and collects both streams.
    _, err = proc.communicate()
    if err:
        Logger.error("failed to stop watchdog device")
    else:
        Logger.info("watchdog stopped")
def host_shutdown():
    """Disable COM-e power, wait five seconds, then cut main power.

    Each step writes ``0`` to a CPLD sysfs attribute through the shell.

    Returns:
        The bytes the second (main power) shell command wrote to stdout.
    """
    MAIN_POWER = "/sys/bus/i2c/drivers/smbcpld/12-003e/cpld_in_p1220"
    USERVER_POWER = "/sys/bus/i2c/drivers/scmcpld/2-0035/com_exp_pwr_enable"

    cmd = 'echo 0 > ' + USERVER_POWER
    Logger.info("host_shutdown() executing {}".format(cmd))
    # communicate() drains stdout AND waits for the child, so the process
    # is reaped and its pipe closed (a bare stdout.read() does neither).
    Popen(cmd, shell=True, stdout=PIPE).communicate()

    time.sleep(5)

    cmd = 'echo 0 > ' + MAIN_POWER
    Logger.info("host_shutdown() executing {}".format(cmd))
    response = Popen(cmd, shell=True, stdout=PIPE).communicate()[0]
    return response
def board_callout(callout="None", **kwargs):
    """
    Board specific callout hook; override it to add platform actions
    (for example chassis intrusion handling).

    Only ``init_fans`` is implemented here: every PWM is set to the
    ``boost`` keyword argument (100 when not given). Other callouts are
    logged and ignored, returning ``None``.
    """
    if "init_fans" not in callout:
        Logger.warn("Need to perform callout action %s" % callout)
        return None

    boost = kwargs.get("boost", 100)
    Logger.info("FSC init fans to boost=%s " % str(boost))
    return set_all_pwm(boost)
def board_callout(callout='None', **kwargs):
    '''
    Board specific callout hook; override it to add platform actions
    (for example chassis intrusion handling).

    Only 'init_fans' is implemented here: every PWM is set to the
    'boost' keyword argument (100 when not given). Other callouts are
    logged and ignored, returning None.
    '''
    if 'init_fans' not in callout:
        Logger.warn("Need to perform callout action %s" % callout)
        return None

    boost = kwargs.get('boost', 100)
    Logger.info("FSC init fans to boost=%s " % str(boost))
    return set_all_pwm(boost)
def board_callout(callout="None", **kwargs):
    """
    Board specific callout hook; override it to add platform actions
    (for example chassis intrusion handling).

    ``read_power`` returns ``bmc_read_power()``; ``init_fans`` sets every
    PWM to the ``boost`` keyword argument (100 when not given). Any other
    callout is ignored and ``None`` is returned.
    """
    if "read_power" in callout:
        return bmc_read_power()
    if "init_fans" not in callout:
        return None

    # define a boost for the platform or respect fscd override
    boost = kwargs.get("boost", 100)
    Logger.info("FSC init fans to boost=%s " % str(boost))
    return set_all_pwm(boost)
def board_callout(callout='None', **kwargs):
    '''
    Board specific callout hook; override it to add platform actions
    (for example chassis intrusion handling).

    'read_power' returns bmc_read_power(); 'init_fans' sets every PWM to
    the 'boost' keyword argument (100 when not given). Any other callout
    is ignored and None is returned.
    '''
    if 'read_power' in callout:
        return bmc_read_power()
    if 'init_fans' not in callout:
        return None

    # define a boost for the platform or respect fscd override
    boost = kwargs.get('boost', 100)
    Logger.info("FSC init fans to boost=%s " % str(boost))
    return set_all_pwm(boost)
def update_dead_fans(self, dead_fans):
    '''
    Refresh the set of dead fans from the current fan speeds.

    A fan whose RPM is below fsc_config['min_rpm'] is added to dead_fans
    (mutated in place), any other fan is removed from it. Newly failed
    and recovered fans are logged and a per-fan record file under
    FAN_FAIL_RECORD_DIR is created / removed accordingly.

    Returns the same dead_fans set that was passed in.
    '''

    def alert(message):
        # Logged as a warning when self.fanpower is truthy, else critical.
        (Logger.warn if self.fanpower else Logger.crit)(message)

    previously_dead = dead_fans.copy()
    speeds = self.machine.read_fans(self.fans)

    print("\x1b[2J\x1b[H")
    sys.stdout.flush()

    for fan, rpms in speeds.items():
        Logger.info("%s speed: %d RPM" % (fan.label, rpms))
        if rpms >= self.fsc_config['min_rpm']:
            dead_fans.discard(fan)
            continue
        dead_fans.add(fan)
        self.fsc_fan_action(fan, action='dead')

    recovered = previously_dead - dead_fans
    newly_dead = dead_fans - previously_dead

    if newly_dead:
        alert("%d fans failed" % (len(dead_fans), ))
        # Every currently dead fan is reported, not only the new ones.
        for dead_fan in dead_fans:
            alert("%s dead, %d RPM" % (dead_fan.label, speeds[dead_fan]))
            Logger.usbdbg("%s fail" % (dead_fan.label))
            record_path = FAN_FAIL_RECORD_DIR + '%s' % (dead_fan.label)
            if not os.path.isfile(record_path):
                # Create an empty marker file.
                with open(record_path, 'w'):
                    pass

    for fan in recovered:
        alert("%s has recovered" % (fan.label, ))
        Logger.usbdbg("%s recovered" % (fan.label))
        self.fsc_fan_action(fan, action='recover')
        record_path = FAN_FAIL_RECORD_DIR + '%s' % (fan.label)
        if os.path.isfile(record_path):
            os.remove(record_path)

    return dead_fans
def __init__(self, config=RAMFS_CONFIG, zone_config=CONFIG_DIR, log_level='warning'):
    '''
    Start logging, load the FSC configuration and seed the tunables.

    config is handed to get_fsc_config, zone_config is stored unchanged
    and log_level is handed to Logger.start.
    '''
    Logger.start("fscd", log_level)
    Logger.info("Starting fscd")

    self.zone_config = zone_config
    # json dump from config
    self.fsc_config = self.get_fsc_config(config)

    # Values that start from the class-level defaults.
    self.boost = self.DEFAULT_BOOST
    self.boost_type = self.DEFAULT_BOOST_TYPE
    self.transitional = self.DEFAULT_TRANSITIONAL
    self.ramp_rate = self.DEFAULT_RAMP_RATE

    # Sensor handling options, unset until configured.
    self.sensor_fail = None
    self.sensor_valid_check = None
    self.fail_sensor_type = None
    self.ssd_progressive_algorithm = None

    # Fan failure options, unset until configured.
    self.fan_fail = None
    self.fan_dead_boost = None
def run(self): """ Main FSCD method that builds from the fscd config and runs """ # Get everything from json and build profiles, fans, zones self.builder() self.fail_record_dir() self.machine.set_all_pwm(self.fans, self.transitional) self.fsc_set_all_fan_led(color="led_blue") mode = fan_mode["trans_mode"] self.zones[0].get_set_fan_mode(mode, action="write") last = time.time() dead_fans = set() if self.fanpower: time.sleep(30) while True: if self.watchdog: kick_watchdog() time.sleep(self.interval) if self.fanpower: if not self.get_fan_power_status(): self.fan_recovery_pending = True continue if self.fan_fail: if self.fan_recovery_pending and self.fan_recovery_time != None: # Accelerating, wait for a while time.sleep(self.fan_recovery_time) self.fan_recovery_pending = False # Get dead fans for determining speed dead_fans = self.update_dead_fans(dead_fans) now = time.time() time_difference = now - last last = now Logger.info("time_difference: %f" % (time_difference)) # Check sensors and update zones self.update_zones(dead_fans, time_difference)
def board_callout(callout="None", **kwargs):
    """
    Board specific callout hook.

    ``init_fans`` sets every PWM to the ``boost`` keyword argument (100
    when not given). ``chassis_intrusion`` counts present fans and PSUs
    with ``presence_util.sh`` and returns 1 when fewer than four fans are
    present, otherwise 0; a missing PSU is only logged. Any other callout
    is logged and ignored.
    """

    def _count_present(cmd, pattern):
        # Count output lines matching `pattern` whose captured number is 1.
        output = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
        present = 0
        for line in output.split("\n"):
            m = re.match(pattern, line)
            if m is not None and int(m.group(1)) == 1:
                present += 1
        return present

    if "init_fans" in callout:
        boost = kwargs.get("boost", 100)
        Logger.info("FSC init fans to boost=%s " % str(boost))
        return set_all_pwm(boost)

    if "chassis_intrusion" in callout:
        # fan present
        fan_presence = _count_present("presence_util.sh fan",
                                      r"fan.*\s:\s+(\d+)")
        # psu present
        psu_presence = _count_present("presence_util.sh psu",
                                      r"psu.*\s:\s+(\d+)")

        tray_pull_out = 0
        if fan_presence < 4:
            Logger.warn("chassis_intrusion Found Fan absent (%d/4)" %
                        (fan_presence))
            tray_pull_out = 1
        if psu_presence < 2:
            Logger.warn("chassis_intrusion Found PSU absent (%d/2)" %
                        (psu_presence))
        return tray_pull_out

    Logger.warn("Need to perform callout action %s" % callout)
    return None
def build_zones(self):
    '''
    Compile every configured zone expression into a Zone object.

    For each entry under fsc_config['zones'] the expression file is read
    from self.zone_config, compiled with fsc_expr.make_eval_tree and
    wrapped in a Zone. The board of every external variable is added to
    self.machine.frus.
    '''
    self.zones = []
    for counter, data in enumerate(self.fsc_config['zones'].values()):
        expr_path = os.path.join(self.zone_config, data['expr_file'])
        with open(expr_path, 'r') as exf:
            source = exf.read()
        Logger.info("Compiling FSC expression for zone:")
        Logger.info(source)
        (expr, inf) = fsc_expr.make_eval_tree(source, self.profiles)

        for ext_var in inf['ext_vars']:
            # Exactly two colon separated fields are expected here.
            board, _sname = ext_var.split(':')
            self.machine.frus.add(board)

        self.zones.append(
            Zone(data['pwm_output'], expr, inf, self.transitional, counter,
                 self.boost, self.fail_sensor_type,
                 self.ssd_progressive_algorithm))
def get_config_params(self):
    '''
    Copy the tunables held in self.fsc_config onto the instance.

    'pwm_transition_value', 'pwm_boost_value', 'watchdog' and
    'sample_interval_ms' are read unconditionally, so a configuration
    without them raises KeyError. The remaining keys are optional.

    When the watchdog is enabled /dev/watchdog is opened and primed with
    a 'V' write; if the device cannot be opened the error is logged and
    self.wdfile stays None.
    '''
    cfg = self.fsc_config

    self.transitional = cfg['pwm_transition_value']
    self.boost = cfg['pwm_boost_value']

    # The 'boost' section is optional; an empty mapping simply makes the
    # membership tests below fail.
    boost_cfg = cfg.get('boost', {})
    if 'fan_fail' in boost_cfg:
        self.fan_fail = boost_cfg['fan_fail']
    if 'progressive' in boost_cfg and boost_cfg['progressive']:
        self.boost_type = 'progressive'
    if 'fan_dead_boost' in cfg:
        self.fan_dead_boost = cfg['fan_dead_boost']
    if 'sensor_fail' in boost_cfg and boost_cfg['sensor_fail']:
        # These options are only consulted when sensor-fail boost is on.
        if 'fail_sensor_type' in cfg:
            self.fail_sensor_type = cfg['fail_sensor_type']
        if 'ssd_progressive_algorithm' in cfg:
            self.ssd_progressive_algorithm = cfg['ssd_progressive_algorithm']

    self.watchdog = cfg['watchdog']
    self.fanpower = cfg.get('fanpower', False)
    self.chassis_intrusion = cfg.get('chassis_intrusion', False)
    if 'ramp_rate' in cfg:
        self.ramp_rate = cfg['ramp_rate']

    self.wdfile = None
    if self.watchdog:
        Logger.info("watchdog pinging enabled")
        try:
            # open() signals failure by raising, never by returning a
            # falsy object, so the error path has to be an except clause.
            self.wdfile = open('/dev/watchdog', 'w+')
        except (IOError, OSError):
            Logger.error("couldn't open watchdog device")
        else:
            self.wdfile.write('V')
            self.wdfile.flush()

    # Configuration is in milliseconds; the attribute is kept in seconds.
    self.interval = cfg['sample_interval_ms'] / 1000.0
def get_config_params(self):
    '''
    Copy the tunables held in self.fsc_config onto the instance.

    'pwm_transition_value', 'pwm_boost_value', 'watchdog' and
    'sample_interval_ms' are read unconditionally, so a configuration
    without them raises KeyError. The remaining keys are optional: some
    fall back to False, the others leave the attribute untouched when
    the key is absent.
    '''
    cfg = self.fsc_config

    self.transitional = cfg['pwm_transition_value']
    self.boost = cfg['pwm_boost_value']

    # The 'boost' section is optional; an empty mapping simply makes the
    # membership tests below fail.
    boost_cfg = cfg.get('boost', {})
    if 'fan_fail' in boost_cfg:
        self.fan_fail = boost_cfg['fan_fail']
    if 'progressive' in boost_cfg and boost_cfg['progressive']:
        self.boost_type = 'progressive'

    if 'fan_dead_boost' in cfg:
        self.fan_dead_boost = cfg['fan_dead_boost']
        self.all_fan_fail_counter = 0

    if 'sensor_fail' in boost_cfg:
        self.sensor_fail = boost_cfg['sensor_fail']
        if self.sensor_fail:
            # These options are only consulted when sensor-fail boost is on.
            if 'fail_sensor_type' in cfg:
                self.fail_sensor_type = cfg['fail_sensor_type']
            if 'ssd_progressive_algorithm' in cfg:
                self.ssd_progressive_algorithm = cfg['ssd_progressive_algorithm']
    if 'sensor_valid_check' in cfg:
        self.sensor_valid_check = cfg['sensor_valid_check']

    self.watchdog = cfg['watchdog']
    self.fanpower = cfg.get('fanpower', False)
    self.chassis_intrusion = cfg.get('chassis_intrusion', False)
    if 'ramp_rate' in cfg:
        self.ramp_rate = cfg['ramp_rate']

    if self.watchdog:
        Logger.info("watchdog pinging enabled")
        kick_watchdog()

    # Configuration is in milliseconds; the attribute is kept in seconds.
    self.interval = cfg['sample_interval_ms'] / 1000.0
    if 'fan_recovery_time' in cfg:
        self.fan_recovery_time = cfg['fan_recovery_time']
def update_dead_fans(self, dead_fans):
    '''
    Refresh the set of dead fans from the current fan speeds.

    A fan whose RPM is below fsc_config['min_rpm'] is added to dead_fans
    (mutated in place), any other fan is removed from it. Newly failed
    and recovered fans are logged.

    Returns the same dead_fans set that was passed in.
    '''

    def alert(message):
        # Logged as a warning when self.fanpower is truthy, else critical.
        (Logger.warn if self.fanpower else Logger.crit)(message)

    previously_dead = dead_fans.copy()
    speeds = self.machine.read_fans(self.fans)

    print("\x1b[2J\x1b[H")
    sys.stdout.flush()

    for fan, rpms in speeds.items():
        Logger.info("%s speed: %d RPM" % (fan.label, rpms))
        if rpms >= self.fsc_config['min_rpm']:
            dead_fans.discard(fan)
            continue
        dead_fans.add(fan)
        self.fsc_fan_action(fan, action='dead')

    recovered = previously_dead - dead_fans
    newly_dead = dead_fans - previously_dead

    if newly_dead:
        alert("%d fans failed" % (len(dead_fans), ))
        # Every currently dead fan is reported, not only the new ones.
        for dead_fan in dead_fans:
            alert("%s dead, %d RPM" % (dead_fan.label, speeds[dead_fan]))
            Logger.usbdbg("%s fail" % (dead_fan.label))

    for fan in recovered:
        alert("%s has recovered" % (fan.label, ))
        Logger.usbdbg("%s recovered" % (fan.label))
        self.fsc_fan_action(fan, action='recover')

    return dead_fans
def host_shutdown():
    """Power off the SCM, then the switch ASIC matching the board type.

    The SCM power-off command always runs. After a five second pause the
    switch power-off command for the detected board ("Wedge400" or
    "Wedge400C") runs; for any other board type a critical message is
    logged and the switch is left alone.

    Returns:
        The stdout bytes of the last shell command that was executed.
    """
    # NOTE(review): these commands use `&>` and `source`, which depend on
    # the shell that `shell=True` resolves to - confirm on the target.
    SCM_POWER_COMMAND = "/usr/local/bin/wdtcli kick &> /dev/null; /usr/local/bin/wedge_power.sh off"
    TH_SWITCH_POWER_COMMAND = "source /usr/local/bin/openbmc-utils.sh; echo 0 > $SMBCPLD_SYSFS_DIR/th3_turn_on"
    GB_SWITCH_POWER_COMMAND = "source /usr/local/bin/openbmc-utils.sh; echo 0 > $SMBCPLD_SYSFS_DIR/gb_turn_on"

    switch_poweroff_cmd = ""
    brd_type = pal_get_board_type()
    if brd_type == "Wedge400":
        switch_poweroff_cmd = TH_SWITCH_POWER_COMMAND
    elif brd_type == "Wedge400C":
        switch_poweroff_cmd = GB_SWITCH_POWER_COMMAND
    else:
        Logger.crit("Cannot identify board type: %s" % brd_type)
        Logger.crit("Switch won't be resetting!")

    Logger.info("host_shutdown() executing {}".format(SCM_POWER_COMMAND))
    # communicate() drains stdout AND waits for the child, so the process
    # is reaped and its pipe closed (a bare stdout.read() does neither).
    response = Popen(SCM_POWER_COMMAND, shell=True,
                     stdout=PIPE).communicate()[0]

    time.sleep(5)

    if switch_poweroff_cmd:
        Logger.info("host_shutdown() executing {}".format(switch_poweroff_cmd))
        response = Popen(switch_poweroff_cmd, shell=True,
                         stdout=PIPE).communicate()[0]
    return response
def yamp_host_shutdown():
    """Best-effort shutdown: power off the SCD, then the x86 CPU.

    Always returns 0.
    """
    # Do the best effort by :
    # 1. Turn off SCD
    # 2. Then turn off CPU
    # (PSUs are intentionally left on, see the note at the end.)
    # We do this because, if any one of CPLD/FPGA is already
    # malfunctioning, we still want to turn off as much part of
    # the system as possible.
    # NOTE(review): the previous header comment listed the CPU first and
    # the PSUs as a third step; the code below powers off the SCD first
    # and never touches the PSUs - confirm the intended order.
    SCD_POWER_REG = "/sys/bus/i2c/drivers/scdcpld/4-0023/scd_power_en"
    CPU_OFF = "/usr/local/bin/wedge_power.sh off"
    SCD_OFF = "echo 0 > " + SCD_POWER_REG
    # First, turn off most of the switch board
    Logger.info("host_shutdown() executing {}".format(SCD_OFF))
    yamp_force_run_cmd(SCD_OFF)
    time.sleep(3)
    # Then, turn off X86 CPU
    Logger.info("host_shutdown() executing {}".format(CPU_OFF))
    yamp_force_run_cmd(CPU_OFF)
    # Until FSCD is proven to be very stable on most versions of FSCD,
    # we will only turn off SCD and CPU, but not PSUs.
    # (When PSUs are all turned off, it's hard to recover in DC)
    return 0
def builder(self):
    '''
    Build every internal data structure from the json configuration:
    machine, config parameters, fans, profiles and zones (in that order).
    '''
    # Build a bmc machine object - read/write sensors
    self.build_machine()

    # Extract everything from json
    self.get_config_params()
    self.build_fans()
    self.build_profiles()
    profile_names = ", ".join(self.profiles)
    Logger.info("Available profiles: " + profile_names)

    self.build_zones()
    Logger.info("Read %d zones" % (len(self.zones)))
    fru_names = ", ".join(self.machine.frus)
    Logger.info("Including sensors from: " + fru_names)
def run(self, sensors, dt):
    """Evaluate this zone's expression and return a PWM clamped to 0..100.

    Args:
        sensors: mapping indexed as ``sensors[board][sname]`` whose values
            expose ``value``, ``status`` and ``name``.
        dt: stored in the evaluation context under ``'dt'``.

    Returns:
        The expression result, raised to a minimum (transitional or boost
        value) when sensor problems were seen, then clamped to 0..100.
    """
    ctx = {'dt': dt}
    # Lower bound applied to the expression result at the end.
    outmin = 0
    fail_ssd_count = 0
    missing = set()
    for v in self.expr_meta['ext_vars']:
        # External variables are named "<board>:<sensor name>".
        board, sname = v.split(":")
        if sname in sensors[board]:
            sensor = sensors[board][sname]
            ctx[v] = sensor.value
            if sensor.status in ['ucr']:
                Logger.warn('Sensor %s reporting status %s' %
                            (sensor.name, sensor.status))
                outmin = self.transitional
            if self.fail_sensor_type != None:
                if 'standby_sensor_fail' in self.fail_sensor_type.keys():
                    if self.fail_sensor_type[
                            'standby_sensor_fail'] == True:
                        if sensor.status in ['na']:
                            # An 'na' reading is handled per sensor family:
                            # SOC* sensors, SSD* sensors, everything else.
                            if re.match(r'SOC', sensor.name) != None:
                                if 'server_sensor_fail' in self.fail_sensor_type.keys(
                                ):
                                    if self.fail_sensor_type[
                                            'server_sensor_fail'] == True:
                                        ret = fsc_board.get_power_status(
                                            board)
                                        if ret:
                                            Logger.debug(
                                                "Server Sensor Fail")
                                            outmin = self.boost
                                            break
                            elif re.match(r'SSD', sensor.name) != None:
                                if 'SSD_sensor_fail' in self.fail_sensor_type.keys(
                                ):
                                    if self.fail_sensor_type[
                                            'SSD_sensor_fail'] == True:
                                        # Counted here, applied after the
                                        # expression has been evaluated.
                                        fail_ssd_count = fail_ssd_count + 1
                            else:
                                Logger.debug("Standby Sensor Fail")
                                outmin = self.boost
                                break
        else:
            missing.add(v)
            # evaluation tries to ignore the effects of None values
            # (e.g. acts as 0 in max/+)
            ctx[v] = None
    if missing:
        Logger.warn('Missing sensors: %s' % (', '.join(missing), ))
    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if not exprout:
        # The flag makes ASSERT/DEASSERT log only on state transitions.
        if not self.transitional_assert_flag:
            Logger.crit('ASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.' % (self.counter))
        exprout = self.transitional
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit('DEASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.' % (self.counter))
        self.transitional_assert_flag = False
    if self.fail_sensor_type != None:
        if 'SSD_sensor_fail' in self.fail_sensor_type.keys():
            if self.fail_sensor_type['SSD_sensor_fail'] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if 'offset_algorithm' in self.ssd_progressive_algorithm.keys(
                        ):
                            # Each entry is indexed as [count limit, offset];
                            # the first entry whose limit covers the failed
                            # SSD count adds its offset. If none does, the
                            # last iteration forces boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm[
                                    'offset_algorithm']:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    break
                                else:
                                    if list_index == len(
                                            self.ssd_progressive_algorithm[
                                                'offset_algorithm']):
                                        outmin = self.boost
    if exprout < outmin:
        exprout = outmin
    exprout = clamp(exprout, 0, 100)
    return exprout
def get_fsc_config(self, fsc_config):
    """Load the JSON configuration stored at ``fsc_config``.

    Returns:
        The decoded JSON document, or ``None`` when ``fsc_config`` is not
        an existing regular file.
    """
    if not os.path.isfile(fsc_config):
        return None

    Logger.info("Started, reading configuration from %s" % (fsc_config))
    with open(fsc_config, 'r') as config_file:
        return json.load(config_file)
def update_zones(self, dead_fans, time_difference):
    """
    Read the sensors once, then compute and apply a PWM for every zone.

    Args:
        dead_fans: collection of fans currently considered dead.
        time_difference: passed to ``zone.run`` as ``dt``.

    TODO: Need to change logic here.
    # Platforms with chassis_intrusion mode enabled
    if chassis_intrusion:
        set the chassis_intrusion_boost_flag to 0
        and then do necessary checks to set flag to 1
        if chassis_intrusion_boost_flag:
            run boost mode
        else:
            run normal mode
    else
        # Platforms WITHOUT chassis_intrusion mode
        run normal mode
    """
    sensors_tuples = self.machine.read_sensors(self.sensors)
    self.fsc_safe_guards(sensors_tuples)
    for zone in self.zones:
        Logger.info("PWM: %s" % (json.dumps(zone.pwm_output)))
        chassis_intrusion_boost_flag = 0
        if self.chassis_intrusion:
            self_tray_pull_out = board_callout(
                callout='chassis_intrusion')
            if self_tray_pull_out == 1:
                chassis_intrusion_boost_flag = 1
        # Normal evaluation unless chassis intrusion forces boost.
        if chassis_intrusion_boost_flag == 0:
            pwmval = zone.run(sensors=sensors_tuples, dt=time_difference)
        else:
            pwmval = self.boost
        if self.fan_fail:
            if self.boost_type == 'progressive' and self.fan_dead_boost:
                # Cases where we want to progressively bump PWMs
                dead = len(dead_fans)
                if dead > 0:
                    Logger.info("Progressive mode: Failed fans: %s" %
                                (', '.join([str(i.label) for i in dead_fans],)))
                    for fan_count, rate in self.fan_dead_boost["data"]:
                        if dead <= fan_count:
                            pwmval = clamp(pwmval + (dead * rate), 0, 100)
                            break
                    else:
                        # for/else: no table entry covers this many dead
                        # fans, so fall back to full boost.
                        pwmval = self.boost
            else:
                if dead_fans:
                    # If not progressive ,when there is 1 fan failed, boost all fans
                    Logger.info("Failed fans: %s" % (
                        ', '.join([str(i.label) for i in dead_fans],)))
                    pwmval = self.boost
            if self.fan_dead_boost:
                # If all the fans failed take action after a few cycles
                if len(dead_fans) == len(self.fans):
                    self.all_fan_fail_counter = self.all_fan_fail_counter + 1
                    Logger.warn("Currently all fans failed for {} cycles".format(self.all_fan_fail_counter))
                    if self.fan_dead_boost["threshold"] and self.fan_dead_boost["action"]:
                        if self.all_fan_fail_counter >= self.fan_dead_boost["threshold"]:
                            self.fsc_host_action(
                                action=self.fan_dead_boost["action"],
                                cause="All fans are bad for more than " +
                                str(self.fan_dead_boost["threshold"]) +
                                " cycles"
                            )
                else:
                    # If atleast 1 fan is working reset the counter
                    self.all_fan_fail_counter = 0
        # Limit the change from the previous PWM to ramp_rate per cycle.
        if abs(zone.last_pwm - pwmval) > self.ramp_rate:
            if pwmval < zone.last_pwm:
                pwmval = zone.last_pwm - self.ramp_rate
            else:
                pwmval = zone.last_pwm + self.ramp_rate
        zone.last_pwm = pwmval
        # pwm_output may be a single output or an iterable of outputs.
        if hasattr(zone.pwm_output, '__iter__'):
            for output in zone.pwm_output:
                self.machine.set_pwm(self.fans.get(
                    str(output)), pwmval)
        else:
            self.machine.set_pwm(self.fans[zone.pwm_output], pwmval)
def run(self, sensors, dt):
    """Evaluate this zone's expression and return a PWM clamped to 0..100.

    Besides computing the PWM this method maintains marker files under
    ``SENSOR_FAIL_RECORD_DIR`` / ``RECORD_DIR`` and writes the resulting
    fan mode through ``self.get_set_fan_mode``.

    Args:
        sensors: mapping indexed as ``sensors[board][sname]`` whose values
            expose ``value``, ``status`` and ``name``.
        dt: stored in the evaluation context under ``"dt"``.

    Returns:
        The expression result, raised to a minimum (transitional or boost
        value) when sensor problems were seen, then clamped to 0..100.
    """
    ctx = {"dt": dt}
    # Lower bound applied to the expression result at the end.
    outmin = 0
    fail_ssd_count = 0
    # Position of the current variable in the per-sensor state lists.
    sensor_index = 0
    cause_boost_count = 0
    no_sane_flag = 0
    mode = 0
    for v in self.expr_meta["ext_vars"]:
        sensor_valid_flag = 1
        # External variables are colon separated: board first, then name.
        sdata = v.split(":")
        board = sdata[0]
        sname = sdata[1]
        if self.sensor_valid_check != None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) != None:
                    self.sensor_valid_cur[
                        sensor_index] = fsc_board.sensor_valid_check(
                            board,
                            sname,
                            check_name,
                            self.sensor_valid_check[check_name]
                            ["attribute"],
                        )
                    # If current or previous sensor valid status is 0, ignore this sensor reading.
                    # Only when both are 1, goes to sensor check process
                    if (self.sensor_valid_cur[sensor_index]
                            == 0) or (self.sensor_valid_pre[sensor_index]
                                      == 0):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check is applied.
                    break
        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit("DEASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = False
                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                if sensor.status in ["ucr"]:
                    Logger.warn("Sensor %s reporting status %s" %
                                (sensor.name, sensor.status))
                    outmin = max(outmin, self.transitional)
                    if outmin == self.transitional:
                        mode = fan_mode["trans_mode"]
                else:
                    if self.sensor_fail == True:
                        sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                        if not os.path.isdir(SENSOR_FAIL_RECORD_DIR):
                            os.mkdir(SENSOR_FAIL_RECORD_DIR)
                        # NOTE(review): -1 presumably means "validity
                        # unknown" - confirm against sensor_valid_check.
                        if (sensor.status in [
                                "na"
                        ]) and (self.sensor_valid_cur[sensor_index] != -1):
                            if re.match(r"SSD", sensor.name) != None:
                                # Counted here, applied after evaluation.
                                fail_ssd_count = fail_ssd_count + 1
                            else:
                                Logger.warn("%s Fail" % v)
                                outmin = max(outmin, self.boost)
                                cause_boost_count += 1
                            # Empty marker file for the failed sensor.
                            if not os.path.isfile(sensor_fail_record_path):
                                sensor_fail_record = open(
                                    sensor_fail_record_path, "w")
                                sensor_fail_record.close()
                            if outmin == self.boost:
                                mode = fan_mode["boost_mode"]
                        else:
                            if os.path.isfile(sensor_fail_record_path):
                                os.remove(sensor_fail_record_path)
            else:
                # A missing sensor is only asserted once its retry counter
                # has reached 2.
                if (not self.missing_sensor_assert_flag[sensor_index]
                    ) and (self.missing_sensor_assert_retry[sensor_index]
                           >= 2):
                    Logger.crit("ASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = True
                if self.missing_sensor_assert_retry[sensor_index] < 2:
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None
        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[
            sensor_index]
        sensor_index += 1
    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        # The flag makes ASSERT/DEASSERT log only on state transitions.
        if not self.transitional_assert_flag:
            Logger.crit("ASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed." % (self.counter))
        exprout = self.transitional
        mode = fan_mode["trans_mode"]
        no_sane_flag = 1
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit("DEASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed." % (self.counter))
        self.transitional_assert_flag = False
    if self.fail_sensor_type != None:
        if "SSD_sensor_fail" in list(self.fail_sensor_type.keys()):
            if self.fail_sensor_type["SSD_sensor_fail"] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if "offset_algorithm" in list(
                                self.ssd_progressive_algorithm.keys()):
                            # Each entry is indexed as [count limit, offset];
                            # the first entry whose limit covers the failed
                            # SSD count adds its offset. If none does, the
                            # last iteration forces boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm[
                                    "offset_algorithm"]:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    no_sane_flag = 0
                                    break
                                else:
                                    if list_index == len(
                                            self.ssd_progressive_algorithm[
                                                "offset_algorithm"]):
                                        outmin = max(outmin, self.boost)
                                        cause_boost_count += 1
                                        if outmin == self.boost:
                                            mode = fan_mode["boost_mode"]
    # Marker file present exactly while a sensor failure causes boost.
    boost_record_path = RECORD_DIR + "sensor_fail_boost"
    if cause_boost_count != 0:
        if not os.path.isfile(boost_record_path):
            sensor_fail_boost_record = open(boost_record_path, "w")
            sensor_fail_boost_record.close()
    else:
        if os.path.isfile(boost_record_path):
            os.remove(boost_record_path)
    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    else:
        if no_sane_flag != 1:
            mode = fan_mode["normal_mode"]
    self.get_set_fan_mode(mode, action="write")
    exprout = clamp(exprout, 0, 100)
    return exprout
def update_zones(self, dead_fans, time_difference):
    """
    Compute and apply a PWM and a fan mode for every zone.

    Sensors are read once for all zones, or once per zone (filtered by
    ``zone.expr_meta``) when ``self.sensor_filter_all`` is set.

    Args:
        dead_fans: collection of fans currently considered dead.
        time_difference: stored in the zone context under ``"dt"``.

    TODO: Need to change logic here.
    # Platforms with chassis_intrusion mode enabled
    if chassis_intrusion:
        set the chassis_intrusion_boost_flag to 0
        and then do necessary checks to set flag to 1
        if chassis_intrusion_boost_flag:
            run boost mode
        else:
            run normal mode
    else
        # Platforms WITHOUT chassis_intrusion mode
        run normal mode

    # Platforms with enable_fsc_sensor_check mode enabled
    if enable_fsc_sensor_check:
        set the sensor_violated_flag to 0
        and then do necessary checks to set flag to 1
        if sensor_violated_flag:
            run boost mode
        else:
            run normal mode
    else
        # Platforms WITHOUT enable_fsc_sensor_check mode
        run normal mode
    """
    # One context dict is reused (and overwritten) for every zone.
    ctx = {}
    if not self.sensor_filter_all:
        sensors_tuples = self.machine.read_sensors(self.sensors, None)
        self.fsc_safe_guards(sensors_tuples)
    for zone in self.zones:
        if self.sensor_filter_all:
            sensors_tuples = self.machine.read_sensors(
                self.sensors, zone.expr_meta)
            self.fsc_safe_guards(sensors_tuples)
        Logger.info("PWM: %s" % (json.dumps(zone.pwm_output)))
        mode = 0
        chassis_intrusion_boost_flag = 0
        sensor_violated_flag = 0
        if self.chassis_intrusion:
            self_tray_pull_out = board_callout(callout="chassis_intrusion")
            if self_tray_pull_out == 1:
                chassis_intrusion_boost_flag = 1
        if self.enable_fsc_sensor_check:
            Logger.info("enable_fsc_sensor_check")
            if self.fsc_sensor_check(sensors_tuples) != 0:
                sensor_violated_flag = 1
        Logger.debug(" dead_fans(%d) " % len(dead_fans))
        Logger.debug("Calculate")
        # Normal evaluation unless either check above forces boost.
        if chassis_intrusion_boost_flag == 0 and sensor_violated_flag == 0:
            ctx["dt"] = time_difference
            ctx["dead_fans"] = dead_fans
            ctx["last_pwm"] = zone.last_pwm
            ignore_fan_mode = False
            if self.non_fanfail_limited_boost and dead_fans:
                ignore_fan_mode = True
            pwmval = zone.run(sensors=sensors_tuples,
                              ctx=ctx,
                              ignore_mode=ignore_fan_mode)
            mode = zone.get_set_fan_mode(mode, action="read")
            # if we set pwm_sensor_boost_value option, assign it to pwmval
            if self.pwm_sensor_boost_value != None and \
                    int(mode) == fan_mode["boost_mode"]:
                if pwmval == self.boost:
                    pwmval = self.pwm_sensor_boost_value
        else:
            pwmval = self.boost
            mode = fan_mode["boost_mode"]
        if self.fan_fail:
            # Marker file present exactly while a fan failure causes boost.
            boost_record_path = RECORD_DIR + "fan_fail_boost"
            if self.boost_type == "progressive" and self.fan_dead_boost:
                # Cases where we want to progressively bump PWMs
                dead = len(dead_fans)
                if dead > 0:
                    Logger.info(
                        "Progressive mode: Failed fans: %s" %
                        (", ".join([str(i.label) for i in dead_fans])))
                    for fan_count, rate in self.fan_dead_boost["data"]:
                        if dead <= fan_count:
                            pwmval = clamp(pwmval + (dead * rate), 0, 100)
                            mode = fan_mode["normal_mode"]
                            if os.path.isfile(boost_record_path):
                                os.remove(boost_record_path)
                            break
                    else:
                        # for/else: no table entry covers this many dead
                        # fans, so fall back to full boost.
                        pwmval = self.boost
                        mode = fan_mode["boost_mode"]
                        if not os.path.isfile(boost_record_path):
                            fan_fail_boost_record = open(
                                boost_record_path, "w")
                            fan_fail_boost_record.close()
                else:
                    if os.path.isfile(boost_record_path):
                        os.remove(boost_record_path)
            else:
                if dead_fans:
                    # If not progressive ,when there is 1 fan failed, boost all fans
                    Logger.info(
                        "Failed fans: %s" %
                        (", ".join([str(i.label) for i in dead_fans])))
                    # choose the higher PWM
                    if self.output_max_boost_pwm:
                        pwmval = self.boost if pwmval < self.boost else pwmval
                    else:
                        pwmval = self.boost
                    mode = fan_mode["boost_mode"]
                    if not os.path.isfile(boost_record_path):
                        fan_fail_boost_record = open(
                            boost_record_path, "w")
                        fan_fail_boost_record.close()
                else:
                    if os.path.isfile(boost_record_path):
                        os.remove(boost_record_path)
            if self.fan_dead_boost:
                # If all the fans failed take action after a few cycles
                if len(dead_fans) == len(self.fans):
                    self.all_fan_fail_counter = self.all_fan_fail_counter + 1
                    Logger.warn(
                        "Currently all fans failed for {} cycles".
                        format(self.all_fan_fail_counter))
                    if (self.fan_dead_boost["threshold"]
                            and self.fan_dead_boost["action"]):
                        if (self.all_fan_fail_counter >=
                                self.fan_dead_boost["threshold"]):
                            self.fsc_host_action(
                                action=self.fan_dead_boost["action"],
                                cause="All fans are bad for more than " +
                                str(self.fan_dead_boost["threshold"]) +
                                " cycles",
                            )
                else:
                    # If atleast 1 fan is working reset the counter
                    self.all_fan_fail_counter = 0
        # Optional absolute clamps; a falsy limit (None or 0) is skipped.
        if self.fan_limit_upper_pwm:
            if pwmval > self.fan_limit_upper_pwm:
                pwmval = self.fan_limit_upper_pwm
        if self.fan_limit_lower_pwm:
            if pwmval < self.fan_limit_lower_pwm:
                pwmval = self.fan_limit_lower_pwm
        # if no fan fail, the max of pwm is non_fanfail_limited_boost pwm:
        if self.non_fanfail_limited_boost and not dead_fans:
            pwmval = clamp(pwmval, 0, self.non_fanfail_limited_boost)
        # Limit the change from the previous PWM to ramp_rate per cycle.
        if abs(zone.last_pwm - pwmval) > self.ramp_rate:
            if pwmval < zone.last_pwm:
                pwmval = zone.last_pwm - self.ramp_rate
            else:
                pwmval = zone.last_pwm + self.ramp_rate
        zone.last_pwm = pwmval
        # pwm_output may be a single output or an iterable of outputs.
        if hasattr(zone.pwm_output, "__iter__"):
            for output in zone.pwm_output:
                self.machine.set_pwm(self.fans.get(str(output)), pwmval)
        else:
            self.machine.set_pwm(self.fans[zone.pwm_output], pwmval)
        zone.get_set_fan_mode(mode, action="write")
def run(self, sensors, ctx, ignore_mode):
    """Evaluate the zone expression for one cycle and return the PWM value.

    Each entry of ``self.expr_meta["ext_vars"]`` is a string of the form
    ``"<board>:<sensor name>"``.  For every entry that passes the optional
    validity check, the current reading is copied into ``ctx`` (``None``
    when the sensor is absent from ``sensors``).  While scanning, a floor
    ``outmin`` is raised to ``self.transitional`` for sensors in ``ucr``
    state and to ``self.boost`` for failed non-M.2 sensors, the per-sensor
    and zone-wide fail-record files are created or removed, and the fan
    mode to report is selected.

    Args:
        sensors: Nested mapping indexed as ``sensors[board][sname]``; each
            entry exposes ``value``, ``status`` and ``name``.
        ctx: Mutable mapping passed to the expression evaluator; updated
            in place with one entry per readable external variable.
        ignore_mode: When false, the selected mode is written through
            ``self.get_set_fan_mode(mode, action="write")``.

    Returns:
        ``clamp(exprout, 0, 100)``, where ``exprout`` is the expression
        result (plus any progressive offset) raised to at least ``outmin``.
    """
    outmin = 0
    fail_ssd_count = 0
    valid_m2_count = 0
    cause_boost_count = 0
    no_sane_flag = 0
    display_progressive_flag = 0
    mode = 0

    for sensor_index, v in enumerate(self.expr_meta["ext_vars"]):
        sensor_valid_flag = 1
        sdata = v.split(":")
        board = sdata[0]
        sname = sdata[1]
        if self.sensor_valid_check is not None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) is not None:
                    self.sensor_valid_cur[sensor_index] = (
                        fsc_board.sensor_valid_check(
                            board,
                            sname,
                            check_name,
                            self.sensor_valid_check[check_name]["attribute"],
                        )
                    )
                    # If current or previous sensor valid status is 0,
                    # ignore this sensor reading.  Only when both are 1
                    # does the sensor go through the checks below.
                    if (self.sensor_valid_cur[sensor_index] == 0) or (
                        self.sensor_valid_pre[sensor_index] == 0
                    ):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check pattern is applied.
                    break

        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit(
                        "DEASSERT: Zone%d Missing sensors: %s" % (self.counter, v)
                    )
                    self.missing_sensor_assert_flag[sensor_index] = False

                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                if re.match(r".*temp_dev", sname) is not None:
                    valid_m2_count += 1

                if sensor.status in ["ucr"]:
                    Logger.warn(
                        "Sensor %s reporting status %s"
                        % (sensor.name, sensor.status)
                    )
                    outmin = max(outmin, self.transitional)
                    if outmin == self.transitional:
                        mode = fan_mode["trans_mode"]
                elif self.sensor_fail == True:
                    sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                    if not os.path.isdir(SENSOR_FAIL_RECORD_DIR):
                        os.mkdir(SENSOR_FAIL_RECORD_DIR)
                    if (sensor.status in ["na"]) and (
                        self.sensor_valid_cur[sensor_index] != -1
                    ):
                        if (re.match(r"SSD", sensor.name) is not None) or (
                            re.match(r".*temp_dev", sname) is not None
                        ):
                            # M.2/SSD read failures are only counted here;
                            # the progressive logic below decides how to
                            # react to them.
                            fail_ssd_count += 1
                            Logger.warn("M.2 Device %s Fail" % v)
                        else:
                            Logger.warn("%s Fail" % v)
                            outmin = max(outmin, self.boost)
                            cause_boost_count += 1
                        if not os.path.isfile(sensor_fail_record_path):
                            # Create an empty marker file for this sensor.
                            with open(sensor_fail_record_path, "w"):
                                pass
                        if outmin == self.boost:
                            mode = fan_mode["boost_mode"]
                    elif os.path.isfile(sensor_fail_record_path):
                        os.remove(sensor_fail_record_path)
            else:
                if (not self.missing_sensor_assert_flag[sensor_index]) and (
                    self.missing_sensor_assert_retry[sensor_index] >= 2
                ):
                    Logger.crit(
                        "ASSERT: Zone%d Missing sensors: %s" % (self.counter, v)
                    )
                    self.missing_sensor_assert_flag[sensor_index] = True
                if self.missing_sensor_assert_retry[sensor_index] < 2:
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None
        elif sname in sensors[board]:
            # Reading ignored by the validity check: drop any stale
            # per-sensor fail record.
            if self.sensor_fail == True:
                sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                if os.path.isfile(sensor_fail_record_path):
                    os.remove(sensor_fail_record_path)

        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[sensor_index]

    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval_driver(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))

    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        if not self.transitional_assert_flag:
            Logger.crit(
                "ASSERT: Zone%d No sane fan speed could be calculated! Using transitional speed."
                % (self.counter)
            )
        exprout = self.transitional
        mode = fan_mode["trans_mode"]
        no_sane_flag = 1
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit(
                "DEASSERT: Zone%d No sane fan speed could be calculated! Using transitional speed."
                % (self.counter)
            )
        self.transitional_assert_flag = False

    if self.fail_sensor_type is not None:
        progressive_mode = True
        if ("M2_sensor_fail" in self.fail_sensor_type) and (
            "M2_sensor_count" in self.fail_sensor_type
        ):
            if (self.fail_sensor_type["M2_sensor_fail"] == True) and (
                self.fail_sensor_type["M2_sensor_count"] > 0
            ):
                if valid_m2_count == 0:
                    if fsc_board.all_slots_power_off() == False:
                        # Missing all module (no M.2 device)
                        outmin = max(outmin, self.boost)
                        cause_boost_count += 1
                        mode = fan_mode["boost_mode"]
                    # Otherwise all slots are powered off: do not boost.
                    progressive_mode = False
                elif valid_m2_count != self.fail_sensor_type["M2_sensor_count"]:
                    # Missing some module (M.2 devices partially populated)
                    progressive_mode = False
                    cause_boost_count += 1
                elif cause_boost_count != 0:
                    # M.2 devices fully populated, but there are other
                    # boost reasons: e.g. other sensors (not M.2 devices'
                    # sensors) fail to read
                    progressive_mode = False
                elif fail_ssd_count != 0:
                    # M.2 devices progressive mode: handle M.2
                    # devices/SSD fail to read case
                    cause_boost_count += 1  # show out sensor fail record
                    display_progressive_flag = 1  # do not override by normal mode
                    mode = fan_mode["progressive_mode"]
                else:
                    # M.2 devices normal mode
                    progressive_mode = False

        if progressive_mode and ("SSD_sensor_fail" in self.fail_sensor_type):
            if (
                self.fail_sensor_type["SSD_sensor_fail"] == True
                and fail_ssd_count != 0
                and self.ssd_progressive_algorithm is not None
                and "offset_algorithm" in self.ssd_progressive_algorithm
            ):
                offset_table = self.ssd_progressive_algorithm["offset_algorithm"]
                for entry in offset_table:
                    if fail_ssd_count <= entry[0]:
                        # The expression may have evaluated to None while
                        # outmin is non-zero; treat that as 0 instead of
                        # raising TypeError on None + offset.
                        exprout = (exprout or 0) + entry[1]
                        no_sane_flag = 0
                        break
                else:
                    # No table entry covers this many failures.  An empty
                    # table triggers nothing, as before.
                    if offset_table:
                        outmin = max(outmin, self.boost)
                        cause_boost_count += 1
                        if outmin == self.boost:
                            mode = fan_mode["boost_mode"]

    boost_record_path = RECORD_DIR + "sensor_fail_boost"
    if cause_boost_count != 0:
        if not os.path.isfile(boost_record_path):
            # Create an empty zone-wide marker file.
            with open(boost_record_path, "w"):
                pass
    elif os.path.isfile(boost_record_path):
        os.remove(boost_record_path)

    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    elif (no_sane_flag != 1) and (display_progressive_flag != 1):
        mode = fan_mode["normal_mode"]
    if not ignore_mode:
        self.get_set_fan_mode(mode, action="write")
    exprout = clamp(exprout, 0, 100)
    return exprout
def run(self, sensors, dt):
    """Evaluate the zone expression for one cycle and return the PWM value.

    Each entry of ``self.expr_meta['ext_vars']`` is a ``"<board>:<sensor
    name>"`` string.  For every entry that passes the optional validity
    check, the current reading is placed in the evaluation context
    (``None`` when the sensor is absent from ``sensors``).  A floor
    ``outmin`` is raised to ``self.transitional`` for sensors in ``ucr``
    state and to ``self.boost`` for failed sensors; failed SSD/NVMe
    sensors are counted and handled by the progressive offset table.

    Args:
        sensors: Nested mapping indexed as ``sensors[board][sname]``; each
            entry exposes ``value``, ``status`` and ``name``.
        dt: Value stored under the ``'dt'`` key of the evaluation context.

    Returns:
        ``clamp(exprout, 0, 100)``, where ``exprout`` is the expression
        result (plus any progressive offset) raised to at least ``outmin``.
    """
    ctx = {'dt': dt}
    outmin = 0
    fail_ssd_count = 0

    for sensor_index, v in enumerate(self.expr_meta['ext_vars']):
        sensor_valid_flag = 1
        board, sname = v.split(":")
        if self.sensor_valid_check is not None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) is not None:
                    self.sensor_valid_cur[sensor_index] = (
                        fsc_board.sensor_valid_check(
                            board,
                            sname,
                            check_name,
                            self.sensor_valid_check[check_name]["attribute"],
                        )
                    )
                    # If current or previous sensor valid status is 0,
                    # ignore this sensor reading.  Only when both are 1
                    # does the sensor go through the checks below.
                    if (self.sensor_valid_cur[sensor_index] == 0) or (
                        self.sensor_valid_pre[sensor_index] == 0
                    ):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check pattern is applied.
                    break

        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit(
                        'DEASSERT: Zone%d Missing sensors: %s' % (self.counter, v)
                    )
                    self.missing_sensor_assert_flag[sensor_index] = False

                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                if sensor.status in ['ucr']:
                    Logger.warn(
                        'Sensor %s reporting status %s'
                        % (sensor.name, sensor.status)
                    )
                    outmin = max(outmin, self.transitional)
                elif self.sensor_fail == True:
                    if (sensor.status in ['na']) and (
                        self.sensor_valid_cur[sensor_index] != -1
                    ):
                        if re.match(
                            r'.+_C[2-4]_[0-3]_NVME_.+', sensor.name
                        ) is not None:
                            Logger.warn("%s Fail" % v)
                            outmin = max(outmin, self.boost)
                        elif (
                            re.match(r'SSD', sensor.name) is not None
                            or re.match(r'(.*)nvme(.*)', sname) is not None
                        ):
                            # Only counted here; the progressive offset
                            # table below decides how to react.
                            fail_ssd_count += 1
                        else:
                            Logger.warn("%s Fail" % v)
                            outmin = max(outmin, self.boost)
            else:
                if (not self.missing_sensor_assert_flag[sensor_index]) and (
                    self.missing_sensor_assert_retry[sensor_index] >= 2
                ):
                    Logger.crit(
                        'ASSERT: Zone%d Missing sensors: %s' % (self.counter, v)
                    )
                    self.missing_sensor_assert_flag[sensor_index] = True
                if self.missing_sensor_assert_retry[sensor_index] < 2:
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None

        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[sensor_index]

    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))

    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        if not self.transitional_assert_flag:
            # The message is kept on one logical string: a backslash
            # continuation inside the literal would leak source
            # indentation into the log line.
            Logger.crit(
                'ASSERT: Zone%d No sane fan speed could be '
                'calculated! Using transitional speed.' % (self.counter)
            )
        exprout = self.transitional
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit(
                'DEASSERT: Zone%d No sane fan speed could be '
                'calculated! Using transitional speed.' % (self.counter)
            )
        self.transitional_assert_flag = False

    if (
        self.fail_sensor_type is not None
        and 'SSD_sensor_fail' in self.fail_sensor_type
        and self.fail_sensor_type['SSD_sensor_fail'] == True
        and fail_ssd_count != 0
        and self.ssd_progressive_algorithm is not None
        and 'offset_algorithm' in self.ssd_progressive_algorithm
    ):
        offset_table = self.ssd_progressive_algorithm['offset_algorithm']
        for entry in offset_table:
            if fail_ssd_count <= entry[0]:
                # The expression may have evaluated to None while outmin
                # is non-zero; treat that as 0 instead of raising
                # TypeError on None + offset.
                exprout = (exprout or 0) + entry[1]
                break
        else:
            # No table entry covers this many failures.  An empty table
            # triggers nothing, as before.
            if offset_table:
                outmin = max(outmin, self.boost)

    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    exprout = clamp(exprout, 0, 100)
    return exprout