def main(resource: DynamicFidServiceRA, action: str = '') -> int:
    """
    Main function acts as a switch case for the DynamicFidServiceRA resource agent.

    Args:
        resource (DynamicFidServiceRA): Resource agent.
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Return code expected by Pacemaker.
    """
    try:
        if action == "meta-data":
            return resource.metadata()
        ConfigManager.init("resource_agent")
        Log.debug(f"{resource} initialized for action {action}")
        if action == "monitor":
            return resource.monitor()
        elif action == "start":
            return resource.start()
        elif action == "stop":
            return resource.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            exit(0)
    except Exception as e:
        Log.error(f"systemd_fid_wrapper_ra failed to perform {action}. Error: {e}")
        return const.OCF_ERR_GENERIC
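# A minimal sketch (an assumption, not taken from this file) of how such an
# entry point is typically wired for Pacemaker, which passes the action
# (monitor/start/stop/meta-data) as the first command-line argument. The
# no-argument DynamicFidServiceRA() construction is also an assumption.
if __name__ == '__main__':
    requested_action = sys.argv[1] if len(sys.argv) > 1 else ''
    sys.exit(main(DynamicFidServiceRA(), requested_action))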
def main(argv: list):
    try:
        if len(sys.argv) == 1:
            Cmd.usage("ha_setup")
            sys.exit(1)
        if sys.argv[1] == "cleanup":
            if not os.path.exists(const.HA_CONFIG_FILE):
                a_str = (f'Cleanup cannot proceed as HA config file: '
                         f'{const.HA_CONFIG_FILE} is missing. Either cleanup is '
                         f'already done or there is some other problem.')
                sys.stdout.write(a_str)
                return 0
            ConfigManager.init("ha_setup")
        desc = "HA Setup command"
        command = Cmd.get_command(desc, argv[1:])
        command.process()
    except Exception as err:
        Log.error("%s\n" % traceback.format_exc())
        sys.stderr.write(
            f"Setup command:{argv[1]} failed for cortx-ha. Error: {err}\n")
        return errno.EINVAL
def __init__(self): """ Init method """ self.crm_env = None # Loads Alert event filter rules in the configuration ConfigManager.load_alert_events_rules()
def main(argv: list): try: if sys.argv[1] == "post_install": Conf.init(delim='.') Conf.load(const.HA_GLOBAL_INDEX, f"yaml://{const.SOURCE_CONFIG_FILE}") log_path = Conf.get(const.HA_GLOBAL_INDEX, "LOG.path") log_level = Conf.get(const.HA_GLOBAL_INDEX, "LOG.level") Log.init(service_name='ha_setup', log_path=log_path, level=log_level) else: ConfigManager.init("ha_setup") desc = "HA Setup command" command = Cmd.get_command(desc, argv[1:]) command.process() sys.stdout.write( f"Mini Provisioning {sys.argv[1]} configured successfully.\n") except Exception as err: Log.error("%s\n" % traceback.format_exc()) sys.stderr.write( f"Setup command:{argv[1]} failed for cortx-ha. Error: {err}\n") return errno.EINVAL
def main(action: str = '') -> int:
    """
    Main function acts as a switch case for the IPHealthChecker resource agent.

    Args:
        action (str): Resource agent action called by Pacemaker. Defaults to ''.

    Returns:
        int: Return code expected by Pacemaker.
    """
    try:
        if action == "meta-data":
            return VipHealthMonitor.metadata()
        ConfigManager.init("resource_agent")
        resource_agent = VipHealthMonitor()
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == "monitor":
            return resource_agent.monitor()
        elif action == "start":
            return resource_agent.start()
        elif action == "stop":
            return resource_agent.stop()
        else:
            print(f"Usage {sys.argv[0]} [monitor] [start] [stop] [meta-data]")
            exit(0)
    except Exception as e:
        Log.error(
            f"vip health check failed to perform {action}. "
            f"Error: {traceback.format_exc()} {e}")
        return const.OCF_ERR_GENERIC
def setUp(self): """ Setup the prerequisit of tests """ print("Setup") ConfigManager.init("test_Cluster_stop_sigterm") self.confstore = ConfigManager.get_confstore() MessageBus.init()
def __init__(self): """ Init method Create monitor objects and Sets the callbacks to sigterm """ signal.signal(signal.SIGTERM, self.set_sigterm) ConfigManager.init("fault_tolerance") self.node_fault_monitor = NodeFaultMonitor() self.cluster_stop_monitor = ClusterStopMonitor()
def init(self): """ Initalize EventAnalyzerService """ ConfigManager.init("event_analyzerd") Log.info("Event analyzer daemon initializations...") # Initialize system health confstore = ConfigManager.get_confstore() system_health = SystemHealth(confstore) # Initalize watcher self._watcher_list: dict = self._initalize_watcher(system_health)
def __init__(self, wait_time=10): """ Init method Create monitor objects and Sets the callbacks to sigterm """ try: # set sigterm handler signal.signal(signal.SIGTERM, self.set_sigterm) # Read I/O pod selector label from ha.conf . Will be received from provisioner confstore # provisioner needs to be informed to add it in confstore (to be added there ) ConfigManager.init("k8s_resource_monitor") _conf_stor_search = ConftStoreSearch() self.monitors = [] # event output in pretty format kwargs = {K8SClientConst.PRETTY: True} # Seting a timeout value, 'timout_seconds', for the stream. # timeout value for connection to the server # If do not set then we will not able to stop immediately, # becuase synchronus function watch.stream() will not come back # until catch any event on which it is waiting. kwargs[K8SClientConst. TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT # Get MessageBus producer object for all monitor threads producer = self._get_producer() # Change to multiprocessing # Creating NODE monitor object node_monitor = ObjectMonitor(producer, K8SClientConst.NODE, **kwargs) self.monitors.append(node_monitor) _, nodes_list = _conf_stor_search.get_cluster_cardinality() if not nodes_list: Log.warn( f"No nodes in the cluster to watch for nodes_list: {nodes_list}" ) else: Log.info(f"Starting watch for: nodes_list: {nodes_list}") watcher_node_ids = ', '.join(node_id for node_id in nodes_list) kwargs[ K8SClientConst. LABEL_SELECTOR] = f'cortx.io/machine-id in ({watcher_node_ids})' # Creating POD monitor object pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs) self.monitors.append(pod_monitor) except Exception as err: Log.error(f'Monitor failed to start watchers: {err}')
def _main() -> None:
    args = _parse_arguments()
    ConfigManager.init(log_name="pre_disruptive_upgrade",
                       log_path=RA_LOG_DIR,
                       level="INFO")
    Log.info("Script invoked as executable with params: {}".format(vars(args)))
    check_cluster_health()
    if args.backup_consul:
        backup_consul()
    backup_configuration()
    cluster_standby_mode()
    delete_resources()
def __init__(self): """ Init method """ #Loads alert flter rules in the configuration ConfigManager.load_filter_rules() #Get filter type and resource types list from the alert rule file self.filter_type = Conf.get(const.ALERT_FILTER_INDEX, "alert.filter_type") self.resource_types_list = Conf.get(const.ALERT_FILTER_INDEX, "alert.resource_type")
def __init__(self): """ Init alert monitor """ super(AlertMonitor, self).__init__() ConfigManager.init("alert_monitor") # get environment variables self.crm_env = self._get_env() alert_event_filter = AlertEventFilter() alert_event_filter.initialize_crm(self.crm_env) # Modules like Node, Resource, Fencing / Modules event like node became member or node lost self.alert_event_module, self.alert_event_type = alert_event_filter.filter_event()
def setUp(self):
    ConfigManager.init('resource_agent')
    self.ts = int(time.time())
    self.td = datetime.fromtimestamp(self.ts).strftime('%Y-%m-%dT%H:%M:%S.000000+0000')
    with open(const.RESOURCE_SCHEMA, 'r') as f:
        self.schema = json.load(f)
    self.hw_agent = HardwareResourceAgent(DecisionMonitor(), self.schema)
    self.key = f"cortx{const.HA_DELIM}base{const.HA_DELIM}ha{const.HA_DELIM}obj"
    self.filename = 'io_path_health_c1'
    self.path = 'io'
    self.local = self.schema['nodes']['local']
    self.consul = consul.Consul()
def __init__(self): """ Init method. """ super(ClusterResourceParser, self).__init__() ConfigManager.init("event_analyzer") self.cluster_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}cluster_id") self.site_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}site_id") self.rack_id = Conf.get(const.HA_GLOBAL_INDEX, f"COMMON_CONFIG{_DELIM}rack_id") Log.info("ClusterResource Parser is initialized ...")
def __init__(self, msg=None):
    """Init method."""
    ConfigManager.init('event_analyzer')
    self._confstore = ConfigManager.get_confstore()
    system_health = SystemHealth(self._confstore)
    self._cluster_resource_filter = ClusterResourceFilter()
    self._cluster_resource_parser = ClusterResourceParser()
    if self._cluster_resource_filter.filter_event(msg):
        health_event = self._cluster_resource_parser.parse_event(msg)
        try:
            system_health.process_event(health_event)
        except Exception as e:
            Log.error(f"Failed to process event. Error: {e}")
            raise SubscriberException(
                f"Failed to process event {str(health_event)}. Error: {e}")
def perform_post_upgrade(ios_instances=None, s3_instances=None, do_unstandby=False,
                         mgmt_info=None, node_count=None):
    """Starting routine for the post-upgrade process."""
    ConfigManager.init(log_name="post_disruptive_upgrade",
                       log_path=RA_LOG_DIR,
                       level="INFO")
    _check_for_any_resource_presence()
    _is_cluster_standby_on()
    _load_config()
    _create_resources(ios_instances, s3_instances, mgmt_info, node_count)
    if do_unstandby:
        _unstandby_cluster()
def __init__(self):
    self._conf_store = ConfigManager.get_confstore()
    self._machine_id = MachineId.get_machine_id()
    self._uuid = None
    self._is_resp_received = False
    self._encl_shutdown_successful = False
    self.timeout_reached = False
def __init__(self): """ Initalize PcsNodeController """ super(PcsNodeController, self).__init__() self._confstore = ConfigManager.get_confstore() self._system_health = SystemHealth(self._confstore)
def check_cluster_feasibility(self, node_id: str) -> dict:
    """
    Check whether the cluster will go offline if the node with node_id is stopped.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        Dictionary : {"status": "", "msg":""}
    """
    # Get the node_name (pvtfqdn) from node_id and raise an exception if node_id is not valid
    node_name = ConfigManager.get_node_name(node_id=node_id)
    node_list = self._get_node_list()
    offline_nodes = self._get_offline_nodes()
    Log.debug(f"nodelist : {node_list} offlinenodes : {offline_nodes}")
    num_nodes = len(node_list)
    max_nodes_offline = num_nodes // 2 if num_nodes % 2 == 1 else (num_nodes // 2) - 1
    if (len(offline_nodes) + 1) > max_nodes_offline:
        Log.debug(f"Stopping the node {node_name} will cause a loss of the quorum")
        return {
            "status": const.STATUSES.FAILED.value,
            "output": "",
            "error": "Stopping the node will cause a loss of the quorum"
        }
    else:
        Log.debug(f"Stopping the node {node_name} will not cause a loss of the quorum")
        return {
            "status": const.STATUSES.SUCCEEDED.value,
            "output": "",
            "error": ""
        }
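# Illustrative only: the quorum headroom formula used above, shown for a few small
# cluster sizes. The helper name is hypothetical; it simply repeats the arithmetic
# from check_cluster_feasibility().
def _max_nodes_offline(num_nodes: int) -> int:
    return num_nodes // 2 if num_nodes % 2 == 1 else (num_nodes // 2) - 1

assert _max_nodes_offline(3) == 1  # a 3-node cluster tolerates 1 node down
assert _max_nodes_offline(4) == 1  # a 4-node cluster also tolerates only 1
assert _max_nodes_offline(5) == 2  # a 5-node cluster tolerates 2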
def __init__(self): """ Initialize IPMI Fencing Agent class. """ super(IpmiFencingAgent, self).__init__() self._confstore = ConfigManager.get_confstore() self._execute = SimpleCommand()
def __init__(self): """ Initialize pcs controller """ super(PcsController, self).__init__() self._execute = SimpleCommand() self._confstore = ConfigManager.get_confstore()
def __init__(self, version="2.0", default_log_enable=True): """ Manage cluster operation """ self._version = version # TODO: Update Config manager if log utility changes.(reference EOS-17614) if default_log_enable is True: ConfigManager.init("cluster_manager") else: ConfigManager.init(None) self._cluster_type = Conf.get(const.HA_GLOBAL_INDEX, f"CLUSTER_MANAGER{_DELIM}cluster_type") self._env = Conf.get(const.HA_GLOBAL_INDEX, f"CLUSTER_MANAGER{_DELIM}env") self._confstore = ConfigManager.get_confstore() # Raise exception if user does not have proper permissions self._validate_permissions() ConfigManager.load_controller_schema() self._controllers = ElementControllerFactory.init_controller( self._env, self._cluster_type) for controller in self._controllers.keys(): Log.info(f"Adding {controller} property to cluster manager.") # Add property method for controller # Example: cm.cluster_controller.start() # Find more example in test case. self.__dict__[controller] = self._controllers[controller]
def __init__(self, args: dict):
    """Init method."""
    self._url = args.config
    Conf.load(self._index, self._url)
    self._args = args.args
    self._execute = SimpleCommand()
    self._confstore = ConfigManager.get_confstore()
    self._cluster_manager = None
def __init__(self, default_log_enable, singleton_check: bool = False):
    """Private constructor. Perform initialization for the Event Manager."""
    if singleton_check is False:
        raise Exception("Please use EventManager.get_instance() to fetch "
                        "the singleton instance of the class")
    if EventManager.__instance is None:
        EventManager.__instance = self
    else:
        raise Exception(
            "EventManager is a singleton class, use EventManager.get_instance().")
    if default_log_enable:
        ConfigManager.init(const.EVENT_MANAGER_LOG)
    self._confstore = ConfigManager.get_confstore()
    self._monitor_rule = MonitorRulesManager()
    self._default_action = HEALTH_MON_ACTIONS.PUBLISH_ACT.value
    MessageBus.init()
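# Hedged usage sketch: since the constructor raises unless singleton_check is True,
# callers are expected to go through the class-level accessor instead. The exact
# get_instance() signature is not shown in this file, so the no-argument call below
# is an assumption.
event_manager = EventManager.get_instance()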
def __init__(self, wait_time=10): """ Init method Create monitor objects and Sets the callbacks to sigterm """ # set sigterm handler signal.signal(signal.SIGTERM, self.set_sigterm) # Read I/O pod selector label from ha.conf . Will be received from provisioner confstore # provisioner needs to be informed to add it in confstore (to be added there ) ConfigManager.init("k8s_resource_monitor") self.monitors = [] # event output in pretty format kwargs = {K8SClientConst.PRETTY: True} # Seting a timeout value, 'timout_seconds', for the stream. # timeout value for connection to the server # If do not set then we will not able to stop immediately, # becuase synchronus function watch.stream() will not come back # until catch any event on which it is waiting. kwargs[K8SClientConst. TIMEOUT_SECONDS] = K8SClientConst.VAL_WATCH_TIMEOUT_DEFAULT # Get MessageBus producer object for all monitor threads producer = self._get_producer() # Change to multiprocessing # Creating NODE monitor object node_monitor = ObjectMonitor(producer, K8SClientConst.NODE, **kwargs) self.monitors.append(node_monitor) pod_labels = Conf.get(const.HA_GLOBAL_INDEX, "data_pod_label") pod_label_str = ', '.join(pod_label for pod_label in pod_labels) # TODO : Change 'name' field to 'app' in label_selector if required. kwargs[K8SClientConst.LABEL_SELECTOR] = f'name in ({pod_label_str})' # Creating POD monitor object pod_monitor = ObjectMonitor(producer, K8SClientConst.POD, **kwargs) self.monitors.append(pod_monitor)
def stop_cluster(self):
    """
    Set the cluster stop key in confstore so the k8s monitor is notified that
    cluster shutdown has started.
    """
    Log.info(f'The cluster stop message on message bus ({self._message_type}) is received.')
    confstore = ConfigManager.get_confstore()
    if not confstore.key_exists(const.CLUSTER_STOP_KEY):
        Log.info(f'Setting key {const.CLUSTER_STOP_KEY} to {const.CLUSTER_STOP_VAL_ENABLE} in confstore.')
        confstore.set(const.CLUSTER_STOP_KEY, const.CLUSTER_STOP_VAL_ENABLE)
    else:
        Log.info(f'Updating key {const.CLUSTER_STOP_KEY} to {const.CLUSTER_STOP_VAL_ENABLE} in confstore.')
        confstore.update(const.CLUSTER_STOP_KEY, const.CLUSTER_STOP_VAL_ENABLE)
def main(resource, action=''):
    try:
        if action == 'meta-data':
            return resource.metadata()
        ConfigManager.init(log_name='resource_agent')
        with open(const.RESOURCE_SCHEMA, 'r') as f:
            resource_schema = json.load(f)
        os.makedirs(const.RA_LOG_DIR, exist_ok=True)
        resource_agent = resource(DecisionMonitor(), resource_schema)
        Log.debug(f"{resource_agent} initialized for action {action}")
        if action == 'monitor':
            return resource_agent.monitor()
        elif action == 'start':
            return resource_agent.start()
        elif action == 'stop':
            return resource_agent.stop()
        else:
            print('Usage %s [monitor] [start] [stop] [meta-data]' % sys.argv[0])
            exit()
    except Exception as e:
        Log.error(f"{traceback.format_exc()}")
        return const.OCF_ERR_GENERIC
def stop(self, node_id: str, timeout: int = -1, **op_kwargs) -> dict:
    """
    Stop the node with node_id.

    Args:
        node_id (str): Node ID from cluster nodes.

    Returns:
        ([dict]): Return dictionary. {"status": "", "output": "", "error": ""}
            status: Succeeded, Failed, InProgress
    """
    check_cluster = op_kwargs.get("check_cluster") if op_kwargs.get("check_cluster") is not None else True
    # Get the node_name (pvtfqdn) from node_id and raise an exception if node_id is not valid
    node_name = ConfigManager.get_node_name(node_id=node_id)
    try:
        timeout = const.NODE_STOP_TIMEOUT if timeout < 0 else timeout
        node_status = self._system_health.get_node_status(node_id=node_id).get("status")
        if node_status == HEALTH_STATUSES.OFFLINE.value:
            Log.info(f"For stop node id {node_id}, node is already in offline state.")
            status = f"Node with node id {node_id} is already in offline state."
            return {
                "status": const.STATUSES.SUCCEEDED.value,
                "output": status,
                "error": ""
            }
        elif node_status == HEALTH_STATUSES.FAILED.value:
            # On a VM, if the node is powered off or disconnected, system health is updated with status FAILED.
            return {
                "status": const.STATUSES.FAILED.value,
                "output": "",
                "error": f"Node {node_id} status is {node_status}, node cannot be stopped."
            }
        else:
            if self.heal_resource(node_name):
                time.sleep(const.BASE_WAIT_TIME)
            if check_cluster:
                # Check whether the cluster will go offline if the node with node_name is stopped.
                res = json.loads(self.check_cluster_feasibility(node_id=node_id))
                if res.get("status") == const.STATUSES.FAILED.value:
                    return res
    except Exception as e:
        raise ClusterManagerError(f"Failed to stop node {node_id}, Error: {e}")
def __init__(self, singleton_check: bool = False):
    """
    Private constructor. Perform initialization for HealthMonitorService.

    Args:
        singleton_check (bool, optional): Create instance with get_instance. Defaults to False.
    """
    if singleton_check is False:
        raise Exception("Please use HealthMonitorService.get_instance() to fetch "
                        "the singleton instance of the class")
    if HealthMonitorService.__instance is None:
        HealthMonitorService.__instance = self
    else:
        raise Exception(
            "HealthMonitorService is a singleton class, use HealthMonitorService.get_instance().")
    # Initialize
    ConfigManager.init(const.HEALTH_MONITOR_LOG)
    # Set SIGTERM handler
    signal.signal(signal.SIGTERM, self.set_sigterm)
    self._confstore = ConfigManager.get_confstore()
    self._rule_manager = MonitorRulesManager()
    self._event_consumer = self._get_consumer()
def get_installation_type(self):
    hw_type = ConfigManager.get_hw_env()
    if hw_type is not None:
        install_type = hw_type.lower()
    else:
        Log.error("Error: Cannot fetch h/w env from Config.")
        raise HaConfigException("h/w env not present in config.")
    nodes = self.get_nodelist(fetch_from=Cmd.HA_CONFSTORE)
    if len(nodes) == 1 and install_type == const.INSTALLATION_TYPE.VM:
        install_type = const.INSTALLATION_TYPE.SINGLE_VM
    Log.info(f"Nodes count = {len(nodes)}, Install type = {install_type}")
    return install_type