def get_machine_id():
    execute = SimpleCommand()
    command = "cat /etc/machine-id"
    machine_id, err, rc = execute.run_cmd(command, check_error=True)
    Log.info(f"Read machine-id. Output: {machine_id}, Err: {err}, RC: {rc}")
    return machine_id.strip()

def __init__(self):
    """
    Initialize DynamicFidServiceRA class.
    """
    super(DynamicFidServiceRA, self).__init__()
    self._execute = SimpleCommand()
    self._status_list: list = ["failed", "active", "unknown"]

def __init__(self):
    """
    Initialize pcs controller.
    """
    super(PcsController, self).__init__()
    self._execute = SimpleCommand()
    self._confstore = ConfigManager.get_confstore()

def __init__(self):
    """
    Init IEM generator.
    """
    self._execute = SimpleCommand()
    with open(IEM_SCHEMA, 'r') as iem_schema_file:
        self.iem_alert_data = json.load(iem_schema_file)

def __init__(self):
    """
    Initialize IPMI Fencing Agent class.
    """
    super(IpmiFencingAgent, self).__init__()
    self._confstore = ConfigManager.get_confstore()
    self._execute = SimpleCommand()

def del_attr(resource: str):
    try:
        executor = SimpleCommand()
        # Note: the attribute is reset to 0 here rather than removed from attrd
        executor.run_cmd(f"attrd_updater -U 0 -n {resource}", check_error=False)
    except Exception as e:
        Log.error(f"Problem in deleting attr - resource: {resource}, Error: {e}")

def __init__(self, args: dict):
    """
    Init method.
    """
    self._url = args.config
    Conf.load(self._index, self._url)
    self._args = args.args
    self._execute = SimpleCommand()
    self._confstore = ConfigManager._get_confstore()
    self._cluster_manager = None

def check_cluster_health() -> None:
    """
    Check cluster status and make sure the cluster is healthy.
    """
    # Check if the cluster is running on the current node
    _, _, rc = SimpleCommand().run_cmd(PCS_CLUSTER_STATUS, check_error=False)
    if rc != 0:
        raise UpgradeError("Cluster is not running on current node")
    output, _, _ = SimpleCommand().run_cmd(PCS_FAILCOUNT_STATUS)
    if "INFINITY" in output:
        raise UpgradeError(f"Cluster is not stable, some resources are not healthy. {output}")

def update_attr(scope: str, resource: str, instances_per_node: int, node_name: str = None):
    if scope == AttribScope.CLUSTER:
        node_name = None
    try:
        count: int = AttribUpdater.get_count(resource, instances_per_node, node_name)
        if count >= 0:
            attrib_name: str = resource + "-count"
            executor = SimpleCommand()
            executor.run_cmd(f"attrd_updater -U {count} -n {attrib_name}", check_error=False)
    except Exception as e:
        Log.error(f"Problem in updating attr - count of resource: {resource}. Error: {e}")

def s3servers(cib_xml, instance, push=False):
    """Create resources that belong to s3server group and clone the group.

    S3 background consumer is ordered after s3server and co-located with it.
    """
    for i in range(1, int(instance) + 1):
        cmd_s3server = f"pcs -f {cib_xml} resource create s3server-{i} ocf:seagate:dynamic_fid_service_ra service=s3server fid_service_name=s3service --group io_group --force"
        SimpleCommand().run_cmd(cmd_s3server)
    cmd_s3bc = f"pcs -f {cib_xml} resource create s3backcons systemd:s3backgroundconsumer meta failure-timeout=300s --group io_group"
    SimpleCommand().run_cmd(cmd_s3bc)
    if push:
        cib_push(cib_xml)

def get_count(resource: str, instances_per_node: int, node_name: str = None):
    count = 0
    try:
        executor = SimpleCommand()
        for instance in range(1, instances_per_node + 1):
            service = resource + "-" + str(instance)
            out, _, _ = executor.run_cmd(f"attrd_updater -Q -A -n {service}", check_error=False)
            count += AttribUpdater._get_count_from_output(out, node_name)
    except Exception as e:
        Log.error(f"Problem in fetching attr. resource: {resource}. Error: {e}")
        count = -1
    return count

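# A minimal usage sketch for the attribute helpers above, assuming they are
# static methods on an AttribUpdater class and that AttribScope.NODE/CLUSTER
# constants exist as suggested by update_attr(); the resource and node names
# below are hypothetical.
AttribUpdater.update_attr(AttribScope.NODE, "motr-ios", instances_per_node=2,
                          node_name="srvnode-1")
running = AttribUpdater.get_count("motr-ios", instances_per_node=2,
                                  node_name="srvnode-1")
if running == 0:
    # Reset the pacemaker count attribute once no instance is active;
    # update_attr() publishes it under the name <resource>-count.
    AttribUpdater.del_attr("motr-ios-count")
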
def uds(cib_xml, push=False):
    """Create uds resource and constraints."""
    cmd_uds = f"pcs -f {cib_xml} resource create uds systemd:uds op monitor interval=30s"
    SimpleCommand().run_cmd(cmd_uds)
    constraints = [
        f"pcs -f {cib_xml} constraint colocation add uds with csm-agent score=INFINITY",
        # According to EOS-9258, there is a bug which requires UDS to be started after csm_agent
        f"pcs -f {cib_xml} constraint order csm-agent then uds"
    ]
    for c in constraints:
        SimpleCommand().run_cmd(c)
    if push:
        cib_push(cib_xml)

def __init__(self):
    """
    PcsClusterManager: manage the pacemaker/corosync cluster.
    """
    super(PcsClusterManager, self).__init__()
    self._execute = SimpleCommand()
    self._decision_monitor = DecisionMonitor()
    # TODO: add node_manager class to handle query
    self._refresh_contex = PcsRefreshContex(self._decision_monitor)
    # TODO: move node logic to node manager class
    self._node_status = ['Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected']

def _get_resource_list() -> list:
    """Get the list of resources."""
    resources: list = []
    # Clear history before getting the list of resources
    SimpleCommand().run_cmd(PCS_RESOURCE_REFRESH)
    output, _, _ = SimpleCommand().run_cmd(LIST_PCS_RESOURCES, check_error=False)
    if "NO resources" in output:
        return resources
    for resource in output.split("\n"):
        res = resource.split(":")[0]
        if res != "" and res not in resources:
            resources.append(res)
    return resources

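# A minimal, self-contained sketch of the name extraction in
# _get_resource_list(), assuming LIST_PCS_RESOURCES emits one raw resource id
# per line with clone instances suffixed ':N' (crm_resource --list-raw style);
# the sample output below is hypothetical.
sample_output = "motr-ios-1:0\nmotr-ios-1:1\nhaproxy\n"
names: list = []
for line in sample_output.split("\n"):
    name = line.split(":")[0]  # drop the clone-instance suffix, keep the id
    if name != "" and name not in names:
        names.append(name)
print(names)  # ['motr-ios-1', 'haproxy']
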
def free_space_monitor(cib_xml, push=False):
    """Create free space monitor resource. 1 per cluster, no affinity."""
    cmd_fsm = f"pcs -f {cib_xml} resource create motr-free-space-mon systemd:motr-free-space-monitor op monitor interval=30s meta failure-timeout=300s"
    SimpleCommand().run_cmd(cmd_fsm)
    constraints = [
        f"pcs -f {cib_xml} constraint order motr-ios-1 then motr-free-space-mon",
        f"pcs -f {cib_xml} constraint colocation add motr-free-space-mon with motr-ios-1"
    ]
    for c in constraints:
        SimpleCommand().run_cmd(c)
    if push:
        cib_push(cib_xml)

class NodeAlertMonitor(AlertMonitor):

    def __init__(self):
        """
        Init node alert monitor.
        """
        super(NodeAlertMonitor, self).__init__()
        self.process = SimpleCommand()

    def _get_online_nodes(self):
        """
        Get the list of online node ids, sorted in ascending order.
        """
        online_nodes_xml = self.process.run_cmd(const.GET_ONLINE_NODES_CMD)
        # Create an element tree object from the XML output
        root = ET.fromstring(online_nodes_xml[0])
        nodes_ids = []
        # Iterate over the <nodes> sections
        for item in root.findall('nodes'):
            # Iterate over the child node elements
            for child in item:
                if child.attrib['online'] == 'true':
                    nodes_ids.append(child.attrib['id'])
        Log.info(f"List of online node ids in cluster, sorted ascending: {sorted(nodes_ids)}")
        return sorted(nodes_ids)

    def _get_local_node(self):
        """
        Get local node name and id.
        """
        local_node_id = self.process.run_cmd(const.GET_LOCAL_NODE_ID_CMD)
        local_node_name = self.process.run_cmd(const.GET_LOCAL_NODE_NAME_CMD)
        Log.info(f"Local node name: {local_node_name[0]} \n Local node id: {local_node_id[0]}")
        return local_node_id[0], local_node_name[0]

    def process_alert(self):
        Log.debug("Processing event for NodeAlertMonitor")
        # Environment variables are available in self.crm_env
        self.iem = IemGenerator()
        # Get online node ids from corosync
        nodes_ids = self._get_online_nodes()
        local_node_id, local_node_name = self._get_local_node()
        # Generate and send the IEM only from the online node with the highest id in the cluster.
        if nodes_ids[-1].strip() == local_node_id.strip():
            self.iem.generate_iem(self.crm_env["CRM_alert_node"], self.alert_event_module, self.alert_event_type)
            Log.info(f"Sent IEM alert from the node - name: {local_node_name}, id: {local_node_id}")
        else:
            # nodes_ids is already sorted; logging the list directly
            # (the original logged nodes_ids.sort(), which is always None)
            Log.debug(f"This node does not have the highest id. Local node id: {local_node_id}, all online nodes: {nodes_ids}.")

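# A minimal, self-contained sketch of the online-node extraction used by
# _get_online_nodes(), assuming GET_ONLINE_NODES_CMD returns pacemaker/corosync
# status XML with a <nodes> section whose children carry 'id' and 'online'
# attributes; the sample XML below is hypothetical.
import xml.etree.ElementTree as ET

sample_xml = """
<crm_mon>
  <nodes>
    <node id="1" name="srvnode-1" online="true"/>
    <node id="2" name="srvnode-2" online="false"/>
    <node id="3" name="srvnode-3" online="true"/>
  </nodes>
</crm_mon>
"""

root = ET.fromstring(sample_xml)
online_ids = [child.attrib['id']
              for item in root.findall('nodes')
              for child in item
              if child.attrib['online'] == 'true']
print(sorted(online_ids))  # ['1', '3'] - the highest id is the IEM sender
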
def haproxy(cib_xml, push=False):
    """Create haproxy resource in pacemaker as part of io_group."""
    cmd_haproxy = f"pcs -f {cib_xml} resource create haproxy systemd:haproxy op monitor interval=30 --group io_group"
    SimpleCommand().run_cmd(cmd_haproxy)
    if push:
        cib_push(cib_xml)

def s3auth(cib_xml, push=False):
    """Create S3 auth server clone resource in pacemaker."""
    cmd_s3auth = f"pcs -f {cib_xml} resource create s3auth systemd:s3authserver clone op monitor interval=30"
    SimpleCommand().run_cmd(cmd_s3auth)
    if push:
        cib_push(cib_xml)

def cluster_create(cluster_name, nodelist, enable=True, put_standby=True):
    """
    Create a cluster on the given nodes. Enables and starts the cluster if needed.

    Parameters:
        cluster_name - name of the cluster to be created
        nodelist - list of nodes on which the cluster is set up
        enable - whether the cluster service shall start on boot
        put_standby - whether the nodes in nodelist shall be put into standby mode

    Returns: None.

    Exceptions:
        ClusterCreateError: generic exception to catch all creation-related problems.
        ClusterSetupError: failure happened during a setup operation.
    """
    nodes = " ".join(nodelist)
    cmd_setup = f"pcs cluster setup --start --name {cluster_name} {nodes}"
    cmd_standby = f"pcs node standby {nodes}"
    cmd_stonith = "pcs property set stonith-enabled=False"
    cmd_enable = f"pcs cluster enable {nodes}"
    cmdlist = [cmd_setup]
    if enable:
        cmdlist.append(cmd_enable)
    if put_standby:
        cmdlist.append(cmd_standby)
    cmdlist.append(cmd_stonith)
    try:
        for s in cmdlist:
            SimpleCommand().run_cmd(s)
    except Exception as err:
        raise ClusterSetupError("Failed to setup the cluster") from err

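# A minimal usage sketch for cluster_create(), with hypothetical cluster and
# node names; enable/put_standby mirror the defaults shown above.
try:
    cluster_create("cortx_cluster", ["srvnode-1", "srvnode-2"],
                   enable=True, put_standby=True)
except ClusterSetupError as err:
    Log.error(f"Cluster creation failed: {err}")
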
def sspl(cib_xml, push=False):
    """Create sspl clone resource in pacemaker."""
    # Using the sspl-ll service file according to the content of the SSPL repo
    cmd_sspl = f"pcs -f {cib_xml} resource create sspl-ll systemd:sspl-ll clone op monitor interval=30"
    SimpleCommand().run_cmd(cmd_sspl)
    if push:
        cib_push(cib_xml)

def __init__(self, args: dict):
    """
    Init method.
    """
    self._url = args.config
    Conf.load(self._index, self._url)
    self._args = args.args
    self._execute = SimpleCommand()

class IemGenerator:
    '''
    Module responsible for constructing an IEC and sending it to syslog.
    '''

    def __init__(self):
        """
        Init IEM generator.
        """
        self._execute = SimpleCommand()
        with open(IEM_SCHEMA, 'r') as iem_schema_file:
            self.iem_alert_data = json.load(iem_schema_file)

    def generate_iem(self, node: str, module: str, event_type: str) -> None:
        '''
        Form an IEC based on values such as module <node/resource> and
        event_type <lost/member for a node scenario>.

        IEC code: IEC:{severity}{source}{component}{module_id}{event_id}:{description}
            severity  : severity of the event
            source    : source (Hardware or Software) of the event
            component : component which is generating the IEM
            module_id : sub-component of the module which generated the IEM
            event_id  : unique identification of the event (e.g. node lost or node became member)
        Example:
            IEC:WS0080010001: node is down (node lost)
            IEC:IS0080010002: node is up (node is now member)

        Required parameters:
            node : node name
            module : module type (e.g. 'node' or 'resource')
            event_type : type of event based on module (e.g. 'member' / 'lost' when module is 'node')
        '''
        try:
            module_type = self.iem_alert_data.get(module)
            severity = module_type.get('severity').get(event_type)
            source = module_type.get('source')
            component = module_type.get('component')
            module_id = module_type.get('module')
            event_id = module_type.get('event').get(event_type).get('ID')
            desc = module_type.get('event').get(event_type).get('desc')
            description = re.sub(r"\$host", node, desc)
            description = re.sub(r"\$status", event_type, description)
            iec_string = f'"IEC:{severity}{source}{component}{module_id}{event_id}:{description}"'
            iec_command = ALERTS.logger_utility_iec_cmd + ' ' + iec_string
            Log.info(f'Sending an IEC: {iec_string} to syslog')
            _output, _err, _rc = self._execute.run_cmd(iec_command, check_error=False)
            if _rc != 0 or _err:
                raise Exception(f'Failed to populate an IEC to syslog: {_err}')
        except KeyError as kerr:
            Log.error(f'Key error occurred while parsing the IEM data while generating '
                      f'an IEC for {module} for the event {event_type}: {kerr}')
        except Exception as err:
            Log.error(f'Problem occurred while generating an IEC for {module} '
                      f'for the event {event_type}: {err}')

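# A minimal usage sketch for IemGenerator.generate_iem(), reusing the 'node' /
# 'lost' scenario from the docstring above; the node name is hypothetical and
# IEM_SCHEMA must already exist on the system.
iem = IemGenerator()
# Announce that srvnode-2 left the cluster; per the schema this should send
# something like "IEC:WS0080010001: srvnode-2 is down(node lost)" to syslog.
iem.generate_iem(node="srvnode-2", module="node", event_type="lost")
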
def __init__(self):
    """
    PcsClusterManager: manage the pacemaker/corosync cluster.
    """
    super(PcsClusterManager, self).__init__()
    self._execute = SimpleCommand()
    # Get the major version from ha.conf
    self._version = ConfigManager.get_major_version()
    if self._version == const.CORTX_VERSION_1:
        self._decision_monitor = DecisionMonitor()
        # TODO: add node_manager class to handle query
        self._refresh_contex = PcsRefreshContex(self._decision_monitor)
        # TODO: move node logic to node manager class
        self._node_status = ['Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected']

def delete_resources() -> None:
    """
    Delete pacemaker resources.

    Exceptions: UpgradeError
    """
    try:
        resources = _get_resource_list()
        Log.info(f"Going to delete the following resources: {resources}")
        for r in resources:
            Log.info(f"Deleting resource {r}")
            SimpleCommand().run_cmd(PCS_DELETE_RESOURCE.replace("<resource>", r))
        SimpleCommand().run_cmd(PCS_CLEANUP)
        Log.info("Waiting up to 2 min until all resources are deleted.")
        is_resource_deleted(120)
    except Exception as err:
        raise UpgradeError("Resource deletion failed") from err

def _get_pcs_status(self):
    """
    Get the status of the cluster using the "pcs status --full xml" command.
    """
    self._initialize_node_health()
    error = None
    try:
        self._output, error, rc = SimpleCommand().run_cmd(PcsConstants.PCS_STATUS_XML)
    except Exception:
        Log.info("Failed to run pcs status on the current node.")
        rc = 1
    Log.info(f"pcs status: rc = {rc}, error = {error}")
    if rc != 0:
        # Fall back to fetching the status from a remote node
        self._output = self._get_pcs_status_remote()
    if self._output is not None:
        self._output = ElementTree.fromstring(self._output)

def __init__(self, args: dict):
    """
    Init method.
    """
    if args is not None:
        self._url = args.config
        self._service = args.services
        Conf.load(self._index, self._url)
        self._args = args.args
    self._confstore = None
    self._execute = SimpleCommand()

def mgmt_resources(cib_xml, push=False):
    """Create mandatory resources for the mgmt stack."""
    kibana = f"pcs -f {cib_xml} resource create kibana systemd:kibana op monitor interval=30s"
    agent = f"pcs -f {cib_xml} resource create csm-agent systemd:csm_agent op monitor interval=30s"
    web = f"pcs -f {cib_xml} resource create csm-web systemd:csm_web op monitor interval=30s"
    for c in (kibana, agent, web):
        SimpleCommand().run_cmd(c)
    if push:
        cib_push(cib_xml)

def mgmt_vip(cib_xml, vip, iface, cidr=24, push=False):
    """Create mgmt Virtual IP resource."""
    cmd = f"pcs -f {cib_xml} resource create mgmt-vip ocf:heartbeat:IPaddr2 " \
          f"ip={vip} cidr_netmask={cidr} nic={iface} iflabel=v1 " \
          f"op start interval=0s timeout=60s " \
          f"op monitor interval=5s timeout=20s " \
          f"op stop interval=0s timeout=60s"
    SimpleCommand().run_cmd(cmd)
    if push:
        cib_push(cib_xml)

def mgmt_stack(cib_xml, mgmt_vip_cfg, with_uds=False, push=False):
    """Create mgmt stack related resources.

    It also creates and defines the management group to support colocation
    and ordering requirements.
    """
    mgmt_resources(cib_xml)
    if with_uds:
        uds(cib_xml)
    cmd = f"pcs -f {cib_xml} resource group add management kibana csm-agent csm-web"
    SimpleCommand().run_cmd(cmd)
    if mgmt_vip_cfg:
        mgmt_vip(cib_xml, **mgmt_vip_cfg, push=False)
        # Place the VIP at the head of the management group so it starts first
        cmd_group = f"pcs -f {cib_xml} resource group add management mgmt-vip --before kibana"
        SimpleCommand().run_cmd(cmd_group)
    if push:
        cib_push(cib_xml)

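# A minimal usage sketch for building the mgmt stack; the CIB file path and
# VIP configuration are hypothetical, and the cfg keys match the mgmt_vip()
# signature above.
vip_cfg = {"vip": "10.0.0.100", "iface": "eth0", "cidr": 24}
mgmt_stack("/var/tmp/cib.xml", vip_cfg, with_uds=False, push=True)
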
def _is_cluster_standby_on() -> None:
    '''Check if the cluster is in standby mode. If not, turn standby mode ON.'''
    Log.info('Checking whether the cluster is in standby mode')
    value = SimpleCommand().run_cmd(CHECK_PCS_STANDBY_MODE)
    # Token 3 of the command output is expected to look like 'standby=on'
    standby_value = value[0].split(' ')[3].strip('\n').split('=')
    if standby_value[1].lower() != 'on':
        Log.warn('Cluster is not in standby mode.')
        Log.info('Switching the cluster to standby mode for performing post-upgrade routines')
        _switch_cluster_mode(PCS_CLUSTER_STANDBY)
    Log.info('#### All post-upgrade prerequisites are in place ####')