Example no. 1
    def del_attr(resource: str):
        try:
            executor = SimpleCommand()
            executor.run_cmd(f"attrd_updater -U 0 -n {resource}",
                             check_error=False)
        except Exception as e:
            Log.error(
                f"Problem in deleting attr - resource: {resource}, Error: {e}")
    def update_attr(scope: str, resource: str, instances_per_node: int, node_name:str = None):
        if scope == AttribScope.CLUSTER:
            node_name = None

        try:
            count: int = AttribUpdater.get_count(resource, instances_per_node, node_name)
            if count >= 0:
                attrib_name: str = resource + "-count"
                executor = SimpleCommand()
                executor.run_cmd(f"attrd_updater -U {count} -n {attrib_name}", check_error=False)
        except Exception as e:
            Log.error(f"Problem in updating attr - count of resource: {resource}. Error: {e}")
Example no. 3
class NodeAlertMonitor(AlertMonitor):

    def __init__(self):
        """
        Init node alert monitor
        """
        super(NodeAlertMonitor, self).__init__()
        self.process = SimpleCommand()

    def _get_online_nodes(self):
        """
        Get list of online nodes ids.
        """
        online_nodes_xml = self.process.run_cmd(const.GET_ONLINE_NODES_CMD)
        # create element tree object
        root = ET.fromstring(online_nodes_xml[0])
        nodes_ids = []
        # iterate over <nodes> elements
        for item in root.findall('nodes'):
            # iterate child elements of item
            for child in item:
                if child.attrib['online'] == 'true':
                    nodes_ids.append(child.attrib['id'])
        Log.info(f"List of online nodes ids in cluster in sorted ascending order: {sorted(nodes_ids)}")
        return sorted(nodes_ids)

    def _get_local_node(self):
        """
        Get Local node name and id.
        """
        local_node_id = self.process.run_cmd(const.GET_LOCAL_NODE_ID_CMD)
        local_node_name = self.process.run_cmd(const.GET_LOCAL_NODE_NAME_CMD)
        Log.info(f"Local node name: {local_node_name[0]} \n Local node id: {local_node_id[0]}")
        return local_node_id[0], local_node_name[0]

    def process_alert(self):
        Log.debug("Processing event for NodeAlertMonitor")
        # Environment variables are available in self.crm_env
        self.iem = IemGenerator()
        # Get online nodeids from corosync.
        nodes_ids = self._get_online_nodes()
        local_node_id, local_node_name = self._get_local_node()
        # Generate and send IEM only through the highest online node in cluster.
        if nodes_ids[-1].strip() == local_node_id.strip():
            self.iem.generate_iem(self.crm_env["CRM_alert_node"], self.alert_event_module, self.alert_event_type)
            Log.info(f"Sent IEM alert from the node - name: {local_node_name}, id: {local_node_id}")
        else:
            Log.debug(
                f"This node does not have the highest id. Local node id: {local_node_id}, all nodes: {nodes_ids}.")
Example no. 4
def get_machine_id():
    execute = SimpleCommand()
    command = "cat /etc/machine-id"
    machine_id, err, rc = execute.run_cmd(command, check_error=True)
    Log.info(
        f"Read machine-id. Output: {machine_id}, Err: {err}, RC: {rc}")
    return machine_id.strip()
Example no. 5
class IemGenerator:
    '''
    Module responsible for constructing an IEC and sending it to syslog
    '''
    def __init__(self):
        """
        Init IEM generator
        """
        self._execute = SimpleCommand()
        with open(IEM_SCHEMA, 'r') as iem_schema_file:
            self.iem_alert_data = json.load(iem_schema_file)

    def generate_iem(self, node: str, module: str, event_type: str) -> None:
        '''
           Forms an IEC based on different values such as module:<node/resource>
           and event_type:<lost/member for a node scenario>
           IEC code:
           IEC:{severity}{source}{component}{module_id}{event_id}:{description}
           severity: severity of the event
           source: source (Hardware or Software) of the event
           component: component that is generating the IEM
           event_id: unique identification of the event (like node lost or node now became member)
           module: sub-component of the module that generated the IEM
           Ex:
           IEC:WS0080010001: node is down(node lost)
           IEC:IS0080010002: node is up(node is now member)

           Required parameters
           node : Node name
           module : Module type (ex 'node' or 'resource')
           event_type : Type of event based on module (ex 'member' / 'lost' when module is 'node')
        '''
        try:
            module_type = self.iem_alert_data.get(module)
            severity = module_type.get('severity').get(event_type)
            source = module_type.get('source')
            component = module_type.get('component')
            module_id = module_type.get('module')
            event_id = module_type.get('event').get(event_type).get('ID')
            desc = module_type.get('event').get(event_type).get('desc')
            description = re.sub(r"\$host", node, desc)
            description = re.sub(r"\$status", event_type, description)
            iec_string = f'"IEC:{severity}{source}{component}{module_id}{event_id}:{description}"'
            iec_command = ALERTS.logger_utility_iec_cmd + ' ' + iec_string
            Log.info(f'Sending an IEC: {iec_string} to syslog')

            _output, _err, _rc = self._execute.run_cmd(iec_command,
                                                       check_error=False)
            if _rc != 0 or _err:
                raise Exception(f'Failed to populate an IEC to syslog: {_err}')
        except KeyError as kerr:
            Log.error(f'Key error occurred while parsing the IEM data while generating '
                      f'an IEC for {module} for the event {event_type}: {kerr}')
        except Exception as err:
            Log.error(f'Problem occurred while generating an IEC for {module} '
                      f'for the event {event_type}: {err}')
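generate_iem only works if IEM_SCHEMA maps each module to the keys looked up above: a per-event severity, a source, a component, a module id, and an event table carrying ID and desc. A hypothetical fragment consistent with those lookups and with the example codes in the docstring; the schema actually shipped with the product may differ:

# Hypothetical iem_alert_data structure matching the .get() chain in generate_iem.
iem_alert_data = {
    "node": {
        "severity": {"lost": "W", "member": "I"},
        "source": "S",
        "component": "008",
        "module": "001",
        "event": {
            "lost":   {"ID": "0001", "desc": "$host is down($status)"},
            "member": {"ID": "0002", "desc": "$host is up($status)"},
        },
    },
}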
Example no. 6
    def get_count(resource: str, instances_per_node: int, node_name: str = None):
        count = 0
        try:
            executor = SimpleCommand()
            for instance in range(1, instances_per_node+1):
                service = resource + "-" + str(instance)
                out, _, _ = executor.run_cmd(f"attrd_updater -Q -A -n {service}", check_error=False)
                count += AttribUpdater._get_count_from_output(out, node_name)
        except Exception as e:
            Log.error(f"Problem in fetching attr. resource: {resource}. Error: {e}")
            count = -1

        return count
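get_count relies on a _get_count_from_output helper that is not shown in this snippet. Assuming attrd_updater -Q -A prints one name=... host=... value=... line per node, a hypothetical parser could look like the following; the real helper may aggregate the attribute values differently:

def _get_count_from_output(output: str, node_name: str = None) -> int:
    """Hypothetical parser: count attribute lines, optionally filtered by host."""
    count = 0
    for line in output.splitlines():
        if "value=" not in line:
            continue
        # When a node name is given, only count entries reported for that host.
        if node_name is None or f'host="{node_name}"' in line:
            count += 1
    return count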
Example no. 7
class PcsClusterManager(ClusterManager):
    def __init__(self):
        """
        PcsClusterManager manages the pacemaker/corosync cluster
        """
        super(PcsClusterManager, self).__init__()
        self._execute = SimpleCommand()
        self._decision_monitor = DecisionMonitor()
        # TODO: add node_manager class to handle query
        self._refresh_contex = PcsRefreshContex(self._decision_monitor)
        # TODO move node logic to node manager class
        self._node_status = [
            'Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected'
        ]

    def process_request(self, action, args, output):
        """
        Generic method to handle process request

        Args:
            action (str): Cluster action to perform for the request.
            args (dict): Parameters passed with the request.
        """
        # TODO Add validator
        # TODO Optimize if else
        if action == const.CLUSTER_COMMAND:
            if args.cluster_action == "add_node":
                self.add_node(args.node)
            elif args.cluster_action == "remove_node":
                self.remove_node(args.node)
            elif args.cluster_action == "start":
                self.start()
            elif args.cluster_action == "stop":
                self.stop()
            elif args.cluster_action == "status":
                self.status()
            elif args.cluster_action == "shutdown":
                self.shutdown()
        elif action == const.NODE_COMMAND:
            self._refresh_contex.process_request(action, args)
        else:
            raise HAUnimplemented()

    def node_status(self, node):
        """
        Check node status
        If the node is not detected, return rc 1, else 0.
        Node status:
         Online:
         Standby:
         Maintenance:
         Offline:
        """
        Log.debug(f"Check {node} node status")
        # TODO: check if node is valid
        # TODO move node logic to node manager class
        _output, _err, _rc = self._execute.run_cmd("pcs status nodes")
        for status in _output.split("\n"):
            if node in status.split():
                node_rc = 0
                node_status = (status.split()[0])[:-1]
                Log.debug(
                    f"For {node} node rc: {node_rc}, status: {node_status}")
                return node_rc, node_status
        Log.debug(
            f"{node} is not detected in cluster, treating as disconnected node"
        )
        return 1, "Disconnected"

    def remove_node(self, node):
        """
        Remove node from pcs cluster
        """
        # TODO: Limitation for node remove (in cluster node cannot remove it self)
        # Check if node already removed
        _rc, status = self.node_status(node)
        if _rc != 1:
            self._execute.run_cmd(f"pcs cluster node remove {node} --force")
            _rc, status = self.node_status(node)
            Log.debug(f"For node {node} status: {status}, rc: {_rc}")
            if _rc != 1:
                Log.error(f"Failed to remove {node}")
                raise Exception(f"Failed to remove {node}")
            else:
                Log.info(f"Node {node} removed from cluster")
        else:
            Log.info(f"Node {node} already removed from cluster")

    def add_node(self, node):
        """
        Add new node to pcs cluster
        """
        # TODO: Limitation for node add (in cluster node cannot add it self)
        commands = [
            f"pcs cluster node add {node}", "pcs resource cleanup --all",
            f"pcs cluster enable {node}", f"pcs cluster start {node}"
        ]
        _rc, status = self.node_status(node)
        if _rc != 0:
            for command in commands:
                self._execute.run_cmd(command)
            time.sleep(20)
            _rc, status = self.node_status(node)
            Log.debug(f"{node} status rc: {_rc}, status: {status}")
            if status != 'Online':
                Log.error(f"Failed to add {node}")
                raise Exception(f"Failed to add {node}")
            else:
                Log.info(f"Node {node} added to cluster")
        else:
            Log.info(f"Node {node} already added to cluster")

    def start(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented()

    def stop(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented()

    def status(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented()

    def shutdown(self):
        raise HAUnimplemented()
Example no. 8
class CortxClusterManager:
    def __init__(self):
        """
        Manage cluster operation
        """
        self._execute = SimpleCommand()

    def process_request(self, action, args, output):
        """
        Process cluster request

        Args:
            action (string): action taken on cluster
            args (dictionary): parameters
            output (object): Store output

        Raises:
            HAUnimplemented: [description]
        """
        # TODO: Provide service and node management
        self._output = output
        if action == const.CLUSTER_COMMAND:
            getattr(self, args.cluster_action)()

    def remove_node(self):
        raise HAUnimplemented("Cluster remove node is not supported.")

    def add_node(self):
        raise HAUnimplemented("Cluster add node is not supported.")

    def start(self):
        Log.debug("Executing cluster start")
        _output, _err, _rc = self._execute.run_cmd(const.HCTL_START,
                                                   check_error=False)
        Log.info(
            f"IO stack started. Output: {_output}, Err: {_err}, RC: {_rc}")
        self.status()
        if self._output.get_rc() == 0:
            Log.info("Cluster started successfully")
            self._output.output("Cluster started successfully")
            self._output.rc(0)
        else:
            Log.error("Cluster failed to start")
            self._output.output("Cluster failed to start")
            self._output.rc(1)

    def stop(self):
        Log.info("Executing cluster Stop")
        _output, _err, _rc = self._execute.run_cmd(const.HCTL_STOP,
                                                   check_error=False)
        Log.info(
            f"IO stack stopped. Output: {_output}, Err: {_err}, RC: {_rc}"
        )
        self.status()
        if self._output.get_rc() == 1:
            Log.info("Cluster stopped successfully")
            self._output.output("Cluster stopped successfully...")
            self._output.rc(0)
        else:
            Log.error("Cluster failed to stop")
            self._output.output("Cluster failed to stop")
            self._output.rc(1)

    def status(self):
        _output, _err, _rc = self._execute.run_cmd(const.HCTL_STATUS,
                                                   check_error=False)
        self._output.rc(_rc)
        status = const.HCTL_STARTED_STATUS if _rc == 0 else const.HCTL_STOPPED_STATUS
        self._output.output(status)

    def shutdown(self):
        raise HAUnimplemented("Cluster shutdown is not supported.")
Example no. 9
class AlertConfig:

    ALERT_SCRIPT_PATH = ["/usr/local/bin/pcmk_alert", "/usr/bin/pcmk_alert"]
    ALERT_ID = "iem_alert"
    RECIPIENT_KEY = "sender_type"
    RECIPIENT_VALUE = "syslog"

    def __init__(self):
        self._process = SimpleCommand()

    def is_alert_exists(self) -> bool:
        """
        Check if alert already exists.
        """
        Log.info("Checking pacemaker alert if already exists ...")
        output, _, rc = self._process.run_cmd("pcs alert")
        if rc != 0:
            raise AlertConfigError("Failed to execute pcs alert.")
        for line in output.split("\n"):
            if AlertConfig.ALERT_ID in line:
                return True
        return False

    def create_alert(self):
        """
        Manage alert resource.
        """
        Log.info("Creating pacemaker alert ...")
        if self.is_alert_exists():
            Log.info("Alert already exists, skipping create alert.")
            return
        path: str = AlertConfig._get_script()
        try:
            self._process.run_cmd(f"pcs alert create id={AlertConfig.ALERT_ID} description=send_iem_alerts path={path}")
            self._process.run_cmd(f"pcs alert recipient add {AlertConfig.ALERT_ID} id={AlertConfig.RECIPIENT_KEY} value={AlertConfig.RECIPIENT_VALUE}")
            Log.info(f"Alert {AlertConfig.ALERT_ID} created successfully.")
        except Exception as e:
            raise AlertConfigError(f"Failed to create alert {AlertConfig.ALERT_ID} Error: {e}")

    def delete_alert(self):
        """
        Delete alert on current node.
        """
        if not self.is_alert_exists():
            return
        Log.info("Deleating pacemaker alert ...")
        self._process.run_cmd(f"pcs alert remove {AlertConfig.ALERT_ID}")
        Log.info(f"Alert {AlertConfig.ALERT_ID} is deleted")

    @staticmethod
    def _get_script() -> str:
        """
        Get alert script

        Raises:
            CreateResourceConfigError: Raise error if script missing
        """
        path: str = None
        for script in AlertConfig.ALERT_SCRIPT_PATH:
            if os.path.isfile(script):
                path = script
                break
        if path is None:
            raise AlertConfigError(f"Failed to create alert missing {str(AlertConfig.ALERT_SCRIPT_PATH)} file.")
        return path
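A short usage sketch for the alert configuration above, assuming pcs is available on the node and the pcmk_alert script is installed at one of the listed paths:

alert_config = AlertConfig()
alert_config.create_alert()          # idempotent: skipped if the alert id already exists
assert alert_config.is_alert_exists()
alert_config.delete_alert()          # removes the alert only if it exists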
Example no. 10
class Cleanup:
    def __init__(self, decision_monitor):
        """
        Description: Cleanup Event
        """
        self._decision_monitor = decision_monitor
        self._execute = SimpleCommand()

    def cleanup_db(self, node, data_only):
        """
        Args:
            node ([string]): Node name.
            data_only ([boolean]): Remove data only.

        Action:
            consul data:
                {'entity': 'enclosure', 'entity_id': '0',
                'component': 'controller', 'component_id': 'node1'}
            if data_only is True then remove data else remove
            data and perform cleanup.
        """
        resources = Conf.get(const.RESOURCE_GLOBAL_INDEX, "resources")
        node = "all" if node is None else node
        Log.debug(f"Performing cleanup for {node} node")
        for key in resources.keys():
            if node == "all":
                self._decision_monitor.acknowledge_resource(key, data_only)
            elif node in key:
                self._decision_monitor.acknowledge_resource(key, data_only)
            else:
                pass
        if not data_only:
            Log.info(f"Reseting HA decision event for {node}")
            self.reset_failover(node)

    def is_cleanup_required(self, node=None):
        """
        Check if all alert resolved

        Args:
            node ([type]): [description]
        """
        node = "all" if node is None else node
        Log.debug(f"Performing failback on {node}")
        resource_list = Conf.get(const.RESOURCE_GLOBAL_INDEX, "resources")
        status_list = {}
        for resource in resource_list:
            if node == "all":
                status_list[
                    resource] = self._decision_monitor.get_resource_status(
                        resource)
            elif node in resource:
                status_list[
                    resource] = self._decision_monitor.get_resource_status(
                        resource)
            else:
                pass
        Log.info(f"Resource status for node {node} is {status_list}")
        if Action.FAILED in status_list.values():
            Log.debug("Some component are not yet recovered skipping failback")
        elif Action.RESOLVED in status_list.values():
            Log.info("Failback is required as some of alert are resolved.")
            return True
        else:
            Log.debug(
                f"{node} node is already in a good state, no need for failback")
        return False

    def reset_failover(self, node=None, soft_cleanup=False):
        """
        Cleanup pacemaker failcount to allow failback.
        """
        node = "all" if node is None else node
        cmd = const.PCS_CLEANUP if node == "all" else const.PCS_CLEANUP + f" --node {node}"
        if soft_cleanup:
            if self.is_cleanup_required(node):
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_FAILCOUNT_STATUS)
                Log.info(
                    f"Resource failcount before Failback: {_output}, Error:{_err}, RC:{_rc}"
                )
                _output, _err, _rc = self._execute.run_cmd(cmd)
                Log.info(
                    f"Failback completed. Output:{_output}, Error:{_err}, RC:{_rc}"
                )
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_FAILCOUNT_STATUS)
                Log.info(
                    f"Resource failcount after Failback: {_output}, Error:{_err}, RC:{_rc}"
                )
            else:
                Log.debug(
                    "Cleanup is not required, alerts are not yet resolved.")
        else:
            self._execute.run_cmd(cmd)
        Log.debug(f"Status: {self._execute.run_cmd(const.PCS_STATUS)}")
Example no. 11
class PcsController(ElementController):
    """ Generic Controller for Pcs to execute common pcs command """

    def __init__(self):
        """
        Initialize pcs controller
        """
        super(PcsController, self).__init__()
        self._execute = SimpleCommand()
        self._confstore = ConfigManager.get_confstore()

    def _check_non_empty(self, **kwargs):
        """
        Check if params are not empty.

        Raises:
            ClusterManagerError: [description]
        """
        for key in kwargs.keys():
            if kwargs[key] is None or kwargs[key] == "":
                raise ClusterManagerError(f"Failed: Invalid parameter, {key} cannot be empty.")

    @staticmethod
    def load_json_file(json_file):
        """
        Load json file to read node & the cluster details to auth node
        :param json_file:
        """
        try:
            with open(json_file) as f:
                return json.load(f)
        except Exception as e:
            raise ClusterManagerError(f"Error in reading desc_file, reason : {e}")

    def heal_resource(self, node_id):
        """
        Heal the resources if any fail counts exist
        """
        count = 0
        resources_healed = False
        while True:
            if count >= const.RETRY_COUNT:
                break
            fail_count_exists = self.check_resource_failcount(node_id)
            if fail_count_exists:
                self.clean_failure_count(node_id)
            else:
                resources_healed = True
                break
            count += 1
            time.sleep(10)
        return resources_healed

    def check_resource_failcount(self, node_id) -> bool:
        """
        Resource fail count check
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_FAILCOUNT_STATUS, check_error=False)
        if node_id in _output:
            return True
        else:
            return False

    def clean_failure_count(self, node_id):
        """
        Cleanup resources fail count
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_NODE_CLEANUP.replace("<node>", node_id),
                                                   check_error=False)

    def _get_cluster_size(self):
        """
        Return the cluster size based on the pcsd status output.
        """
        try:
            _output, _err, _rc = self._execute.run_cmd(const.PCS_CLUSTER_PCSD_STATUS)
            return len(_output.split("\n"))
        except Exception as e:
            raise ClusterManagerError(f"Unable to get cluster : with reason : {e}")

    def _get_node_list(self) -> list:
        """
        Return list of nodes.
        """
        # TODO: This is a temporary implementation; it should be removed once the nodelist is available in system health.
        nodelist = []
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES, check_error=False)

        if _rc != 0:
            raise ClusterManagerError("Failed to get nodes status")
        for status in _output.split("\n"):
            nodes = status.split(":")
            if len(nodes) > 1:
                nodelist.extend(nodes[1].split())
        return nodelist

    def nodes_status(self, nodeids: list = None) -> dict:
        """
        Get pcs status of nodes.
        Args:
            nodeids (list): List of Node IDs from cluster nodes.
                By default, provide the status of all nodes.
                If 'local', provide the local node status.

        Returns:
            ([dict]): Return dictionary. {"node_id1": "status of node_id1",
                                          "node_id2": "status of node_id2"...}
        """
        nodeids = self._get_node_list() if nodeids is None or len(nodeids) == 0 else nodeids
        all_nodes_status = dict()
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES, check_error=False)
        if not isinstance(nodeids, list):
            raise ClusterManagerError(f"Invalid nodeids type `{type(nodeids)}`, required `list`")
        for nodeid in nodeids:
            if nodeid in _output:
                for status in _output.split("\n"):
                    nodes = status.split(":")
                    if len(nodes) > 1 and nodeid.lower() in nodes[1].strip().lower():
                        if nodes[0].strip().lower() == NODE_STATUSES.STANDBY.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.MAINTENANCE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.MAINTENANCE.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.CLUSTER_OFFLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.CLUSTER_OFFLINE.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.ONLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.ONLINE.value
                        break
                else:
                    all_nodes_status[nodeid] = NODE_STATUSES.UNKNOWN.value
            else:
                raise HAInvalidNode(f"Node {nodeid} is not a part of cluster")
        for node in all_nodes_status.keys():
            status = all_nodes_status[node]
            if status == NODE_STATUSES.CLUSTER_OFFLINE.value:
                _output, _err, _rc = self._execute.run_cmd(f"ping -c 1 {node}", check_error=False)
                if _rc != 0:
                    all_nodes_status[node] = NODE_STATUSES.POWEROFF.value
        return all_nodes_status

    def _get_filtered_nodes(self, status: list) -> list:
        """
        Return the list of nodes matching the given statuses.

        Args:
            status (list): Status list of node

        Returns:
            list: List of node
        """
        nodelist = []
        node_status = self.nodes_status()
        for node in node_status.keys():
            if node_status[node] in status:
                nodelist.append(node)
        return nodelist

    def is_valid_node_id(self, node_id) -> bool:
        '''
           Checks if node id gets resolved to some IP address or not
           Returns: bool
           Exception: socket.gaierror, socket.herror
        '''
        # TODO: change this logic and validate the node_id from the
        # list coming from system health
        ip_validator_regex = r"^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$"

        # node_id can be passed as IP address or FQDN or
        # some random number or just sequence of chars

        # first remove the dots from the string.
        splitted_node_id = node_id.replace('.', '')

        # If the string contains only digits, it can be either a random
        # number or an IP address, so IP address validation is done.
        # Otherwise, for a random number, an exception will be raised.
        if re.search('^[0-9]*$', splitted_node_id):
            if re.search(ip_validator_regex, node_id):
                return True
            raise HAClusterCLIError(f'{node_id} is not a valid node_id')
        # else it can be a combination of chars and numbers, i.e. a hostname,
        # or just a random meaningless string
        else:
            try:
                socket.gethostbyname(node_id)
            except Exception as err:
                raise HAClusterCLIError(f'{node_id} not a valid node_id: {err}')
        return True

    def _is_node_in_cluster(self, node_id: str):
        """
        Checks if node_id present in cluster or not
        Args:
            node_id (str): Private fqdn define in conf store.
        Raises: HAInvalidNode
        If Node is not present in cluster or Node is not valid raise Exception
        """
        if node_id in self._get_node_list():
            return True
        raise HAInvalidNode(f"The node {node_id} is not present in the cluster.")
Example no. 12
class IpmiFencingAgent(FencingAgent):
    """ Tool to manage IPMI-enabled devices """

    NODE_BMC_INFO_KEY = "node_bmc_info"
    IPMI_IPADDR = "ipmi_ipaddr"
    IPMI_USER = "******"
    IPMI_AUTH_KEY = "ipmi_auth_key"

    def __init__(self):
        """
        Initialize IPMI Fencing Agent class.
        """
        super(IpmiFencingAgent, self).__init__()
        self._confstore = ConfigManager.get_confstore()
        self._execute = SimpleCommand()

    def power_off(self, node_id: str):
        """
        Power OFF node with nodeid

        Args:
            node_id (str): private fqdn define in conf store.
        """
        try:
            bmc_info = self._confstore.get(f"{IpmiFencingAgent.NODE_BMC_INFO_KEY}/node/{node_id}")
            if bmc_info is not None:
                _, value = bmc_info.popitem()
                bmc_info_dict = ast.literal_eval(value)
                self._execute.run_cmd(f"ipmitool -I lanplus -H {bmc_info_dict[IpmiFencingAgent.IPMI_IPADDR]} "
                                      f"-U {bmc_info_dict[IpmiFencingAgent.IPMI_USER]} "
                                      f"-P {bmc_info_dict[IpmiFencingAgent.IPMI_AUTH_KEY]} chassis power off")
        except Exception as e:
            raise Exception(f"Failed to run IPMItool Command. Error : {e}")

    def power_on(self, node_id: str):
        """
        Power ON node with nodeid

        Args:
            node_id (str): Node ID from cluster nodes.
        """
        try:
            bmc_info = self._confstore.get(f"{IpmiFencingAgent.NODE_BMC_INFO_KEY}/node/{node_id}")
            if bmc_info is not None:
                _, value = bmc_info.popitem()
                bmc_info_dict = ast.literal_eval(value)
                self._execute.run_cmd(f"ipmitool -I lanplus -H {bmc_info_dict[IpmiFencingAgent.IPMI_IPADDR]} "
                                      f"-U {bmc_info_dict[IpmiFencingAgent.IPMI_USER]} "
                                      f"-P {bmc_info_dict[IpmiFencingAgent.IPMI_AUTH_KEY]} chassis power on")
        except Exception as e:
            raise Exception(f"Failed to run IPMItool Command. Error : {e}")

    def power_status(self, node_id: str) -> str:
        """
        Get power status of node with nodeid

        Args:
            node_id (str): Node ID from cluster nodes.
        """
        try:
            bmc_info = self._confstore.get(f"{IpmiFencingAgent.NODE_BMC_INFO_KEY}/node/{node_id}")
            if bmc_info is not None:
                _, value = bmc_info.popitem()
                bmc_info_dict = ast.literal_eval(value)
                _output, _err, _rc = self._execute.run_cmd(f"ipmitool -I lanplus -H {bmc_info_dict[IpmiFencingAgent.IPMI_IPADDR]} "
                                         f"-U {bmc_info_dict[IpmiFencingAgent.IPMI_USER]} "
                                         f"-P {bmc_info_dict[IpmiFencingAgent.IPMI_AUTH_KEY]} chassis power status")
                if _rc != 0:
                    raise Exception(f"Failed to run IPMItool Command. Error : {_err}")
                if const.SERVER_POWER_STATUS.ON.value in _output.lower():
                    return const.SERVER_POWER_STATUS.ON.value
                elif const.SERVER_POWER_STATUS.OFF.value in _output.lower():
                    return const.SERVER_POWER_STATUS.OFF.value
                else:
                    return const.SERVER_POWER_STATUS.UNKNOWN.value
        except Exception as e:
            raise Exception(f"Failed to run IPMItool Command. Error : {e}")

    def setup_ipmi_credentials(self, ipmi_ipaddr: str, ipmi_user: str, ipmi_password: str, node_name: str):
        """
        Get the BMC credentials & store them in confstore

        """
        bmc_info_keys = {IpmiFencingAgent.IPMI_IPADDR: ipmi_ipaddr, IpmiFencingAgent.IPMI_USER: ipmi_user,
                            IpmiFencingAgent.IPMI_AUTH_KEY: ipmi_password}
        if not self._confstore.key_exists(f"{IpmiFencingAgent.NODE_BMC_INFO_KEY}/node/{node_name}"):
            self._confstore.set(f"{IpmiFencingAgent.NODE_BMC_INFO_KEY}/node/{node_name}", json.dumps(bmc_info_keys))
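A hedged usage sketch for the fencing agent above; the node name and BMC credentials are placeholders, and a reachable confstore plus ipmitool on the host are assumed:

agent = IpmiFencingAgent()
# Store BMC details for a hypothetical node once, then drive power state over IPMI.
agent.setup_ipmi_credentials(ipmi_ipaddr="10.230.244.10", ipmi_user="admin",
                             ipmi_password="secret", node_name="srvnode-1")
if agent.power_status("srvnode-1") == const.SERVER_POWER_STATUS.ON.value:
    agent.power_off("srvnode-1")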
Example no. 13
class PcsController(ElementController):
    """ Generic Controller for Pcs to execute common pcs command """
    def __init__(self):
        """
        Initialize pcs controller
        """
        super(PcsController, self).__init__()
        self._execute = SimpleCommand()

    def _check_non_empty(self, **kwargs):
        """
        Check if params are not empty.

        Raises:
            ClusterManagerError: [description]
        """
        for key in kwargs.keys():
            if kwargs[key] is None or kwargs[key] == "":
                raise ClusterManagerError(
                    f"Failed: Invalid parameter, {key} cannot be empty.")

    @staticmethod
    def load_json_file(json_file):
        """
        Load json file to read node & the cluster details to auth node
        :param json_file:
        """
        try:
            with open(json_file) as f:
                return json.load(f)
        except Exception as e:
            raise ClusterManagerError(
                f"Error in reading desc_file, reason : {e}")

    def heal_resource(self, node_id):
        """
        Heal the resources if any fail counts exist
        """
        count = 0
        resources_healed = False
        while True:
            if count >= const.RETRY_COUNT:
                break
            fail_count_exists = self.check_resource_failcount(node_id)
            if fail_count_exists:
                self.clean_failure_count(node_id)
            else:
                resources_healed = True
                break
            count += 1
            time.sleep(10)
        return resources_healed

    def check_resource_failcount(self, node_id) -> bool:
        """
        Resource fail count check
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_FAILCOUNT_STATUS,
                                                   check_error=False)
        if node_id in _output:
            return True
        else:
            return False

    def clean_failure_count(self, node_id):
        """
        Cleanup resources fail count
        """
        _output, _err, _rc = self._execute.run_cmd(
            const.PCS_NODE_CLEANUP.replace("<node>", node_id),
            check_error=False)

    def _get_cluster_size(self):
        """
        Return the cluster size based on the pcsd status output.
        """
        try:
            _output, _err, _rc = self._execute.run_cmd(
                const.PCS_CLUSTER_PCSD_STATUS)
            return len(_output.split("\n"))
        except Exception as e:
            raise ClusterManagerError(
                f"Unable to get cluster size. Reason: {e}")

    def _get_node_list(self) -> list:
        """
        Return list of nodes.
        """
        # TODO: This is a temporary implementation; it should be removed once the nodelist is available in system health.
        nodelist = []
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES,
                                                   check_error=False)

        if _rc != 0:
            raise ClusterManagerError("Failed to get nodes status")
        for status in _output.split("\n"):
            nodes = status.split(":")
            if len(nodes) > 1:
                nodelist.extend(nodes[1].split())
        return nodelist

    def nodes_status(self, nodeids: list = None) -> dict:
        """
        Get pcs status of nodes.
        Args:
            nodeids (list): List of Node IDs from cluster nodes.
                By default, provide the status of all nodes.
                If 'local', provide the local node status.

        Returns:
            ([dict]): Return dictionary. {"node_id1": "status of node_id1",
                                          "node_id2": "status of node_id2"...}
        """
        nodeids = self._get_node_list() if nodeids is None or len(nodeids) == 0 else nodeids
        all_nodes_status = dict()
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES,
                                                   check_error=False)
        if not isinstance(nodeids, list):
            raise ClusterManagerError(
                f"Invalid nodeids type `{type(nodeids)}`, required `list`")
        for nodeid in nodeids:
            if nodeid in _output:
                for status in _output.split("\n"):
                    nodes = status.split(":")
                    if len(nodes) > 1 and nodeid.lower() in nodes[1].strip().lower():
                        if nodes[0].strip().lower() == NODE_STATUSES.STANDBY.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.STANDBY_WITH_RESOURCES_RUNNING.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.MAINTENANCE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.MAINTENANCE.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.CLUSTER_OFFLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.CLUSTER_OFFLINE.value
                        elif nodes[0].strip().lower() == NODE_STATUSES.ONLINE.value:
                            all_nodes_status[nodeid] = NODE_STATUSES.ONLINE.value
                        break
                else:
                    all_nodes_status[nodeid] = NODE_STATUSES.UNKNOWN.value
            else:
                raise HAInvalidNode(f"Node {nodeid} is not a part of cluster")
        for node in all_nodes_status.keys():
            status = all_nodes_status[node]
            if status == NODE_STATUSES.CLUSTER_OFFLINE.value:
                _output, _err, _rc = self._execute.run_cmd(f"ping -c 1 {node}",
                                                           check_error=False)
                if _rc != 0:
                    all_nodes_status[node] = NODE_STATUSES.POWEROFF.value
        return all_nodes_status

    def _get_filtered_nodes(self, status: list) -> list:
        """
        Return the list of nodes matching the given statuses.

        Args:
            status (list): Status list of node

        Returns:
            list: List of node
        """
        nodelist = []
        node_status = self.nodes_status()
        for node in node_status.keys():
            if node_status[node] in status:
                nodelist.append(node)
        return nodelist
Example no. 14
class VipHealthMonitor(CortxServiceRA):
    """
    Check VIP health and fail over if unhealthy.
    """
    def __init__(self):
        """
        Initialize the class.
        """
        super(VipHealthMonitor, self).__init__()
        self._execute = SimpleCommand()

    @staticmethod
    def metadata() -> int:
        """
        Provide meta data for resource agent and parameter
        """
        env: str = r"""<?xml version="1.0"?>
        <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
        <resource-agent name="vip_health_check">
        <version>1.0</version>

        <longdesc lang="en">
            Check health of vip.
        </longdesc>
        <shortdesc lang="en">Health Check</shortdesc>
        <parameters>
        <parameter name="vip" required="1">
        <longdesc lang="en"> VIP that need monitor </longdesc>
        <shortdesc lang="en"> VIP </shortdesc>
        <content type="string"/>
        </parameter>
        <parameter name="nic" required="1">
        <longdesc lang="en"> NIC interface </longdesc>
        <shortdesc lang="en"> NIC interface </shortdesc>
        <content type="string"/>
        </parameter>
        </parameters>
        <actions>
        <action name="start"        timeout="3s" />
        <action name="stop"         timeout="3s" />
        <action name="monitor"      timeout="3s" interval="60s" depth="0" />
        <action name="meta-data"    timeout="4s" />
        </actions>
        </resource-agent>
        """
        sys.stdout.write(env)
        return const.OCF_SUCCESS

    def start(self) -> int:
        """
        Start monitoring.
        """
        rc: int = self.monitor()
        Log.info(f"start action on vip monitoring with rc: {rc}")
        return rc

    def stop(self) -> int:
        """
        Stop Vip.
        """
        return const.OCF_SUCCESS

    def monitor(self) -> int:
        """
        Monitor vip
        """
        res_param = self.get_env()
        vip: str = res_param["OCF_RESKEY_vip"]
        nic: str = res_param["OCF_RESKEY_nic"]
        output, error, rc = self._execute.run_cmd(f"ip a s {nic}")
        if rc != 0:
            Log.error(f"Failed to get ip address for {nic} with error {error}")
            return const.OCF_ERR_GENERIC
        status_str = output.split("\n")[0].split(" ")
        status = status_str[status_str.index("state") + 1]
        if status != "UP":
            Log.error(f"VIP Health failed, {nic} is down")
            return const.OCF_ERR_GENERIC
        ip_list = []
        for line in output.split("\n"):
            if len(line.split()) > 0 and "inet" in line.split(
            )[0] and vip not in line.split():
                ip = line.split()[1].split("/")[0]
                ip_list.append(str(ip))
        if len(ip_list) < 1:
            Log.error(f"VIP Health failed, {nic} is down")
            return const.OCF_ERR_GENERIC
        return const.OCF_SUCCESS
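monitor() above assumes the first line of ip a s <nic> output contains a 'state' token followed by the link state, and that 'inet' lines carry addresses. A small sketch of the state extraction against a canned first line (real iproute2 output has more fields and may differ by version):

# Hypothetical first line of ip a s eth1 output, as consumed by the state check in monitor().
first_line = "3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000"
tokens = first_line.split(" ")
state = tokens[tokens.index("state") + 1]
print(state)  # UP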
Example no. 15
class Cmd:
    """
    Setup Command. This class provides methods for parsing arguments.
    """
    _index = "conf"
    DEV_CHECK = False
    LOCAL_CHECK = False
    PROV_CONFSTORE = "provisioner"
    HA_CONFSTORE = "confstore"

    def __init__(self, args: dict):
        """
        Init method.
        """
        self._url = args.config
        Conf.load(self._index, self._url)
        self._args = args.args
        self._execute = SimpleCommand()
        self._confstore = ConfigManager._get_confstore()
        self._cluster_manager = None

    @property
    def args(self) -> str:
        return self._args

    @property
    def url(self) -> str:
        return self._url

    @staticmethod
    def usage(prog: str):
        """
        Print usage instructions
        """
        sys.stderr.write(
            f"usage: {prog} [-h] <cmd> <--config url> <args>...\n"
            f"where:\n"
            f"cmd   post_install, prepare, config, init, test, upgrade, reset, cleanup, backup, restore\n"
            f"--config   Config URL")

    @staticmethod
    def get_command(desc: str, argv: dict):
        """
        Return the Command after parsing the command line.
        """
        parser = argparse.ArgumentParser(desc)
        subparsers = parser.add_subparsers()
        cmds = inspect.getmembers(sys.modules[__name__])
        cmds = [(x, y) for x, y in cmds if x.endswith("Cmd") and x != "Cmd"]
        for name, cmd in cmds:
            cmd.add_args(subparsers, cmd, name)
        args = parser.parse_args(argv)
        Cmd.DEV_CHECK = args.dev
        Cmd.LOCAL_CHECK = args.local
        return args.command(args)

    @staticmethod
    def add_args(parser: str, cls: str, name: str):
        """
        Add Command args for parsing.
        """
        setup_arg_parser = parser.add_parser(cls.name, help='setup %s' % name)
        setup_arg_parser.add_argument('--config', help='Config URL')
        setup_arg_parser.add_argument('--dev',
                                      action='store_true',
                                      help='Dev check')
        setup_arg_parser.add_argument('--local',
                                      action='store_true',
                                      help='Local check')
        setup_arg_parser.add_argument('args',
                                      nargs='*',
                                      default=[],
                                      help='args')
        setup_arg_parser.set_defaults(command=cls)

    @staticmethod
    def remove_file(file: str):
        """
        Check if file exist and delete existing file.

        Args:
            file ([str]): File or Dir name to be deleted.
        """
        if os.path.exists(file):
            if os.path.isfile(file):
                os.remove(file)
            elif os.path.isdir(file):
                shutil.rmtree(file)
            else:
                raise SetupError(
                    f"{file} is neither a file nor a directory, cannot be deleted.")

    @staticmethod
    def copy_file(source: str, dest: str):
        """
        Copy file from source to destination.

        Args:
            source (str): Source file path.
            dest (str): Destination file path.
        """
        Cmd.remove_file(dest)
        shutil.copyfile(source, dest)

    def get_machine_id(self):
        command = "cat /etc/machine-id"
        machine_id, err, rc = self._execute.run_cmd(command, check_error=True)
        Log.info(
            f"Read machine-id. Output: {machine_id}, Err: {err}, RC: {rc}")
        return machine_id.strip()

    def get_node_name(self):
        machine_id = self.get_machine_id()
        node_name = Conf.get(
            self._index, f"server_node.{machine_id}.network.data.private_fqdn")
        Log.info(f"Read node name: {node_name}")
        return node_name

    def get_nodelist(self, fetch_from: str = None) -> list:
        """
        Get nodelist from provisioner or confstore

        Args:
            fetch_from (str): Options from where to fetch.

        Returns:
            list: List of nodes.
        """
        nodelist: list = []
        fetch_from = Cmd.HA_CONFSTORE if fetch_from is None else fetch_from
        if fetch_from == Cmd.HA_CONFSTORE:
            cluster_nodes = self._confstore.get(
                const.CLUSTER_CONFSTORE_NODES_KEY)
            if cluster_nodes is None:
                return nodelist
            for key in cluster_nodes:
                nodelist.append(key.split('/')[-1])
        elif fetch_from == Cmd.PROV_CONFSTORE:
            nodes_schema = Conf.get(self._index, "server_node")
            machine_ids: list = list(nodes_schema.keys())
            for machine in machine_ids:
                nodelist.append(
                    Conf.get(
                        self._index,
                        f"server_node.{machine}.network.data.private_fqdn"))
        else:
            raise SetupError(
                f"Failed to get nodelist, Invalid options {fetch_from}")
        Log.info(
            f"Found total Nodes: {len(nodelist)}, Nodes: {nodelist}, in {fetch_from}"
        )
        return nodelist

    def get_installation_type(self):
        hw_type = ConfigManager.get_hw_env()
        if hw_type is not None:
            install_type = hw_type.lower()
        else:
            Log.error("Error: Can not fetch h/w env from Config.")
            raise HaConfigException("h/w env not present in config.")

        nodes = self.get_nodelist(fetch_from=Cmd.HA_CONFSTORE)
        if len(nodes) == 1 and install_type == const.INSTALLATION_TYPE.VM:
            install_type = const.INSTALLATION_TYPE.SINGLE_VM

        Log.info(f"Nodes count = {len(nodes)}, Install type = {install_type}")

        return install_type

    def standby_node(self, node_name: str) -> None:
        """
        Put node in standby

        Args:
            node_name (str): Node name.
        """
        standby_output: str = self._cluster_manager.node_controller.standby(
            node_name)
        Log.info(f"Put node in standby output: {standby_output}")
        if json.loads(standby_output).get("status") == STATUSES.FAILED.value:
            raise HaConfigException(
                f"Failed to put cluster in standby mode. Error: {standby_output}"
            )

    @staticmethod
    def get_s3_instance(machine_id: str) -> int:
        """
        Return s3 instance

        Raises:
            HaConfigException: Raise exception for invalid s3 count.

        Returns:
            [int]: Return s3 count.
        """
        try:
            s3_instances = Conf.get(Cmd._index,
                                    f"server_node.{machine_id}.s3_instances")
            if int(s3_instances) < 1:
                raise HaConfigException(
                    f"Found {s3_instances} which is invalid s3 instance count."
                )
            return int(s3_instances)
        except Exception as e:
            Log.error(
                f"Found {s3_instances} which is invalid s3 instance count. Error: {e}"
            )
            raise HaConfigException(
                f"Found {s3_instances} which is invalid s3 instance count.")
Example no. 16
class DynamicFidServiceRA(CortxServiceRA):
    """
    This class provides a wrapper around the systemd resource agent.
    It manages the fid mapping to (serviceName, NodeId, InstanceId).
    Resource name: (Used to get instance)
        like motr-ios-1, s3server-1, ...
        Here, resource name is used to get instance.
    Service name: (Used for systemd service)
        like s3server, m0d, ...
    Fid service name: (Used for service name in mapping)
        like s3service, ios, confd
    """
    def __init__(self):
        """
        Initialize DynamicFidServiceRA class.
        """
        super(DynamicFidServiceRA, self).__init__()
        self._execute = SimpleCommand()
        self._status_list: list = ["failed", "active", "unknown"]

    def _get_systemd_service(self) -> str:
        """
        Get Service name.

        Returns:
            str: Service name with fid mapping like service@fid
        """
        res_param = self.get_env()
        service: str = res_param["OCF_RESKEY_service"]
        fid_service_name: str = res_param["OCF_RESKEY_fid_service_name"]
        local_node: str = res_param["OCF_RESKEY_CRM_meta_on_node"]
        resource: str = res_param["OCF_RESOURCE_INSTANCE"]
        resource_instance: str = str(resource.split('-')[-1])
        if not resource_instance.isdigit():
            Log.error(f"Invalid resource name {resource}, missing fid")
            sys.exit(const.OCF_ERR_CONFIGURED)
        instance_id: int = int(resource_instance)
        fid = FidManager.get_fid(fid_service_name, local_node, instance_id)
        if fid is None:
            Log.error(f"Invalid config for fid for resource {resource}")
            sys.exit(const.OCF_ERR_CONFIGURED)
        return f"{service}@{fid}"

    def _get_service_status(self, service: str) -> str:
        """
        Monitor service and provide status.
        Command to monitor:
            $ systemctl is-active <service_name>
        Args:
            service (str): Service name

        Returns:
            str: Return service status.
                Service status are one of failed, active, unknown, activating
        """
        output, _, _ = self._execute.run_cmd(f"systemctl is-active {service}",
                                             check_error=False)
        return output

    def metadata(self) -> int:
        """
        Provide meta data for resource agent and parameter
        """
        env: str = r"""<?xml version="1.0"?>
        <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
        <resource-agent name="dynamic_fid_service_ra">
        <version>1.0</version>

        <longdesc lang="en">
        This is a resource agent, a wrapper around a systemd service.
        It maps a clone to a service instance on a node.
        Note:
        Only a valid resource name with format 'resource_name'-'instance-id' is accepted.
        Here instance-id always starts with 1, which is mapped to the fid of the service.
        Example: s3server-2
        </longdesc>
        <shortdesc lang="en">Systemd wrapper agent</shortdesc>
        <parameters>
        <parameter name="service" required="0">
        <longdesc lang="en"> Service name to manage systemd </longdesc>
        <shortdesc lang="en"> Systemd service </shortdesc>
        <content type="string"/>
        </parameter>
        <parameter name="fid_service_name" required="0">
        <longdesc lang="en"> Fid service name used in mapping </longdesc>
        <shortdesc lang="en"> Systemd service </shortdesc>
        <content type="string"/>
        </parameter>
        </parameters>
        <actions>
        <action name="start"        timeout="40s" />
        <action name="stop"         timeout="40s" />
        <action name="monitor"      timeout="40s" interval="60s" depth="0" />
        <action name="meta-data"    timeout="4s" />
        </actions>
        </resource-agent>
        """
        sys.stdout.write(env)
        return const.OCF_SUCCESS

    def start(self) -> int:
        """
        Start service and provide output.

        Command to start service:
            $ systemctl reset-failed service
            $ systemctl start service

        Returns:
            int: Return as per service status.
                active: return const.OCF_SUCCESS.
                unknown: Wait till timeout.
                failed or timeout will cause failover or moved to Stopped state.
        """
        service = self._get_systemd_service()
        Log.debug(f"Start: Start {service} service")
        self._execute.run_cmd(f"systemctl reset-failed {service}",
                              check_error=False)
        self._execute.run_cmd(f"systemctl start {service}", check_error=False)
        while True:
            Log.debug(f"Start: Starting {service} service")
            status: str = self._get_service_status(service).strip()
            if status == "active":
                break
            elif status == "failed":
                Log.info(
                    f"Start: Failed to start {service} and may cause failover or Stop."
                )
                return const.OCF_ERR_GENERIC
            else:
                time.sleep(1)
                continue
        Log.info(f"Start: Started {service} service")
        return const.OCF_SUCCESS

    def stop(self) -> int:
        """
        Stop service. If stop failed it will cause stonith.

        Returns:
            int: Return as per service status.
                if unknown or failed then return success.
                timeout of stop will cause stonith.
        """
        service = self._get_systemd_service()
        Log.debug(f"Stop: Stopping {service} service")
        self._execute.run_cmd(f"systemctl stop {service}", check_error=False)
        while True:
            status: str = self._get_service_status(service).strip()
            time.sleep(1)
            if status in ["failed", "unknown"]:
                break
        Log.info(f"Stop: Stopped {service} service")
        return const.OCF_SUCCESS

    def monitor(self) -> int:
        """
        Monitor the service with the help of pacemaker and return the result.

        Args:
            service_name (str): Systemd service name. Defaults to None.

        Returns:
            int: Return service status to pacemaker.
                const.OCF_NOT_RUNNING: Service not running.
                const.OCF_ERR_GENERIC: Service is failed.
                const.OCF_SUCCESS: Service is running.
                Monitor timeout will cause restart.
        """
        service: str = self._get_systemd_service()
        Log.debug(f"Monitor: Monitoring of service: {service}")
        while True:
            status: str = self._get_service_status(service).strip()
            if status == "active":
                break
            elif status == "failed":
                Log.debug(f"Monitor: failed to monitor {service}")
                return const.OCF_ERR_GENERIC
            elif status == "unknown":
                Log.debug(f"Monitor: Service {service} is not started yet...")
                return const.OCF_NOT_RUNNING
            else:
                # wait if there is unstable status like activating, deactivating
                time.sleep(1)
                continue
        return const.OCF_SUCCESS
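Pacemaker passes resource parameters to an OCF agent through OCF_RESKEY_* environment variables, which is presumably what get_env() (not shown in this snippet) collects for _get_systemd_service and monitor. A minimal sketch under that assumption:

import os

def get_env() -> dict:
    """Hypothetical helper: collect the OCF_* variables pacemaker exports for the agent."""
    return {key: value for key, value in os.environ.items() if key.startswith("OCF_")}

# e.g. res_param["OCF_RESKEY_service"] -> "m0d", res_param["OCF_RESOURCE_INSTANCE"] -> "motr-ios-1"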
Example no. 17
class PcsClusterManager(ClusterManager):
    def __init__(self):
        """
        PcsClusterManager manages the pacemaker/corosync cluster
        """
        super(PcsClusterManager, self).__init__()
        self._execute = SimpleCommand()

        # get version from ha.conf
        self._version = ConfigManager.get_major_version()

        if self._version == const.CORTX_VERSION_1:
            self._decision_monitor = DecisionMonitor()
            # TODO: add node_manager class to handle query
            self._refresh_contex = PcsRefreshContex(self._decision_monitor)
            # TODO move node logic to node manager class
            self._node_status = [
                'Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected'
            ]

    def process_request(self, action, args, output):
        """
        Generic method to handle process request

        Args:
            action ([string]): Take cluster action for each request.
            args ([dict]): Parameter pass to request to process.
        """
        self._output = output
        # TODO Add validator
        if action == const.CLUSTER_COMMAND:
            if args.cluster_action in ["add_node", "remove_node"]:
                getattr(self, args.cluster_action)(args.node)
            else:
                getattr(self, args.cluster_action)()
        elif action == const.NODE_COMMAND and self._version == const.CORTX_VERSION_1:
            self._refresh_contex.process_request(action, args)
        elif action == const.BUNDLE_COMMAND:
            HABundle().process_request(action, args, output)
        else:
            raise HAUnimplemented("This feature is not supported...")

    def node_status(self, node):
        """
        Check node status
        If node not detected return rc as 1 else 0
        Node status:
         Online:
         Standby:
         Maintenance:
         Offline:
        """
        Log.debug(f"Check {node} node status")
        # TODO: check if node is valid
        # TODO move node logic to node manager class
        _output, _err, _rc = self._execute.run_cmd("pcs status nodes")
        for status in _output.split("\n"):
            if node in status.split():
                node_rc = 0
                node_status = (status.split()[0])[:-1]
                Log.debug(
                    f"For {node} node rc: {node_rc}, status: {node_status}")
                return node_rc, node_status
        Log.debug(
            f"{node} is not detected in cluster, treating as disconnected node"
        )
        return 1, const.NODE_DISCONNECTED

    def remove_node(self, node):
        """
        Remove node from pcs cluster
        """
        # TODO: Limitation of node remove (a node cannot remove itself from the cluster)
        # Check if node already removed
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output before remove node: {_output}, {_err}, {_rc}"
        )
        _rc, status = self.node_status(node)
        if _rc != 1:
            self._execute.run_cmd(f"pcs cluster node remove {node} --force")
            _rc, status = self.node_status(node)
            Log.debug(f"For node {node} status: {status}, rc: {_rc}")
            if _rc != 1:
                Log.error(f"Failed to remove {node}")
                raise Exception(f"Failed to remove {node}")
            else:
                Log.info(f"Node {node} removed from cluster")
        else:
            Log.info(f"Node {node} already removed from cluster")
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output after remove node: {_output}, {_err}, {_rc}"
        )

    def add_node(self, node):
        """
        Add new node to pcs cluster
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output before add node: {_output}, {_err}, {_rc}")
        # TODO: Limitation of node add (a node cannot add itself to the cluster)
        commands = [
            f"pcs cluster node add {node}", f"pcs cluster enable {node}",
            f"pcs cluster start {node}", "pcs resource cleanup --all"
        ]
        _rc, status = self.node_status(node)
        if _rc != 0:
            for command in commands:
                _output, _err, _rc = self._execute.run_cmd(command)
                Log.info(f"{command} : {_output}, {_err}, {_rc}")
                time.sleep(5)
            retries = 0
            add_node_flag = -1
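            # add_node_flag tracks the outcome of the retry loop below:
            #   -1 -> node never left the Disconnected state (treated as failure)
            #    0 -> node reported Online
            #    1 -> node joined the cluster but is not Online yet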
            while retries < 12:
                _rc, status = self.node_status(node)
                Log.info(f"{node} status rc: {_rc}, status: {status}")
                if status == const.NODE_ONLINE:
                    Log.info(f"Node {node} added to cluster")
                    add_node_flag = 0
                    break
                elif status != const.NODE_DISCONNECTED:
                    add_node_flag = 1
                    Log.info(
                        f"Node {node} added to cluster but not Online yet; checking status again."
                    )
                retries += 1
                time.sleep(10)
            if add_node_flag == 1:
                Log.info(
                    f"Node {node} added to cluster but not in Online state.")
            elif add_node_flag == 0:
                Log.info(
                    f"Node {node} Successfully added to cluster and in Online state"
                )
            else:
                raise Exception(f"Failed to add {node} to cluster")
        else:
            Log.info(f"Node {node} already added to cluster")
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output after add node: {_output}, {_err}, {_rc}")

    def get_nodes_status(self):
        """
        Parse the const.PCS_STATUS_NODES output and set the self.active_nodes,
        self.standby_nodes and self.offline_nodes flags. Sample output:

        Pacemaker Nodes:
         Online: node1 node2
         Standby:
         Standby with resource(s) running:
         Maintenance:
         Offline:
        Pacemaker Remote Nodes:
         Online:
         Standby:
         Standby with resource(s) running:
         Maintenance:
         Offline:
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS_NODES,
                                                   check_error=False)

        self.active_nodes = self.standby_nodes = self.offline_nodes = False

        for status in _output.split("\n"):
            nodes = status.split(":", 1)
            # This break should be removed if pacemaker remote is also used in the cluster
            if nodes[0] == "Pacemaker Remote Nodes":
                break
            elif nodes[0] == " Online" and len(nodes[1].split()) > 0:
                self.active_nodes = True
            elif nodes[0] == " Standby" and len(nodes[1].split()) > 0:
                self.standby_nodes = True
            elif nodes[0] == " Standby with resource(s) running" and len(
                    nodes[1].split()) > 0:
                self.active_nodes = True
            elif nodes[0] == " Maintenance" and len(nodes[1].split()) > 0:
                self.active_nodes = True
            elif nodes[0] == "  Offline" and len(nodes[1].split()) > 0:
                self.offline_nodes = True

    def start(self):
        """
        Start the cluster: start pcs if it is not already running, bring
        standby nodes out of standby, and wait for at least one node to
        come online.
        """
        Log.debug("Executing cortxha cluster start")

        _output, _err, _rc = self._execute.run_cmd(const.PCS_CLUSTER_STATUS,
                                                   check_error=False)
        if _rc != 0:
            if (_err.find("No such file or directory: 'pcs'") != -1):
                Log.error("Cluster failed to start; pcs not installed")
                self._output.output(
                    "Cluster failed to start; pcs not installed")
                self._output.rc(1)
                raise Exception("Cluster failed to start; pcs not installed")
            # If the cluster is not running, start it
            elif (_err.find("cluster is not currently running on this node") !=
                  -1):
                self._execute.run_cmd(const.PCS_CLUSTER_START,
                                      check_error=False)
                Log.info("cluster started ; waiting for nodes to come online ")
                # It takes nodes 30 seconds to come to their original state after cluster is started
                # observation on a 2 node cluster
                # wait for upto 100 sec for nodes to come to active states (online / maintenance mode)
                time.sleep(10)
                self.get_nodes_status()
                retries = 18
                while not self.active_nodes and retries > 0:
                    time.sleep(5)
                    self.get_nodes_status()
                    retries -= 1

        else:
            # If the cluster is already running but all nodes are in Standby mode,
            # bring the nodes out of standby.
            self.get_nodes_status()
            if not self.active_nodes and self.standby_nodes:
                # issue pcs cluster unstandby
                _output, _err, _rc = self._execute.run_cmd(
                    const.PCS_CLUSTER_UNSTANDBY, check_error=False)

        # check cluster and node status
        _output, _err, _rc = self._execute.run_cmd(const.PCS_CLUSTER_STATUS,
                                                   check_error=False)
        if _rc != 0:
            # cluster could not be started.
            Log.error("Cluster failed to start")
            self._output.output("Cluster failed to start")
            self._output.rc(1)
            raise Exception("Cluster failed to start")
        else:
            # confirm that at least one node is active
            self.get_nodes_status()
            if not self.active_nodes:
                # wait for 5 seconds and retry
                time.sleep(5)
                self.get_nodes_status()
                if not self.active_nodes:
                    self._output.output("Cluster started; nodes not online")
                    self._output.rc(1)
                    raise Exception("Cluster started; nodes not online")

        Log.info("Cluster started successfully")

    def stop(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented("This feature is not supported...")

    def status(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented("This feature is not supported...")

    def shutdown(self):
        raise HAUnimplemented("This feature is not supported...")
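

# A small, self-contained sketch (not part of the class above) of the flag logic
# used by get_nodes_status(), exercised against the sample `pcs status nodes`
# output quoted in its docstring. The helper name parse_pcs_nodes_status and the
# returned dict are illustrative assumptions; the class itself sets instance
# attributes instead of returning a value.
def parse_pcs_nodes_status(output: str) -> dict:
    """Report which node categories are populated in `pcs status nodes` output."""
    flags = {"active": False, "standby": False, "offline": False}
    for line in output.split("\n"):
        fields = line.split(":", 1)
        if fields[0] == "Pacemaker Remote Nodes":
            # Remote nodes are ignored, mirroring the break in get_nodes_status().
            break
        if len(fields) < 2 or not fields[1].split():
            continue
        if fields[0] in (" Online", " Standby with resource(s) running", " Maintenance"):
            flags["active"] = True
        elif fields[0] == " Standby":
            flags["standby"] = True
        elif fields[0] == " Offline":
            flags["offline"] = True
    return flags


if __name__ == "__main__":
    sample = (
        "Pacemaker Nodes:\n"
        " Online: node1 node2\n"
        " Standby:\n"
        " Standby with resource(s) running:\n"
        " Maintenance:\n"
        " Offline:\n"
        "Pacemaker Remote Nodes:\n"
        " Online:\n"
    )
    # Expected: {'active': True, 'standby': False, 'offline': False}
    print(parse_pcs_nodes_status(sample))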


class PcsClusterManager(ClusterManager):
    def __init__(self):
        """
        PcsClusterManager manages a Pacemaker/Corosync cluster.
        """
        super(PcsClusterManager, self).__init__()
        self._execute = SimpleCommand()
        self._decision_monitor = DecisionMonitor()
        # TODO: add node_manager class to handle query
        self._refresh_contex = PcsRefreshContex(self._decision_monitor)
        # TODO move node logic to node manager class
        self._node_status = [
            'Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected'
        ]

    def process_request(self, action, args, output):
        """
        Generic method to handle process request

        Args:
            action ([string]): Take cluster action for each request.
            args ([dict]): Parameter pass to request to process.
        """
        # TODO Add validator
        if action == const.CLUSTER_COMMAND:
            if args.cluster_action in ["add_node", "remove_node"]:
                getattr(self, args.cluster_action)(args.node)
            else:
                getattr(self, args.cluster_action)()
        elif action == const.NODE_COMMAND:
            self._refresh_contex.process_request(action, args)
        elif action == const.BUNDLE_COMMAND:
            HABundle().process_request(action, args, output)
        else:
            raise HAUnimplemented("This feature is not supported...")

    def node_status(self, node):
        """
        Check node status
        If node not detected return rc as 1 else 0
        Node status:
         Online:
         Standby:
         Maintenance:
         Offline:
        """
        Log.debug(f"Check {node} node status")
        # TODO: check if node is valid
        # TODO move node logic to node manager class
        _output, _err, _rc = self._execute.run_cmd("pcs status nodes")
        for status in _output.split("\n"):
            if node in status.split():
                node_rc = 0
                node_status = (status.split()[0])[:-1]
                Log.debug(
                    f"For {node} node rc: {node_rc}, status: {node_status}")
                return node_rc, node_status
        Log.debug(
            f"{node} is not detected in cluster, treating as disconnected node"
        )
        return 1, const.NODE_DISCONNECTED

    def remove_node(self, node):
        """
        Remove node from pcs cluster
        """
        # TODO: Limitation of node remove (a node cannot remove itself from the cluster)
        # Check if node already removed
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output before remove node: {_output}, {_err}, {_rc}"
        )
        _rc, status = self.node_status(node)
        if _rc != 1:
            self._execute.run_cmd(f"pcs cluster node remove {node} --force")
            _rc, status = self.node_status(node)
            Log.debug(f"For node {node} status: {status}, rc: {_rc}")
            if _rc != 1:
                Log.error(f"Failed to remove {node}")
                raise Exception(f"Failed to remove {node}")
            else:
                Log.info(f"Node {node} removed from cluster")
        else:
            Log.info(f"Node {node} already removed from cluster")
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output after remove node: {_output}, {_err}, {_rc}"
        )

    def add_node(self, node):
        """
        Add new node to pcs cluster
        """
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output before add node: {_output}, {_err}, {_rc}")
        # TODO: Limitation of node add (a node cannot add itself to the cluster)
        commands = [
            f"pcs cluster node add {node}", f"pcs cluster enable {node}",
            f"pcs cluster start {node}", "pcs resource cleanup --all"
        ]
        _rc, status = self.node_status(node)
        if _rc != 0:
            for command in commands:
                _output, _err, _rc = self._execute.run_cmd(command)
                Log.info(f"{command} : {_output}, {_err}, {_rc}")
                time.sleep(5)
            retries = 0
            add_node_flag = -1
            while retries < 12:
                _rc, status = self.node_status(node)
                Log.info(f"{node} status rc: {_rc}, status: {status}")
                if status == const.NODE_ONLINE:
                    Log.info(f"Node {node} added to cluster")
                    add_node_flag = 0
                    break
                elif status != const.NODE_DISCONNECTED:
                    add_node_flag = 1
                    Log.info(
                        f"Node {node} added to cluster but not Online yet; checking status again."
                    )
                retries += 1
                time.sleep(10)
            if add_node_flag == 1:
                Log.info(
                    f"Node {node} added to cluster but not in Online state.")
            elif add_node_flag == 0:
                Log.info(
                    f"Node {node} Successfully added to cluster and in Online state"
                )
            else:
                raise Exception(f"Failed to add {node} to cluster")
        else:
            Log.info(f"Node {node} already added to cluster")
        _output, _err, _rc = self._execute.run_cmd(const.PCS_STATUS)
        Log.info(
            f"Cluster status output after add node: {_output}, {_err}, {_rc}")

    def start(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented("This feature is not supported...")

    def stop(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented("This feature is not supported...")

    def status(self):
        # TODO Add wrapper to hctl pcswrap
        raise HAUnimplemented("This feature is not supported...")

    def shutdown(self):
        raise HAUnimplemented("This feature is not supported...")
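

# Hedged usage sketch (an assumption, not taken from the original sources):
# process_request() above reads args.cluster_action and args.node, so any object
# exposing those attributes can drive it, e.g. an argparse.Namespace. The
# add_node_request() name below is hypothetical; const.CLUSTER_COMMAND is the
# constant already referenced by process_request().
from argparse import Namespace


def add_node_request(manager, output, node_name: str):
    """Illustrative wrapper: ask a PcsClusterManager to add a node to the cluster."""
    request = Namespace(cluster_action="add_node", node=node_name)
    manager.process_request(const.CLUSTER_COMMAND, request, output)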