Example #1
 def get_machine_id():
     execute = SimpleCommand()
     command = "cat /etc/machine-id"
     machine_id, err, rc = execute.run_cmd(command, check_error=True)
     Log.info(
         f"Read machine-id. Output: {machine_id}, Err: {err}, RC: {rc}")
     return machine_id.strip()
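Every example in this listing relies on the same SimpleCommand.run_cmd contract: it returns an (output, error, return-code) tuple and optionally raises on a non-zero exit. A minimal sketch of such a wrapper, assuming a plain subprocess-based implementation (the real class in the source tree may differ):

import shlex
import subprocess

class SimpleCommand:
    """Hypothetical sketch of the command wrapper used throughout these examples."""

    def run_cmd(self, cmd: str, check_error: bool = True):
        """Run a shell command and return (stdout, stderr, return code)."""
        process = subprocess.Popen(shlex.split(cmd),
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        output, err = process.communicate()
        rc = process.returncode
        if check_error and rc != 0:
            raise Exception(f"Command '{cmd}' failed: rc={rc}, err={err.decode()}")
        return output.decode(), err.decode(), rc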
Example #2
 def __init__(self):
     """
     Initialize DynamicFidServiceRA class.
     """
     super(DynamicFidServiceRA, self).__init__()
     self._execute = SimpleCommand()
     self._status_list: list = ["failed", "active", "unknown"]
Example #3
 def __init__(self):
     """
     Initialize pcs controller
     """
     super(PcsController, self).__init__()
     self._execute = SimpleCommand()
     self._confstore = ConfigManager.get_confstore()
Example #4
 def __init__(self):
     """
     Init IEM generator
     """
     self._execute = SimpleCommand()
     with open(IEM_SCHEMA, 'r') as iem_schema_file:
         self.iem_alert_data = json.load(iem_schema_file)
Example #5
 def __init__(self):
     """
     Initialize IPMI Fencing Agent class.
     """
     super(IpmiFencingAgent, self).__init__()
     self._confstore = ConfigManager.get_confstore()
     self._execute = SimpleCommand()
Example #6
 def del_attr(resource: str):
     try:
         executor = SimpleCommand()
         executor.run_cmd(f"attrd_updater -U 0 -n {resource}",
                          check_error=False)
     except Exception as e:
         Log.error(
             f"Problem in deleting attr - resource: {resource}, Error: {e}")
Example #7
 def __init__(self, args: dict):
     """
     Init method.
     """
     self._url = args.config
     Conf.load(self._index, self._url)
     self._args = args.args
     self._execute = SimpleCommand()
     self._confstore = ConfigManager._get_confstore()
     self._cluster_manager = None
Example #8
def check_cluster_health() -> None:
    """ Check cluster status and make sure cluster is healthy """
    # Check if the cluster is running
    _, _, rc = SimpleCommand().run_cmd(PCS_CLUSTER_STATUS, check_error=False)
    if rc != 0:
        raise UpgradeError("Cluster is not running on current node")
    output, _, _ = SimpleCommand().run_cmd(PCS_FAILCOUNT_STATUS)
    if "INFINITY" in output:
        raise UpgradeError(
            f"Cluster is not stable, some resource are not healthy. {output}")
Example #9
    def update_attr(scope: str, resource: str, instances_per_node: int, node_name: str = None):
        if scope == AttribScope.CLUSTER:
            node_name = None

        try:
            count: int = AttribUpdater.get_count(resource, instances_per_node, node_name)
            if count >= 0:
                attrib_name: str = resource + "-count"
                executor = SimpleCommand()
                executor.run_cmd(f"attrd_updater -U {count} -n {attrib_name}", check_error=False)
        except Exception as e:
            Log.error(f"Problem in updating attr - count of resource: {resource}. Error: {e}")
Example #10
def s3servers(cib_xml, instance, push=False):
    """Create resources that belong to s3server group and clone the group.

    S3 background consumer is ordered after s3server and co-located with it.
    """
    for i in range(1, int(instance) + 1):
        cmd_s3server = f"pcs -f {cib_xml} resource create s3server-{i} ocf:seagate:dynamic_fid_service_ra service=s3server fid_service_name=s3service --group io_group --force"
        SimpleCommand().run_cmd(cmd_s3server)
    cmd_s3bc = f"pcs -f {cib_xml} resource create s3backcons systemd:s3backgroundconsumer meta failure-timeout=300s --group io_group"
    SimpleCommand().run_cmd(cmd_s3bc)

    if push:
        cib_push(cib_xml)
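Several helpers in this listing finish by calling cib_push(cib_xml), which is not itself shown. A plausible sketch, assuming it simply hands the offline CIB file to pcs:

def cib_push(cib_xml):
    """Push an offline CIB file into the running cluster (hypothetical sketch)."""
    SimpleCommand().run_cmd(f"pcs cluster cib-push {cib_xml}")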
Example #11
    def get_count(resource: str, instances_per_node: int, node_name: str = None):
        count = 0
        try:
            executor = SimpleCommand()
            for instance in range(1, instances_per_node+1):
                service = resource + "-" + str(instance)
                out, _, _ = executor.run_cmd(f"attrd_updater -Q -A -n {service}", check_error=False)
                count += AttribUpdater._get_count_from_output(out, node_name)
        except Exception as e:
            Log.error(f"Problem in fetching attr. resource: {resource}. Error: {e}")
            count = -1

        return count
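_get_count_from_output is not part of this listing. A hedged sketch of what it presumably does, assuming attrd_updater -Q -A prints one name="..." host="..." value="..." line per node (the exact output format should be verified against your pacemaker version):

import re

def _get_count_from_output(out: str, node_name: str = None) -> int:
    """Sum attribute values across nodes; restrict to node_name when given."""
    count = 0
    for line in out.splitlines():
        # Assumed line shape: name="motr-ios-1" host="srvnode-1" value="1"
        match = re.search(r'host="([^"]+)"\s+value="(\d+)"', line)
        if match and (node_name is None or match.group(1) == node_name):
            count += int(match.group(2))
    return count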
Example #12
def uds(cib_xml, push=False):
    """Create uds resource and constraints."""
    cmd_uds = f"pcs -f {cib_xml} resource create uds systemd:uds op monitor interval=30s"
    SimpleCommand().run_cmd(cmd_uds)
    constraints = [
        f"pcs -f {cib_xml} constraint colocation add uds with csm-agent score=INFINITY",
        # According to EOS-9258, there is a bug which requires UDS to be started after csm_agent
        f"pcs -f {cib_xml} constraint order csm-agent then uds"
    ]
    for c in constraints:
        SimpleCommand().run_cmd(c)
    if push:
        cib_push(cib_xml)
Example #13
 def __init__(self):
     """
     PcsClusterManager: manage the pacemaker/corosync cluster.
     """
     super(PcsClusterManager, self).__init__()
     self._execute = SimpleCommand()
     self._decision_monitor = DecisionMonitor()
     # TODO: add node_manager class to handle query
     self._refresh_contex = PcsRefreshContex(self._decision_monitor)
     # TODO move node logic to node manager class
     self._node_status = [
         'Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected'
     ]
Example #14
def _get_resource_list() -> list:
    """Get list of resource"""
    resources: list = []
    # clear history before getting list of resource
    SimpleCommand().run_cmd(PCS_RESOURCE_REFRESH)
    output, _, _ = SimpleCommand().run_cmd(LIST_PCS_RESOURCES,
                                           check_error=False)
    if "NO resources" in output:
        return resources
    for resource in output.split("\n"):
        res = resource.split(":")[0]
        if res != "" and res not in resources:
            resources.append(res)
    return resources
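The split(":")[0] parsing above assumes each line of the LIST_PCS_RESOURCES output starts with the resource name followed by a colon, roughly like this (illustrative sample, not verbatim pcs output):

    stonith-c1: stonith:fence_ipmilan
    motr-ios-1: ocf:seagate:dynamic_fid_service_ra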
Example #15
def free_space_monitor(cib_xml, push=False):
    """Create free space monitor resource. 1 per cluster, no affinity."""
    cmd_fsm = f"pcs -f {cib_xml} resource create motr-free-space-mon systemd:motr-free-space-monitor op monitor interval=30s meta failure-timeout=300s"
    SimpleCommand().run_cmd(cmd_fsm)

    constraints = [
        f"pcs -f {cib_xml} constraint order motr-ios-1 then motr-free-space-mon",
        f"pcs -f {cib_xml} constraint colocation add motr-free-space-mon with motr-ios-1"
    ]
    for c in constraints:
        SimpleCommand().run_cmd(c)

    if push:
        cib_push(cib_xml)
Example #16
class NodeAlertMonitor(AlertMonitor):

    def __init__(self):
        """
        Init node alert monitor
        """
        super(NodeAlertMonitor, self).__init__()
        self.process = SimpleCommand()

    def _get_online_nodes(self):
        """
        Get list of online nodes ids.
        """
        online_nodes_xml = self.process.run_cmd(const.GET_ONLINE_NODES_CMD)
        # create element tree object
        root = ET.fromstring(online_nodes_xml[0])
        nodes_ids = []
        # iterate over the <nodes> elements
        for item in root.findall('nodes'):
            # iterate child elements of item
            for child in item:
                if child.attrib['online'] == 'true':
                    nodes_ids.append(child.attrib['id'])
        Log.info(f"List of online nodes ids in cluster in sorted ascending order: {sorted(nodes_ids)}")
        return sorted(nodes_ids)

    def _get_local_node(self):
        """
        Get Local node name and id.
        """
        local_node_id = self.process.run_cmd(const.GET_LOCAL_NODE_ID_CMD)
        local_node_name = self.process.run_cmd(const.GET_LOCAL_NODE_NAME_CMD)
        Log.info(f"Local node name: {local_node_name[0]} \n Local node id: {local_node_id[0]}")
        return local_node_id[0], local_node_name[0]

    def process_alert(self):
        Log.debug("Processing event for NodeAlertMonitor")
        # Environment variables are available in self.crm_env
        self.iem = IemGenerator()
        # Get online node ids from corosync.
        nodes_ids = self._get_online_nodes()
        local_node_id, local_node_name = self._get_local_node()
        # Generate and send IEM only through the highest online node in cluster.
        if nodes_ids[-1].strip() == local_node_id.strip():
            self.iem.generate_iem(self.crm_env["CRM_alert_node"], self.alert_event_module, self.alert_event_type)
            Log.info(f"Sent IEM alert from the node - name: {local_node_name}, id: {local_node_id}")
        else:
            Log.debug(
                f"This node does not have the highest id. Local node id: {local_node_id}, all nodes: {nodes_ids}.")
Example #17
def haproxy(cib_xml, push=False):
    """Create haproxy clone resource in pacemaker."""
    cmd_haproxy = f"pcs -f {cib_xml} resource create haproxy systemd:haproxy op monitor interval=30 --group io_group"
    SimpleCommand().run_cmd(cmd_haproxy)

    if push:
        cib_push(cib_xml)
Example #18
def s3auth(cib_xml, push=False):
    """Create S3 auth server clone resource in pacemaker."""
    cmd_s3auth = f"pcs -f {cib_xml} resource create s3auth systemd:s3authserver clone op monitor interval=30"
    SimpleCommand().run_cmd(cmd_s3auth)

    if push:
        cib_push(cib_xml)
Example #19
def cluster_create(cluster_name, nodelist, enable=True, put_standby=True):
    """
    Create cluster on given nodes. Enables and starts cluster if needed.

    Parameters:
        cluster_name    - name of the cluster to be created
        nodelist        - list of nodes on which to set up the cluster
        enable          - whether cluster service shall start on boot
        put_standby     - whether nodes in nodelist shall be put into standby mode

    Returns: None.

    Exceptions:
        ClusterCreateError: generic exception to catch all creation-related
        problems.
        ClusterSetupError: failure happened during setup operation.
    """
    nodes = " ".join(nodelist)
    cmd_setup = f"pcs cluster setup --start --name {cluster_name} {nodes}"
    cmd_standby = f"pcs node standby {nodes}"
    cmd_stonith = "pcs property set stonith-enabled=False"
    cmd_enable = f"pcs cluster enable {nodes}"

    cmdlist = [cmd_setup]
    if enable:
        cmdlist.append(cmd_enable)
    if put_standby:
        cmdlist.append(cmd_standby)
    cmdlist.append(cmd_stonith)
    try:
        for s in cmdlist:
            SimpleCommand().run_cmd(s)
    except Exception:
        raise ClusterSetupError("Failed to setup the cluster")
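A minimal usage sketch (the cluster and node names are hypothetical):

cluster_create("cortx_cluster", ["srvnode-1", "srvnode-2", "srvnode-3"],
               enable=True, put_standby=True)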
Example #20
def sspl(cib_xml, push=False):
    """Create sspl clone resource in pacemaker."""
    # Using sspl-ll service file according to the content of SSPL repo
    cmd_sspl = f"pcs -f {cib_xml} resource create sspl-ll systemd:sspl-ll clone op monitor interval=30"
    SimpleCommand().run_cmd(cmd_sspl)

    if push:
        cib_push(cib_xml)
Example #21
 def __init__(self, args: dict):
     """
     Init method.
     """
     self._url = args.config
     Conf.load(self._index, self._url)
     self._args = args.args
     self._execute = SimpleCommand()
Example #22
class IemGenerator:
    '''
    Module responsible for constructing an IEC and sending it to syslog.
    '''
    def __init__(self):
        """
        Init IEM generator
        """
        self._execute = SimpleCommand()
        with open(IEM_SCHEMA, 'r') as iem_schema_file:
            self.iem_alert_data = json.load(iem_schema_file)

    def generate_iem(self, node: str, module: str, event_type: str) -> None:
        '''
        Form an IEC based on values such as module (<node/resource>)
        and event_type (<lost/member> for a node scenario).

        IEC format:
        IEC:{severity}{source}{component}{module_id}{event_id}:{description}
            severity  : severity of the event
            source    : source (Hardware or Software) of the event
            component : component generating the IEM
            module_id : sub-component of the module that generated the IEM
            event_id  : unique identification of the event
                        (e.g. node lost or node became member)
        Ex:
            IEC:WS0080010001: node is down (node lost)
            IEC:IS0080010002: node is up (node is now member)

        Required parameters:
            node       : node name
            module     : module type (e.g. 'node' or 'resource')
            event_type : type of event based on module
                         (e.g. 'member'/'lost' when module is 'node')
        '''
        try:
            module_type = self.iem_alert_data.get(module)
            severity = module_type.get('severity').get(event_type)
            source = module_type.get('source')
            component = module_type.get('component')
            module_id = module_type.get('module')
            event_id = module_type.get('event').get(event_type).get('ID')
            desc = module_type.get('event').get(event_type).get('desc')
            description = re.sub(r"\$host", node, desc)
            description = re.sub(r"\$status", event_type, description)
            iec_string = f'"IEC:{severity}{source}{component}{module_id}{event_id}:{description}"'
            iec_command = ALERTS.logger_utility_iec_cmd + ' ' + iec_string
            Log.info(f'Sending an IEC: {iec_string} to syslog')

            _output, _err, _rc = self._execute.run_cmd(iec_command,
                                                       check_error=False)
            if _rc != 0 or _err:
                raise Exception(f'Failed to populate an IEC to syslog: {_err}')
        except KeyError as kerr:
            Log.error(f'Key error occurred while parsing the IEM data while '
                      f'generating an IEC for {module} for the event '
                      f'{event_type}: {kerr}')
        except Exception as err:
            Log.error(f'Problem occurred while generating an IEC for {module} '
                      f'for the event {event_type}: {err}')
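The lookups in generate_iem imply an IEM_SCHEMA shaped roughly as below. The field values are reverse-engineered from the docstring's sample codes (IEC:WS0080010001 / IEC:IS0080010002) and are assumptions, not the shipped schema:

{
    "node": {
        "severity": {"lost": "W", "member": "I"},
        "source": "S",
        "component": "008",
        "module": "001",
        "event": {
            "lost":   {"ID": "0001", "desc": "$host is down ($status)"},
            "member": {"ID": "0002", "desc": "$host is up ($status)"}
        }
    }
}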
Example #23
    def __init__(self):
        """
        PcsClusterManager: manage the pacemaker/corosync cluster.
        """
        super(PcsClusterManager, self).__init__()
        self._execute = SimpleCommand()

        # get version from ha.conf
        self._version = ConfigManager.get_major_version()

        if self._version == const.CORTX_VERSION_1:
            self._decision_monitor = DecisionMonitor()
            # TODO: add node_manager class to handle query
            self._refresh_contex = PcsRefreshContex(self._decision_monitor)
            # TODO move node logic to node manager class
            self._node_status = [
                'Online', 'Standby', 'Maintenance', 'Offline', 'Disconnected'
            ]
Example #24
def delete_resources() -> None:
    """
    Delete pacemaker resources.

    Exceptions:
        UpgradeError
    """
    try:
        resources = _get_resource_list()
        Log.info(f"Going to delete following resources: {resources}")
        for r in resources:
            Log.info(f"Deleting resource {r}")
            SimpleCommand().run_cmd(
                PCS_DELETE_RESOURCE.replace("<resource>", r))
        SimpleCommand().run_cmd(PCS_CLEANUP)
        Log.info("Wait 2 min till all resource deleted.")
        is_resource_deleted(120)
    except Exception as err:
        raise UpgradeError("Resource deletion failed")
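is_resource_deleted is not included in this listing; a plausible sketch that polls _get_resource_list until the CIB reports no resources or the timeout lapses:

import time

def is_resource_deleted(timeout: int) -> None:
    """Wait until no pacemaker resources remain; raise UpgradeError on timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not _get_resource_list():
            return
        time.sleep(5)
    raise UpgradeError("Timed out waiting for resources to be deleted")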
Example #25
    def _get_pcs_status(self):
        """
            Get status of the cluster using "pcs status --full xml" command.
        """
        self._initialize_node_health()

        error = None
        try:
            self._output, error, rc = SimpleCommand().run_cmd(PcsConstants.PCS_STATUS_XML)
        except Exception:
            Log.info("Failed to run pcs status on current node.")
            rc = 1
        Log.info(f"pcs status : rc = {rc}, error = {error}")

        if rc != 0:
            self._output = self._get_pcs_status_remote()

        if self._output is not None:
            self._output = ElementTree.fromstring(self._output)
Example #26
 def __init__(self, args: dict):
     """
     Init method.
     """
     if args is not None:
         self._url = args.config
         self._service = args.services
         Conf.load(self._index, self._url)
         self._args = args.args
     self._confstore = None
     self._execute = SimpleCommand()
Example #27
def mgmt_resources(cib_xml, push=False):
    """Create mandatory resources for mgmt stack."""
    kibana = f"pcs -f {cib_xml} resource create kibana systemd:kibana op monitor interval=30s"
    agent = f"pcs -f {cib_xml} resource create csm-agent systemd:csm_agent op monitor interval=30s"
    web = f"pcs -f {cib_xml} resource create csm-web systemd:csm_web op monitor interval=30s"

    for c in (kibana, agent, web):
        SimpleCommand().run_cmd(c)

    if push:
        cib_push(cib_xml)
Example #28
def mgmt_vip(cib_xml, vip, iface, cidr=24, push=False):
    """Create mgmt Virtual IP resource."""
    cmd = f"pcs -f {cib_xml} resource create mgmt-vip ocf:heartbeat:IPaddr2 \
ip={vip} cidr_netmask={cidr} nic={iface} iflabel=v1 \
op start   interval=0s timeout=60s \
op monitor interval=5s timeout=20s \
op stop    interval=0s timeout=60s"

    SimpleCommand().run_cmd(cmd)

    if push:
        cib_push(cib_xml)
Example #29
def mgmt_stack(cib_xml, mgmt_vip_cfg, with_uds=False, push=False):
    """Create Mgmt stack related resources.

    It also creates and defines the management group to support colocation
    and ordering requirements.
    """
    mgmt_resources(cib_xml)

    if with_uds:
        uds(cib_xml)

    cmd = f"pcs -f {cib_xml} resource group add management kibana csm-agent csm-web"
    SimpleCommand().run_cmd(cmd)

    if mgmt_vip_cfg:
        mgmt_vip(cib_xml, **mgmt_vip_cfg, push=False)
        cmd_group = f"pcs -f {cib_xml} resource group add management mgmt-vip --before kibana"
        SimpleCommand().run_cmd(cmd_group)

    if push:
        cib_push(cib_xml)
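A usage sketch tying the mgmt helpers together (the CIB path, VIP, and interface are hypothetical values):

cib_file = "/var/tmp/cortx_cib.xml"
mgmt_vip_cfg = {"vip": "192.168.1.100", "iface": "eth0", "cidr": 24}
mgmt_stack(cib_file, mgmt_vip_cfg, with_uds=True, push=True)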
Example #30
def _is_cluster_standby_on() -> None:
    '''Check if cluster is in standby mode. If not, make standby mode ON'''

    Log.info('Check cluster is in standby mode')
    value = SimpleCommand().run_cmd(CHECK_PCS_STANDBY_MODE)

    # The parsing below assumes the fourth whitespace-separated token of the
    # command output is "standby=<on|off>".
    standby_value = value[0].split(' ')[3].strip('\n').split('=')

    if standby_value[1].lower() != 'on':
        Log.warn('Cluster is not in standby mode.')
        Log.info('switching the cluster in standby mode for performing post upgrade routines')
        _switch_cluster_mode(PCS_CLUSTER_STANDBY)
    Log.info('#### All post-upgrade prerequisites are in place ####')
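_switch_cluster_mode is referenced but not shown; a minimal sketch, assuming it simply runs the supplied pcs command:

def _switch_cluster_mode(cluster_mode_cmd: str) -> None:
    """Run the pcs command that flips the cluster mode (hypothetical sketch)."""
    SimpleCommand().run_cmd(cluster_mode_cmd)
    Log.info(f'Switched cluster mode using: {cluster_mode_cmd}')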