Пример #1
0
def stop_target(ha_label):
    """
    Stop the high availability target.

    Asks Pacemaker (through ``pcs resource disable``) to stop the resource and
    waits for it to go down, repeating the whole sequence while the wait keeps
    reporting that the target has not stopped.

    :param ha_label: resource label of the target to stop
    :return: Value using simple return protocol: ``agent_result_ok`` once the
        wait succeeds, otherwise an ``agent_error`` carrying either the failing
        command's error or a "Failed to stop target" message
    """
    # HYD-7230: brute force retry.
    # NOTE(review): the ticket wording is "try up to 3 times", but the loop
    # has always permitted four attempts before giving up; that is preserved.
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        # Issue the command to Pacemaker to stop the target.
        # Group disable will disable all members of group regardless of
        # current status, so the group is targeted whenever the resource
        # named by _zfs_name(ha_label) exists.
        if _resource_exists(_zfs_name(ha_label)):
            resource = _group_name(ha_label)
        else:
            resource = ha_label

        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', resource])
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("Failed to stop target {}".format(ha_label))

        console_log.info("failed to stop target %s", ha_label)
def start_lnet():
    """
    Place lnet into the 'up' state.

    Runs ``lctl net up`` and, only when that reports no error, ``modprobe lustre``.

    :return: result of ``agent_ok_or_error`` applied to the first error seen
        (or to the last command's result when nothing failed)
    """
    console_log.info("Starting LNet")

    error = AgentShell.run_canned_error_message(["lctl", "net", "up"])

    if not error:
        # modprobe lustre is a hack for HYD-1263 - Fix or work around LU-1279 -
        # failure trying to mount; should be removed when LU-1279 is fixed
        error = AgentShell.run_canned_error_message(["modprobe", "lustre"])

    return agent_ok_or_error(error)
Пример #3
0
def stop_lnet():
    """
    Place lnet into the 'down' state.

    Modules that depend on lnet being 'up' are unloaded first
    (``lustre_rmmod ptlrpc``); only when that reports no error is lnet
    unconfigured with ``lnetctl lnet unconfigure``.

    :return: result of ``agent_ok_or_error`` applied to the first error seen
        (or to the last command's result when nothing failed)
    """
    console_log.info("Stopping LNet")

    error = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])

    if not error:
        error = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])

    return agent_ok_or_error(error)
Пример #4
0
def start_target(ha_label):
    """
    Start the high availability target.

    Sets the resource's ``target-role`` to ``Started`` with ``crm_resource``,
    waits, then inspects ``crm_mon -1`` output to decide whether the start
    really succeeded. On failure the role is set back to ``Stopped`` and the
    sequence is retried.

    :param ha_label: resource label of the target to start
    :return: Value using simple return protocol: ``agent_result(location)`` on
        success, otherwise an ``agent_error``
    """
    # HYD-1989: brute force retry.
    # NOTE(review): the ticket wording is "try up to 3 times", but the loop
    # has always permitted four attempts before giving up; that is preserved.
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message(
            ['crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Started'])
        if error:
            return agent_error(error)

        # now wait for it to start; the outcome of the wait is not used here,
        # the crm_mon output below is what decides success
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed)
        _, stdout, _ = AgentShell.run_old(['crm_mon', '-1'])

        # Success needs at least one crm_mon line that begins with the label
        # and does not mention FAILED; no matching line at all is a failure.
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and "FAILED" not in line
            for line in stdout.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" % ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message(
            ['crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped'])
        if error:
            return agent_error(error)

        if attempt == max_attempts:
            return agent_error("Failed to start target %s" % ha_label)

        console_log.info("failed to start target %s" % ha_label)
Пример #5
0
def configure_corosync2_stage_2(ring0_name, ring1_name, new_node_fqdn, mcast_port, pcs_password, create_cluster):
    """Process configuration including peers and negotiated multicast port, no IP address
    information required

    Note: "The pcs cluster setup command will automatically configure two_node: 1 in
    corosync.conf, so a two-node cluster will "just work". If you are using a different cluster
    shell, you will have to configure corosync.conf appropriately yourself." Therefore
    no-quorum-policy does not have to be set when setting up cluster with pcs.

    :param ring0_name: interface name handed to CorosyncRingInterface for ring 0
    :param ring1_name: interface name handed to CorosyncRingInterface for ring 1
    :param new_node_fqdn: node passed to ``pcs cluster auth`` and to the setup / node-add command
    :param mcast_port: multicast port handed to both ring interfaces
    :param pcs_password: password given to ``pcs cluster auth`` for PCS_USER
        (NOTE(review): it is passed in clear text on the command line)
    :param create_cluster: when truthy run ``pcs cluster setup``, otherwise ``pcs cluster node add``
    :return: result of ``agent_ok_or_error`` applied to the first error seen
    """

    interfaces = [
        InterfaceInfo(CorosyncRingInterface(name=ring_name, ringnumber=ring_number, mcastport=mcast_port),
                      None, None)
        for ring_number, ring_name in enumerate((ring0_name, ring1_name))
    ]

    # Built unconditionally, exactly as before, even though only the
    # create_cluster branch consumes the values.
    config_params = {
        'token': '17000',
        'fail_recv_const': '10',
        'transport': 'udp',
        'rrpmode': 'passive',
        'addr0': interfaces[0].corosync_iface.bindnetaddr,
        'addr1': interfaces[1].corosync_iface.bindnetaddr,
        'mcast0': interfaces[0].corosync_iface.mcastaddr,
        'mcast1': interfaces[1].corosync_iface.mcastaddr,
        'mcastport0': interfaces[0].corosync_iface.mcastport,
        'mcastport1': interfaces[1].corosync_iface.mcastport
    }

    # authenticate nodes in cluster
    auth_command = ['pcs', 'cluster', 'auth', new_node_fqdn, '-u', PCS_USER, '-p', pcs_password]

    # build command string for setup of cluster which will result in corosync.conf rather than
    # writing from template, note we don't start the cluster here as services are managed
    # independently
    if create_cluster:
        cluster_setup_command = ['pcs', 'cluster', 'setup', '--name', PCS_CLUSTER_NAME, '--force', new_node_fqdn]

        # order in which the options are appended to the command line
        option_order = ('transport', 'rrpmode',
                        'addr0', 'mcast0', 'mcastport0',
                        'addr1', 'mcast1', 'mcastport1',
                        'token', 'fail_recv_const')
        for option in option_order:
            cluster_setup_command += ["--" + option, str(config_params[option])]
    else:
        cluster_setup_command = ['pcs', 'cluster', 'node', 'add', new_node_fqdn]

    # the setup / node-add command only runs when authentication reported no error
    error = AgentShell.run_canned_error_message(auth_command)
    if not error:
        error = AgentShell.run_canned_error_message(cluster_setup_command)

    return agent_ok_or_error(error)
Пример #6
0
def _remove_module(name, modules):
    try:
        m = modules[name]
    except KeyError:
        # It's not loaded, do nothing.
        return None

    console_log.info("Removing %d dependents of %s : %s" %
                     (len(m.dependents), name, m.dependents))
    while m.dependents:
        error = _remove_module(m.dependents.pop(), modules)

        if error:
            return error

    console_log.info("Removing %s" % name)

    error = AgentShell.run_canned_error_message(['rmmod', name])

    if error:
        return error

    modules.pop(name)
    for m in modules.values():
        if name in m.dependents:
            m.dependents.remove(name)

    return None
Пример #7
0
def stop_target(ha_label):
    """
    Stop the high availability target.

    Sets the resource's ``target-role`` to ``Stopped`` with ``crm_resource``
    and waits for it to go down, repeating the sequence while the wait keeps
    reporting that the target has not stopped.

    :param ha_label: resource label of the target to stop
    :return: Value using simple return protocol: ``agent_result_ok`` once the
        wait succeeds, otherwise an ``agent_error``
    """
    # HYD-7230: brute force retry.
    # NOTE(review): the ticket wording is "try up to 3 times", but the loop
    # has always permitted four attempts before giving up; that is preserved.
    max_attempts = 4

    # Command asking Pacemaker to stop the target; identical on every attempt.
    stop_command = ['crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped']

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message(stop_command)
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("failed to stop target %s" % ha_label)

        console_log.info("failed to stop target %s" % ha_label)
Пример #8
0
def start_lnet():
    """
    Place lnet into the 'up' state using ``lnetctl lnet configure --all``.

    :return: the value returned by ``AgentShell.run_canned_error_message``
        NOTE(review): unlike the sibling lnet helpers this is not wrapped in
        ``agent_ok_or_error`` - confirm callers expect the raw value.
    """
    console_log.info("Starting LNet")

    configure_command = ["lnetctl", "lnet", "configure", "--all"]
    return AgentShell.run_canned_error_message(configure_command)
def stop_lnet():
    """
    Place lnet into the 'down' state.

    Modules that depend on lnet are unloaded first through ``_rmmod_deps``
    (``ksocklnd`` and ``ko2iblnd`` are passed as exceptions); only when that
    reports no error is ``lctl net down`` run.

    :return: result of ``agent_ok_or_error`` applied to the first error seen
        (or to the last command's result when nothing failed)
    """
    console_log.info("Stopping LNet")

    error = _rmmod_deps("lnet", excpt=["ksocklnd", "ko2iblnd"])

    if not error:
        error = AgentShell.run_canned_error_message(["lctl", "net", "down"])

    return agent_ok_or_error(error)
Пример #10
0
def unload_lnet():
    """
    Unload the lnet modules from memory, including any modules that are
    dependent on the lnet module, by running ``lustre_rmmod``.

    Lnet must be stopped before unload_lnet is called.

    :return: result of ``agent_ok_or_error`` applied to the command's outcome
    """
    error = AgentShell.run_canned_error_message(["lustre_rmmod"])
    return agent_ok_or_error(error)
Пример #11
0
    def set_rsc_default(name, value):
        """
        Set a resource default through ``crm_attribute --type rsc_defaults``.

        :param name: attribute to set
        :param value: value to set
        :return: whatever ``AgentShell.run_canned_error_message`` returns -
            presumably None on success and a canned error message on failure
            (TODO confirm against AgentShell)
        """
        command = ["crm_attribute", "--type", "rsc_defaults"]
        command += ["--attr-name", name]
        command += ["--attr-value", value]

        return AgentShell.run_canned_error_message(command)
Пример #12
0
def configure_corosync2_stage_1(mcast_port, pcs_password, fqdn=None):
    """
    First stage of corosync2 configuration on this node.

    Optionally sets the hostname, sets the password of PCS_USER, opens the
    firewall for corosync (UDP ``mcast_port``) and pcs (TCP PCS_TCP_PORT),
    starts pcsd and enables the corosync and pcsd services. Each step only
    runs when every earlier step reported no error.

    :param mcast_port: corosync multicast port to open in the firewall
    :param pcs_password: password to set for PCS_USER
    :param fqdn: when not None, hostname to set with ``hostnamectl set-hostname``
    :return: Value using simple return protocol (``agent_error`` if setting the
        hostname fails, otherwise ``agent_ok_or_error`` of the first error seen)
    """
    # need to use user "hacluster" which is created on install of "pcs" package,
    # WARNING: clear text password
    #
    # The password and user name are handed to bash as positional parameters
    # ($1 / $2) instead of being spliced into the script text, so whitespace
    # or shell metacharacters in the password can neither be re-interpreted by
    # the shell nor inject commands. printf is used rather than echo so a
    # password that looks like an echo option (e.g. "-n") is passed verbatim.
    # "%s" % (...) keeps the original string coercion of both values.
    set_password_command = [
        "bash",
        "-c",
        'printf "%s\\n" "$1" | passwd --stdin "$2"',
        "bash",  # becomes $0 inside the script
        "%s" % (pcs_password,),
        "%s" % (PCS_USER,),
    ]

    if fqdn is not None:
        error = AgentShell.run_canned_error_message(
            ["hostnamectl", "set-hostname", fqdn])
        if error:
            return agent_error(error)

    # `or` short-circuits: the chain stops at the first step that returns an error
    return agent_ok_or_error(
        AgentShell.run_canned_error_message(set_password_command)
        or firewall_control.add_rule(mcast_port, "udp", "corosync", persist=True)
        or firewall_control.add_rule(PCS_TCP_PORT, "tcp", "pcs", persist=True)
        or pcsd_service.start()
        or corosync_service.enable()
        or pcsd_service.enable())
Пример #13
0
def change_mcast_port(old_mcast_port, new_mcast_port):
    """
    Update corosync configuration with a new mcast_port on this managed server (not all the nodes
    in the cluster). Corosync will read the updated value in the configuration file, which it is
    polling for updates.

    The firewall rule for the old port is removed, a rule for the new port is added and finally
    every ``mcastport:`` entry in COROSYNC_CONF_PATH is rewritten with ``sed`` (a ``.bak`` copy is
    kept). Each step only runs when the previous ones reported no error.

    :param old_mcast_port: port whose corosync UDP firewall rule is removed
    :param new_mcast_port: port to open and to write into the configuration file
    :return: Value using simple return protocol
    """
    sed_expression = 's/mcastport:.*/mcastport: %s/g' % new_mcast_port
    file_edit_args = ['sed', '-i.bak', sed_expression, COROSYNC_CONF_PATH]

    error = firewall_control.remove_rule(old_mcast_port, "udp", "corosync", persist=True)

    if not error:
        error = firewall_control.add_rule(new_mcast_port, "udp", "corosync", persist=True)

    if not error:
        error = AgentShell.run_canned_error_message(file_edit_args)

    return agent_ok_or_error(error)
Пример #14
0
def configure_corosync2_stage_1(mcast_port, pcs_password):
    """
    First stage of corosync2 configuration on this node.

    Sets the password of PCS_USER, opens the firewall for corosync (UDP
    ``mcast_port``) and pcs (TCP PCS_TCP_PORT), starts pcsd and enables the
    corosync and pcsd services. Each step only runs when every earlier step
    reported no error.

    :param mcast_port: corosync multicast port to open in the firewall
    :param pcs_password: password to set for PCS_USER
    :return: ``agent_ok_or_error`` of the first error seen
    """
    # need to use user "hacluster" which is created on install of "pcs" package,
    # WARNING: clear text password
    #
    # The password and user name are handed to bash as positional parameters
    # ($1 / $2) instead of being spliced into the script text, so whitespace
    # or shell metacharacters in the password can neither be re-interpreted by
    # the shell nor inject commands. printf is used rather than echo so a
    # password that looks like an echo option (e.g. "-n") is passed verbatim.
    # "%s" % (...) keeps the original string coercion of both values.
    set_password_command = ['bash', '-c',
                            'printf "%s\\n" "$1" | passwd --stdin "$2"',
                            'bash',  # becomes $0 inside the script
                            '%s' % (pcs_password,),
                            '%s' % (PCS_USER,)]

    # `or` short-circuits: the chain stops at the first step that returns an error
    return agent_ok_or_error(AgentShell.run_canned_error_message(set_password_command) or
                             firewall_control.add_rule(mcast_port, "udp", "corosync", persist=True) or
                             firewall_control.add_rule(PCS_TCP_PORT, "tcp", "pcs", persist=True) or
                             pcsd_service.start() or
                             corosync_service.enable() or
                             pcsd_service.enable())
Пример #15
0
def load_lnet():
    """
    Load the lnet module from disk into memory using the ``modprobe lnet`` command.

    :return: result of ``agent_ok_or_error`` applied to the command's outcome
    """
    error = AgentShell.run_canned_error_message(["modprobe", "lnet"])
    return agent_ok_or_error(error)
Пример #16
0
def start_target(ha_label):
    """
    Start the high availability target.

    If the resource is already running it is left alone, or moved to its
    primary node when it is running elsewhere. Otherwise the resource (plus
    the resources named by ``_zfs_name`` / ``_group_name`` when they exist) is
    enabled through ``pcs`` and the function waits for it to start, disabling
    it again and retrying when the wait fails.

    :param ha_label: resource label of the target to start
    :return: Value using simple return protocol: ``agent_result(location)`` on
        success, otherwise an ``agent_error``
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    # if resource already started but not on primary, move it
    location = get_resource_location(ha_label)
    primary = _find_resource_constraint(ha_label, True)
    if location:
        if location == primary:
            return agent_result(location)

        console_log.info(
            "Resource %s already started, moving to primary node %s",
            ha_label, primary)
        error = _move_target(ha_label, primary)
        if error:
            return agent_error(error)
        return agent_result(primary)

    # HYD-1989: brute force retry.
    # NOTE(review): the ticket wording is "try up to 3 times", but the loop
    # has always permitted four attempts before giving up; that is preserved.
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'enable', ha_label])
        if error:
            return agent_error(error)

        if _resource_exists(_zfs_name(ha_label)):
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'enable', _zfs_name(ha_label)])
            if error:
                return agent_error(error)

        if _resource_exists(_group_name(ha_label)):
            # enable group also, in case group was disabled
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'enable', _group_name(ha_label)])
            if error:
                return agent_error(error)

        # now wait for it to start
        if _wait_target(ha_label, True):
            location = get_resource_location(ha_label)
            if location:
                return agent_result(location)
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', ha_label])
        if error:
            return agent_error(error)

        if attempt == max_attempts:
            return agent_error(
                "Failed to start target {}".format(ha_label))

        console_log.info("failed to start target %s", ha_label)