def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-7230: brute force, try up to 3 times to stop the target
    for attempt in range(1, 5):
        # Decide what to ask Pacemaker to disable: for a ZFS-backed target
        # disable the whole group, which disables every member of the group
        # regardless of current status; otherwise disable the bare resource.
        if _resource_exists(_zfs_name(ha_label)):
            resource_to_disable = _group_name(ha_label)
        else:
            resource_to_disable = ha_label

        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', resource_to_disable])
        if error:
            return agent_error(error)

        # wait for the target to actually reach the stopped state
        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt < 4:
            console_log.info("failed to stop target %s", ha_label)
        else:
            return agent_error("Failed to stop target {}".format(ha_label))
def start_lnet():
    '''
    Place lnet into the 'up' state.
    '''
    console_log.info("Starting LNet")

    # 'modprobe lustre' is a hack for HYD-1263 - Fix or work around LU-1279 -
    # failure trying to mount; should be removed when LU-1279 is fixed
    error = AgentShell.run_canned_error_message(["lctl", "net", "up"])
    if not error:
        error = AgentShell.run_canned_error_message(["modprobe", "lustre"])
    return agent_ok_or_error(error)
def stop_lnet():
    """
    Place lnet into the 'down' state, any modules that are dependent on lnet
    being in the 'up' state will be unloaded before lnet is stopped.
    """
    console_log.info("Stopping LNet")

    # first unload the dependents of lnet, then unconfigure lnet itself;
    # stop at the first step that reports an error
    error = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])
    if not error:
        error = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])
    return agent_ok_or_error(error)
def start_target(ha_label): ''' Start the high availability target Return: Value using simple return protocol ''' # HYD-1989: brute force, try up to 3 times to start the target i = 0 while True: i += 1 error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Started' ]) if error: return agent_error(error) # now wait for it to start _wait_target(ha_label, True) # and make sure it didn't start but (the RA) fail(ed) rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1']) failed = True for line in stdout.split("\n"): if line.lstrip().startswith(ha_label): if line.find("FAILED") < 0: failed = False if failed: # try to leave things in a sane state for a failed mount error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped' ]) if error: return agent_error(error) if i < 4: console_log.info("failed to start target %s" % ha_label) else: return agent_error("Failed to start target %s" % ha_label) else: location = get_resource_location(ha_label) if not location: return agent_error("Started %s but now can't locate it!" % ha_label) return agent_result(location)
def configure_corosync2_stage_2(ring0_name, ring1_name, new_node_fqdn,
                                mcast_port, pcs_password, create_cluster):
    """Process configuration including peers and negotiated multicast port,
    no IP address information required

    Note: "The pcs cluster setup command will automatically configure
    two_node: 1 in corosync.conf, so a two-node cluster will "just work". If
    you are using a different cluster shell, you will have to configure
    corosync.conf appropriately yourself." Therefore no-quorum-policy does
    not have to be set when setting up cluster with pcs.

    :param ring0_name: name of the network interface used for ring 0
    :param ring1_name: name of the network interface used for ring 1
    :param new_node_fqdn: FQDN of the node being added / used for setup
    :param mcast_port: negotiated multicast port used for both rings
    :param pcs_password: password for PCS_USER used to authenticate the node
    :param create_cluster: True to create a new cluster, False to add
                           new_node_fqdn to an existing one
    :return: Value using simple return protocol
    """
    # build both ring interfaces; the derived bindnetaddr/mcastaddr values
    # are read back out of them below
    interfaces = [InterfaceInfo(CorosyncRingInterface(name=ring0_name,
                                                      ringnumber=0,
                                                      mcastport=mcast_port),
                                None, None),
                  InterfaceInfo(CorosyncRingInterface(name=ring1_name,
                                                      ringnumber=1,
                                                      mcastport=mcast_port),
                                None, None)]

    config_params = {
        'token': '17000',
        'fail_recv_const': '10',
        'transport': 'udp',
        'rrpmode': 'passive',
        'addr0': interfaces[0].corosync_iface.bindnetaddr,
        'addr1': interfaces[1].corosync_iface.bindnetaddr,
        'mcast0': interfaces[0].corosync_iface.mcastaddr,
        'mcast1': interfaces[1].corosync_iface.mcastaddr,
        'mcastport0': interfaces[0].corosync_iface.mcastport,
        'mcastport1': interfaces[1].corosync_iface.mcastport}

    # authenticate nodes in cluster
    authenticate_nodes_in_cluster_command = ['pcs', 'cluster', 'auth',
                                             new_node_fqdn, '-u', PCS_USER,
                                             '-p', pcs_password]

    # build command string for setup of cluster which will result in
    # corosync.conf rather than writing from template, note we don't start
    # the cluster here as services are managed independently
    if create_cluster:
        cluster_setup_command = ['pcs', 'cluster', 'setup', '--name',
                                 PCS_CLUSTER_NAME, '--force'] + [new_node_fqdn]
        for param in ['transport', 'rrpmode', 'addr0', 'mcast0', 'mcastport0',
                      'addr1', 'mcast1', 'mcastport1', 'token',
                      'fail_recv_const']:
            # pull this value from the dictionary using parameter keyword
            cluster_setup_command.extend(["--" + param,
                                          str(config_params[param])])
    else:
        cluster_setup_command = ['pcs', 'cluster', 'node', 'add',
                                 new_node_fqdn]

    return agent_ok_or_error(
        AgentShell.run_canned_error_message(
            authenticate_nodes_in_cluster_command) or
        AgentShell.run_canned_error_message(cluster_setup_command))
def _remove_module(name, modules):
    """
    Recursively unload module *name*, first unloading everything that
    depends on it.

    :param name: kernel module name to remove
    :param modules: mapping of module name -> module record (each record has
                    a mutable ``dependents`` collection); updated in place
    :return: None on success, else the canned error message from rmmod
    """
    try:
        module = modules[name]
    except KeyError:
        # It's not loaded, do nothing.
        return None

    console_log.info("Removing %d dependents of %s : %s" %
                     (len(module.dependents), name, module.dependents))

    # dependents must go first, depth-first; stop at the first failure
    while module.dependents:
        error = _remove_module(module.dependents.pop(), modules)
        if error:
            return error

    console_log.info("Removing %s" % name)
    error = AgentShell.run_canned_error_message(['rmmod', name])
    if error:
        return error

    # bookkeeping: forget this module and scrub it from every remaining
    # module's dependents list
    modules.pop(name)
    for other in modules.values():
        if name in other.dependents:
            other.dependents.remove(name)

    return None
def stop_target(ha_label): ''' Start the high availability target Return: Value using simple return protocol ''' # HYD-7230: brute force, try up to 3 times to stop the target i = 0 while True: i += 1 # Issue the command to Pacemaker to stop the target error = AgentShell.run_canned_error_message([ 'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v', 'Stopped' ]) if error: return agent_error(error) if _wait_target(ha_label, False): return agent_result_ok if i < 4: console_log.info("failed to stop target %s" % ha_label) else: return agent_error("failed to stop target %s" % ha_label)
def start_lnet():
    """
    Place lnet into the 'up' state.

    Return: Value using simple return protocol
    """
    console_log.info("Starting LNet")

    # Wrap the canned-error result in agent_ok_or_error so the caller gets
    # the simple return protocol, consistent with the sibling functions
    # (stop_lnet, load_lnet, unload_lnet).  Previously the raw error string
    # (or None) from run_canned_error_message leaked out directly.
    return agent_ok_or_error(
        AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "configure", "--all"]))
def stop_lnet():
    '''
    Place lnet into the 'down' state, any modules that are dependent on lnet
    being in the 'up' state will be unloaded before lnet is stopped.
    '''
    console_log.info("Stopping LNet")

    # unload everything depending on lnet (except the LND drivers listed),
    # then take the network down; stop at the first error
    error = _rmmod_deps("lnet", excpt=["ksocklnd", "ko2iblnd"])
    if not error:
        error = AgentShell.run_canned_error_message(["lctl", "net", "down"])
    return agent_ok_or_error(error)
def unload_lnet():
    """
    Unload the lnet modules from memory, including any modules that depend
    on the lnet module. Lnet must be stopped before unload_lnet is called.
    """
    error = AgentShell.run_canned_error_message(["lustre_rmmod"])
    return agent_ok_or_error(error)
def set_rsc_default(name, value):
    '''
    Set a Pacemaker resource default via crm_attribute.

    :param name: attribute to set
    :param value: value to set
    :return: None on success, else a canned error message
    '''
    command = [
        "crm_attribute",
        "--type", "rsc_defaults",
        "--attr-name", name,
        "--attr-value", value,
    ]
    return AgentShell.run_canned_error_message(command)
def configure_corosync2_stage_1(mcast_port, pcs_password, fqdn=None):
    """
    First stage of corosync2 configuration on this node: set the pcs user's
    password, open the corosync/pcs firewall ports and enable the services.

    :param mcast_port: corosync multicast port to open (udp)
    :param pcs_password: password to set for PCS_USER
    :param fqdn: if not None, set this node's hostname first
    :return: Value using simple return protocol
    """
    # need to use user "hacluster" which is created on install of "pcs" package,
    # WARNING: clear text password
    set_password_command = [
        "bash",
        "-c",
        "echo %s | passwd --stdin %s" % (pcs_password, PCS_USER),
    ]

    if fqdn is not None:
        error = AgentShell.run_canned_error_message(
            ["hostnamectl", "set-hostname", fqdn])
        if error:
            return agent_error(error)

    # run each setup step in order, stopping at the first that errors
    steps = [
        lambda: AgentShell.run_canned_error_message(set_password_command),
        lambda: firewall_control.add_rule(mcast_port, "udp", "corosync",
                                          persist=True),
        lambda: firewall_control.add_rule(PCS_TCP_PORT, "tcp", "pcs",
                                          persist=True),
        pcsd_service.start,
        corosync_service.enable,
        pcsd_service.enable,
    ]
    error = None
    for step in steps:
        error = step()
        if error:
            break
    return agent_ok_or_error(error)
def change_mcast_port(old_mcast_port, new_mcast_port):
    """
    Update corosync configuration with a new mcast_port on this managed
    server (not all the nodes in the cluster)

    Corosync will read the updated value in the configuration file, which it
    is polling for updates.

    Return: Value using simple return protocol
    """
    # rewrite every "mcastport: ..." line in the config in place,
    # keeping a .bak backup of the original file
    sed_command = ['sed', '-i.bak',
                   's/mcastport:.*/mcastport: %s/g' % new_mcast_port,
                   COROSYNC_CONF_PATH]

    # swap the firewall rule over to the new port, then edit the file;
    # stop at the first step that errors
    error = firewall_control.remove_rule(old_mcast_port, "udp", "corosync",
                                         persist=True)
    if not error:
        error = firewall_control.add_rule(new_mcast_port, "udp", "corosync",
                                          persist=True)
    if not error:
        error = AgentShell.run_canned_error_message(sed_command)
    return agent_ok_or_error(error)
def configure_corosync2_stage_1(mcast_port, pcs_password):
    """
    First stage of corosync2 configuration: set the pcs user's password,
    open the corosync/pcs firewall ports and enable the services.

    :param mcast_port: corosync multicast port to open (udp)
    :param pcs_password: password to set for PCS_USER
    :return: Value using simple return protocol
    """
    # need to use user "hacluster" which is created on install of "pcs" package,
    # WARNING: clear text password
    set_password_command = ['bash', '-c', 'echo %s | passwd --stdin %s' %
                            (pcs_password, PCS_USER)]

    # perform the steps in order, stopping at the first one that errors
    error = AgentShell.run_canned_error_message(set_password_command)
    if not error:
        error = firewall_control.add_rule(mcast_port, "udp", "corosync",
                                          persist=True)
    if not error:
        error = firewall_control.add_rule(PCS_TCP_PORT, "tcp", "pcs",
                                          persist=True)
    if not error:
        error = pcsd_service.start()
    if not error:
        error = corosync_service.enable()
    if not error:
        error = pcsd_service.enable()
    return agent_ok_or_error(error)
def load_lnet():
    '''
    Load the lnet modules from disk into memory, including any modules it
    requires, using the modprobe command.
    '''
    error = AgentShell.run_canned_error_message(["modprobe", "lnet"])
    return agent_ok_or_error(error)
def start_target(ha_label):
    '''
    Start the high availability target

    :param ha_label: label of the Pacemaker resource to start
    Return: Value using simple return protocol
    '''
    # can't start what isn't configured in Pacemaker
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    # if resource already started but not on primary, move it
    location = get_resource_location(ha_label)
    primary = _find_resource_constraint(ha_label, True)
    if location:
        if location != primary:
            console_log.info(
                "Resource %s already started, moving to primary node %s",
                ha_label, primary)
            error = _move_target(ha_label, primary)
            if error:
                return agent_error(error)
            location = primary

        return agent_result(location)

    # HYD-1989: brute force, try up to 3 times to start the target
    i = 0
    while True:
        i += 1

        # enable the resource itself
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'enable', ha_label])

        if error:
            return agent_error(error)

        # for a ZFS-backed target, enable the zpool resource as well
        if _resource_exists(_zfs_name(ha_label)):
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'enable', _zfs_name(ha_label)])
            if error:
                return agent_error(error)

        if _resource_exists(_group_name(ha_label)):
            # enable group also, in case group was disabled
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'enable', _group_name(ha_label)])
            if error:
                return agent_error(error)

        # now wait for it to start
        if _wait_target(ha_label, True):
            # started: report which node it is running on
            location = get_resource_location(ha_label)
            if not location:
                return agent_error(
                    "Started {} but now can't locate it!".format(ha_label))
            return agent_result(location)

        else:
            # try to leave things in a sane state for a failed mount
            error = AgentShell.run_canned_error_message(
                ['pcs', 'resource', 'disable', ha_label])

            if error:
                return agent_error(error)

            # retry until the attempt counter runs out, then give up
            if i < 4:
                console_log.info("failed to start target %s", ha_label)
            else:
                return agent_error(
                    "Failed to start target {}".format(ha_label))